nspider一个轻量级的node爬虫框架

网友投稿 882 2022-10-25

nspider一个轻量级的node爬虫框架

nspider一个轻量级的node爬虫框架

nspider

A lightweight crawling/scraping package for Node.

Features:

- Server-side DOM & automatic jQuery insertion with Cheerio
- Control rate limit
- Priority queue of requests
- Compatible with 4.x or newer version

Get started

How to install

$ npm install nspider22

Usage

Basic use

//basic use
var nspider=require('nspider22')
var nsp=new nspider({name:'baidu'});
nsp.onHtml('a',function(ele){
    console.log(ele.$.attr('href'));
})
nsp.visit("http://baidu.com");

Set headers

//set headers
var nspider=require('nspider22')
var nsp=new nspider({name:'zhihu'});
nsp.setHeaders({
    "Cache-Control":"private,no-store,max-age=0,no-cache,must-revalidate,post-check=0,pre-check=0",
    "Connection":"keep-alive",
    "Content-Encoding":"gzip",
    "Content-Security-Policy":"default-src * blob:;img-src * data: blob:;frame-src 'self' *.zhihu.com *.zhihu.dev getpocket.com note.youdao.com safari-extension://com.evernote.safari.clipper-Q79WDW8YH9 weixin: zhihujs: v.qq.com v.youku.com bilibili.com *.vzuu.com;script-src 'self' *.zhihu.com *.google-analytics.com zhstatic.zhihu.com res.wx.qq.com 'unsafe-eval' unpkg.zhimg.com unicom.zhimg.com blob:;style-src 'self' *.zhihu.com *.zhihu.dev unicom.zhimg.com 'unsafe-inline';connect-src * wss:",
    "Content-Type":"text/html; charset=utf-8",
    "Date":"Thu, 19 Oct 2017 02:37:30 GMT",
    "Expires":"Fri, 02 Jan 2000 00:00:00 GMT",
    "Pragma":"no-cache",
    "Server":"ZWS",
    "Set-Cookie":"",
    "Transfer-Encoding":"chunked",
    "Vary":"Accept-Encoding",
    "X-Backend-Server":"heifetz.heifetz.fba20226---10.3.183.2:31036[10.3.183.2:31036]",
    "X-Frame-Options":"DENY",
    "X-Req-ID":"3570E3F59E80FE9",
    "X-Req-SSL":"proto=TLSv1.2,sni=,cipher=ECDHE-RSA-AES256-GCM-SHA384"
})
nsp.onHtml('.tab-panel a.question_link',function(ele){
    if(ele.tag=='zhihu'){
        console.log(ele.$.attr('href'));
        nsp.visit('https://zhihu.com'+ele.$.attr('href'),'item')
    }
})
nsp.onHtml("body",function(ele){
    if(ele.tag=='item'){
        console.log(ele.$.text())
    }
})
nsp.visit("https://zhihu.com/explore","zhihu");

Set limiter

var nspider=require('nspider22')
var nsp=new nspider({name:'baidu'});
var limitRule = new nspider.LimitRule({
    maxConnections:1,
    delayTime:1000
})
nsp.setLimiter(limitRule)
nsp.onHtml('a',function(ele){
    console.log(ele.$.attr('href'));
})
nsp.visit("http://baidu.com");

Rough todolist

- Introducing zombie to deal with pages with complex ajax
- Refactoring the code to be more maintainable
- Commander support
- Middleware support

版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。

上一篇:SpringBoot实现多数据源的切换实践
下一篇:Spring事务的失效场景你知道多少
相关文章

 发表评论

暂时没有评论,来抢沙发吧~