微信开发中 ACCESS TOKEN 过期失效的解决方案详解
565
2022-10-18
Scrapy 框架学习
案例 jd图书爬虫
jd图书网站爬取比较容易,主要是数据的提取
spider 代码:
import scrapy from jdbook.pipelines import JdbookPipeline import re from copy import deepcopy class JdbookspiderSpider(scrapy.Spider): name = 'jdbookspider' allowed_domains = ['jd.com'] start_urls = ['https://book.jd.com/booksort.html'] # 处理分类页面的数据 def parse(self, response): # 这里借助了selenium 先访问jd图书网,因为直接get请求jdbook 获取到只是一堆js代码,没有有用的html元素,通过selenium正常访问网页,将page_source(就是当前网页的页面内容,selenium提供的属性)返回给spider进行数据处理 # 处理大分类的列表页 response_data, driver = JdbookPipeline.gain_response_data(url='https://book.jd.com/booksort.html') driver.close() item = {} # 由于selenium返回的page_source是字符串,所以不能直接使用xpath,使用了正则(也可以借助bs4 再使用正则) middle_group_link = re.findall('.*?.*?.*?', response_data, re.S) big_group_name = re.findall('
pipeline 代码:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
import csv
import time

from itemadapter import ItemAdapter
from selenium import webdriver


class JdbookPipeline:
    """Persist scraped JD book items to ./jdbook.csv.

    ``open_spider`` creates/truncates the file and writes the header row
    once; ``process_item`` appends one row per scraped item.
    """

    # Single source of truth for the CSV column order, shared by the
    # header (open_spider) and every data row (process_item).
    FIELDNAMES = [
        'big_group_name', 'big_group_link',
        'middle_group_name', 'middle_group_link',
        'detail_name', 'detail_content', 'detail_link', 'detail_price',
    ]

    def process_item(self, item, spider):
        """Append *item* as one CSV row and pass it on unchanged."""
        # newline='' stops the csv module from emitting blank lines
        # between rows on Windows (and matches DangdangbookPipeline).
        with open('./jdbook.csv', 'a+', encoding='utf-8', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=self.FIELDNAMES)
            writer.writerow(item)
        return item

    def open_spider(self, spider):
        """Create/truncate the CSV file and write the header row."""
        with open('./jdbook.csv', 'w+', encoding='utf-8', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=self.FIELDNAMES)
            writer.writeheader()

    @staticmethod
    def gain_response_data(
        url,
        driver_path=r"E:\python_study\spider\data\chromedriver_win32\chromedriver.exe",
    ):
        """Fetch *url* with a real Chrome browser and return its HTML.

        JD serves the book pages as JS-rendered content, so a plain GET
        yields only bootstrap script; driving Chrome through selenium
        produces the fully rendered DOM via ``page_source``.

        Args:
            url: page to load.
            driver_path: chromedriver executable (defaults to the
                original hard-coded location, now overridable).

        Returns:
            ``(page_source, driver)`` — the caller owns the driver and
            is responsible for closing it.
        """
        driver = webdriver.Chrome(driver_path)
        driver.implicitly_wait(2)
        driver.get(url)
        driver.implicitly_wait(2)
        # Extra fixed wait for asynchronously rendered content.
        time.sleep(2)
        return driver.page_source, driver
案例 当当图书爬虫
当当网的爬取也比较容易,但这里需要结合 scrapy-redis 来实现分布式的数据爬取
import urllib from copy import deepcopy import scrapy from scrapy_redis.spiders import RedisSpider import re # 不再是继承Spider类,而是继承自scrapy_redis的RedisSpider类 class DangdangspiderSpider(RedisSpider): name = 'dangdangspider' allowed_domains = ['dangdang.com'] # http://book.dangdang.com/ # 同时,start_urls 也不在使用, 而是定义一个redis_key, spider要爬取的request对象就以该值为key, url为值存储在redis中,spider爬取时就从redis 中获取 redis_key = "dangdang" # 处理图书分类数据 def parse(self, response): div_list = response.xpath("//div[@class='con flq_body']/div") for div in div_list: item = {} item["b_cate"] = div.xpath("./dl/dt//text()").extract() item["b_cate"] = [i.strip() for i in item["b_cate"] if len(i.strip()) > 0] # 中间分类分组 if len(item["b_cate"]) > 0: div_data = str(div.extract()) dl_list = re.findall('''
pipeline 代码:
import csv

from itemadapter import ItemAdapter


class DangdangbookPipeline:
    """Persist scraped Dangdang book items to ./dangdangbook.csv."""

    # CSV column order, shared by the header and every data row.
    FIELDNAMES = [
        'b_cate', 'm_cate', 's_cate', 's_href',
        'book_img', 'book_name', 'book_desc', 'book_price',
        'book_author', 'book_publish_date', 'book_press',
    ]

    def open_spider(self, spider):
        """Open the CSV once for the whole crawl and write the header."""
        # newline='' stops the csv module from doubling line endings on
        # Windows; keeping one handle avoids reopening the file per item.
        self.file = open('./dangdangbook.csv', 'w+', encoding='utf-8', newline='')
        self.writer = csv.DictWriter(self.file, fieldnames=self.FIELDNAMES)
        self.writer.writeheader()

    def close_spider(self, spider):
        """Release the file handle when the crawl finishes."""
        self.file.close()

    def process_item(self, item, spider):
        """Append *item* as one CSV row and pass it on unchanged."""
        self.writer.writerow(item)
        # Flush per item so a crash mid-crawl loses at most one row,
        # matching the durability of the original reopen-per-item code.
        self.file.flush()
        return item
settings 代码:
# Scrapy project identity and spider-module discovery.
BOT_NAME = 'dangdangbook'
SPIDER_MODULES = ['dangdangbook.spiders']
NEWSPIDER_MODULE = 'dangdangbook.spiders'

# Use scrapy-redis' request-fingerprint dedup filter so seen-request
# state lives in Redis and is shared across all distributed workers.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the Redis-backed scheduler so the request queue is shared too.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the Redis queue and fingerprints after the spider closes, so an
# interrupted crawl can be resumed instead of restarting from scratch.
SCHEDULER_PERSIST = True

# LOG_LEVEL = 'WARNING'

# Address of the Redis server backing the shared scheduler/dupefilter.
REDIS_URL = 'redis://127.0.0.1:6379'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# CSV-writing pipeline registered at priority 300.
ITEM_PIPELINES = {
    'dangdangbook.pipelines.DangdangbookPipeline': 300,
}
crontab 定时执行
以上都是在 Linux 平台上直接操作 crontab 的方式。
在 Python 环境下,我们可以借助 python-crontab 库来操作 crontab,从而设置定时任务。
补充
自定义的 Excel 导出文件格式代码:
from scrapy.exporters import BaseItemExporter
import xlwt


class ExcelItemExporter(BaseItemExporter):
    """Exporter that appends each scraped item as one row of an Excel
    (.xls) workbook, one column per serialized field."""

    def __init__(self, file, **kwargs):
        # Let BaseItemExporter pull its exporter options out of kwargs.
        self._configure(kwargs)
        self.file = file
        # One in-memory workbook with a single sheet named 'scrapy';
        # rows are appended top-down as items arrive.
        self.wbook = xlwt.Workbook()
        self.wsheet = self.wbook.add_sheet('scrapy')
        self.row = 0

    def finish_exporting(self):
        """Persist the in-memory workbook to the target file."""
        self.wbook.save(self.file)

    def export_item(self, item):
        """Write one item into the next free row of the sheet."""
        col = 0
        # _get_serialized_fields yields (field_name, value) pairs; only
        # the values are written, in field order, one per column.
        for _name, value in self._get_serialized_fields(item):
            self.wsheet.write(self.row, col, value)
            col += 1
        self.row += 1
版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。
发表评论
暂时没有评论,来抢沙发吧~