
Python crawler, scrapy framework (4): CrawlSpider, the LinkExtractor link extractor, and the Rule rule parser

Views: 76   Published: 2023-12-13 18:44:13

The scrapy framework

CrawlSpider

CrawlSpider: a subclass of Spider. It inherits the parent class's functionality and adds its own on top.

Two ways to crawl a whole site

  1. Based on Spider: send every follow-up request manually
  2. Based on CrawlSpider: links are extracted and followed automatically (via LinkExtractor and Rule)

Basic usage

Create a project

scrapy startproject quanzhanPro

Change into the project directory

cd quanzhanPro

Create a spider file based on the CrawlSpider class

scrapy genspider -t crawl quanzhan wz.sun0769.com/political/index/politicsNewest

The generated spider file

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class QuanzhanSpider(CrawlSpider):
    name = 'quanzhan'
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        # item['name'] = response.xpath('//div[@id="name"]').get()
        # item['description'] = response.xpath('//div[@id="description"]').get()
        return item

The link extractor: LinkExtractor

Extracts every link on a page that matches the specified rule (allow='regex'); duplicate links are removed automatically.

link = LinkExtractor(allow=r'id=1&page=\d+')
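The `allow` regex is matched against each candidate URL with an unanchored search, so a link is kept when the pattern occurs anywhere in it. A quick standalone check of the pattern above (the sample URLs are illustrative, modeled on the site's listing and detail pages):

```python
import re

# The allow pattern from the text; note \d (digit), not /d.
pattern = re.compile(r'id=1&page=\d+')

urls = [
    'http://wz.sun0769.com/political/index/politicsNewest?id=1&page=2',
    'http://wz.sun0769.com/political/politics/index?id=545344',
]

# LinkExtractor(allow=...) keeps a link if the regex matches anywhere in its URL.
matched = [u for u in urls if pattern.search(u)]
print(matched)  # only the paginated listing URL matches
```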

The rule parser: Rule

Applies the specified parsing callback (callback) to the links pulled in by the link extractor.
follow=True enables full-site crawling: the link extractor is applied again to each page it reaches, so matching links keep being extracted and followed.

Rule(LinkExtractor(allow=r'id=1&page=\d+'), callback='parse_item', follow=True)

quanzhan.py

import scrapy
from scrapy.linkextractors import LinkExtractor  # LinkExtractor: link extractor
from scrapy.spiders import CrawlSpider, Rule  # Rule: rule parser


class QuanzhanSpider(CrawlSpider):
    name = 'quanzhan'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest']

    # Instantiate a link extractor: pulls every link matching the given rule
    link = LinkExtractor(allow=r'id=1&page=\d+')

    rules = (
        # Instantiate a rule parser
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response)

Run the project

scrapy crawl quanzhan


Case study

Crawl the complaint list and the content of each complaint from the Sunshine Hotline (阳光问政) site:
http://wz.sun0769.com/political/index/politicsNewest

settings.py

BOT_NAME = 'quanzhanPro'

SPIDER_MODULES = ['quanzhanPro.spiders']
NEWSPIDER_MODULE = 'quanzhanPro.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = 'ERROR'

ITEM_PIPELINES = {
    'quanzhanPro.pipelines.QuanzhanproPipeline': 300,
}

items.py

import scrapy


class QuanzhanproItem(scrapy.Item):
    # define the fields for your item here like:
    quantion_id = scrapy.Field()
    quantion_title = scrapy.Field()


class DetailItem(scrapy.Item):
    quantion_detail = scrapy.Field()
    quantion_detail_id = scrapy.Field()

pipelines.py

from itemadapter import ItemAdapter


class QuanzhanproPipeline:
    fp = None

    def open_spider(self, spider):
        # Overrides a parent-class method: called exactly once when the spider starts
        print('start')
        self.fp = open('./quantion.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        if item.__class__.__name__ == 'DetailItem':
            quantion_detail = item['quantion_detail']
            quantion_detail_id = item['quantion_detail_id']
            self.fp.write(quantion_detail_id + '\n' + quantion_detail + '\n')
        else:
            quantion_id = item['quantion_id']
            quantion_title = item['quantion_title']
            self.fp.write(quantion_id + '\n' + quantion_title + '\n')
        return item

    def close_spider(self, spider):
        print('end')
        self.fp.close()
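The `item.__class__.__name__` check is how this single pipeline tells the two item types apart, since both spider callbacks feed into the same `process_item`. A minimal standalone sketch of that dispatch (plain classes stand in for the scrapy.Item subclasses):

```python
# Stand-ins for the two scrapy.Item subclasses defined in items.py.
class QuanzhanproItem(dict):
    pass


class DetailItem(dict):
    pass


def route(item):
    # Same test the pipeline uses to tell the two item types apart.
    if item.__class__.__name__ == 'DetailItem':
        return 'detail'
    return 'listing'


print(route(DetailItem()))       # detail
print(route(QuanzhanproItem()))  # listing
```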

quanzhan.py

import scrapy
from scrapy.linkextractors import LinkExtractor  # LinkExtractor: link extractor
from scrapy.spiders import CrawlSpider, Rule  # Rule: rule parser
from quanzhanPro.items import QuanzhanproItem, DetailItem


class QuanzhanSpider(CrawlSpider):
    name = 'quanzhan'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest']

    # Link extractors: one for pagination links, one for detail-page links
    link = LinkExtractor(allow=r'id=1&page=\d+')
    link_detail = LinkExtractor(allow=r'index\?id=\d+')

    rules = (
        # Rule parsers
        Rule(link, callback='parse_item', follow=True),
        Rule(link_detail, callback='parse_detail'),
    )

    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
        for li in li_list:
            quantion_id = li.xpath('./span[1]/text()').extract_first()
            quantion_title = li.xpath('./span[3]/a/text()').extract_first()
            item = QuanzhanproItem()
            item['quantion_id'] = quantion_id
            item['quantion_title'] = quantion_title
            yield item

    def parse_detail(self, response):
        quantion_detail_id = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract_first()
        quantion_detail = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre//text()').extract()
        quantion_detail = ''.join(quantion_detail)
        detail_item = DetailItem()
        detail_item['quantion_detail_id'] = quantion_detail_id
        detail_item['quantion_detail'] = quantion_detail
        yield detail_item