当前位置: 代码迷 >> 综合 >> scrapy-redis 持久化爬取吉他社吉他谱
  详细解决方案

scrapy-redis 持久化爬取吉他社吉他谱

热度:66   发布时间:2023-12-06 05:18:02.0

我做了一个导航站(域名是挂路灯的全拼gualudeng.com),里面精选了各种影视,动漫,黑科技,实用工具,搞笑有趣的站点,动动大家可爱的小手,点进来看看吧,良心站点。

1.爬虫文件

# -*- coding: utf-8 -*-
"""Spider that crawls guitar tabs from jitashe.org.

Walks both the "new" and "hot" listing tabs, follows pagination, and for
every tab page extracts the GuitarPro download link into a GtsheItem.
The site requires a logged-in session, so every single request must carry
the session cookies (scrapy-redis persistence re-issues requests, and a
request without cookies gets a login page instead of a tab).
"""
import copy
import sys

import scrapy

from gtshe.items import GtsheItem


class MusicSpider(scrapy.Spider):
    name = 'music'
    allowed_domains = ['jitashe.org']
    start_urls = ["https://www.jitashe.org"]

    # Raw Cookie header copied from a logged-in browser session.
    cookie = "yGhj_40fe_saltkey=IDLlBPKk; yGhj_40fe_lastvisit=1551500410; yGhj_40fe_auth=e9f8%2FuHsl%2BbAhP%2BGint%2FUgLktBBjjf3EvlF0TXj4ZXWHe4Z%2Bcbge1LRi21zG6TL19UdsJLoP8sZmZAE%2B3iutAYxvfGg; yGhj_40fe_lastcheckfeed=644660%7C1551504015; yGhj_40fe_lip=60.176.42.168%2C1551504015; yGhj_40fe_pushuid=8430419; yGhj_40fe_pushgid=49982; yGhj_40fe_connect_is_bind=1; yGhj_40fe_st_p=644660%7C1551540151%7C13551060efe5a9679df4c4e9e02ed7a1; yGhj_40fe_viewid=tid_1336137; yGhj_40fe_ulastactivity=1551587332%7C0; yGhj_40fe_checkpm=1; yGhj_40fe_noticeTitle=1; Hm_lvt_4ad169a3774e8f5be3c7945513632bde=1551504009,1551515651,1551522942,1551587351; Hm_lpvt_4ad169a3774e8f5be3c7945513632bde=1551587351; yGhj_40fe_lastact=1551587332%09misc.php%09patch"
    # split("=", 1) instead of split("=") so a value containing '=' is not
    # truncated at the second '='.
    cookies = {i.split("=", 1)[0]: i.split("=", 1)[1] for i in cookie.split("; ")}

    def start_requests(self):
        """Kick off the crawl with the session cookies attached."""
        yield scrapy.Request(self.start_urls[0],
                             callback=self.parse,
                             cookies=self.cookies)

    def parse(self, response):
        """Fan out to the 'new' and 'hot' listing tabs.

        A deep copy of the item goes into each request's meta so the two
        branches (and later pagination) never share one mutable item.
        """
        new = "https://www.jitashe.org/guide/newtab/t1/"
        hot = "https://www.jitashe.org/guide/hottab/t1/"
        item = GtsheItem()
        item['cat'] = "new"
        yield scrapy.Request(new,
                             callback=self.parse1,
                             meta={'item': copy.deepcopy(item)},
                             cookies=self.cookies)
        item['cat'] = "hot"
        # BUG FIX: the original omitted cookies on this request, so the
        # whole "hot" branch was crawled without a session.
        yield scrapy.Request(hot,
                             callback=self.parse1,
                             meta={'item': copy.deepcopy(item)},
                             cookies=self.cookies)

    def parse1(self, response):
        """Parse one listing page: yield a request per tab, then paginate."""
        url_list = ["https://www.jitashe.org" + href
                    for href in response.xpath("//a[@class='title']/@href").extract()]
        name_list = response.xpath("//a[@class='title']/text()").extract()
        # Snapshot of the category item taken BEFORE the loop mutates
        # response.meta['item']; reused for the next-page request.
        next_item = copy.deepcopy(response.meta['item'])
        for index, url in enumerate(url_list):
            item = response.meta['item']
            item['name'] = name_list[index]
            yield scrapy.Request(url=url,
                                 meta={'item': copy.deepcopy(item)},
                                 callback=self.parse2,
                                 cookies=self.cookies)  # 为了实现持久化必须每次访问都携带cookies
        # Follow pagination if a "next" link exists.
        next_url = response.xpath('//a[@class="nxt"]/@href').extract()
        if len(next_url) != 0:
            # BUG FIX: the original next-page request carried no cookies,
            # contradicting the rule stated on the per-tab request above.
            yield scrapy.Request(url="https://www.jitashe.org/" + next_url[0],
                                 meta={'item': copy.deepcopy(next_item)},
                                 callback=self.parse1,
                                 cookies=self.cookies)

    def parse2(self, response):
        """Parse a tab page and emit the item with its download URL."""
        url = response.xpath("//a[@id='gtp_download']/@href").extract_first()
        if url is not None:
            gtp_url = "https://www.jitashe.org" + url
            item = response.meta['item']
            item['url'] = gtp_url
            print("开始爬取:" + item['name'])
            yield item

2.配置文件

# -*- coding: utf-8 -*-
"""Scrapy settings for the gtshe project.

Combines the standard Scrapy project settings with the scrapy-redis
scheduler/dupefilter so the crawl state survives restarts (persistent,
resumable crawling backed by a local Redis).
"""

BOT_NAME = 'gtshe'
SPIDER_MODULES = ['gtshe.spiders']
NEWSPIDER_MODULE = 'gtshe.spiders'

# --- scrapy-redis configuration -------------------------------------------
# Redis-backed duplicate filter + scheduler; SCHEDULER_PERSIST keeps the
# request queue and the seen-fingerprints set in Redis between runs.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"

# --- crawl behaviour -------------------------------------------------------
ROBOTSTXT_OBEY = False
# One request at a time: gentle on the site and keeps the per-request
# cookie handling deterministic.
CONCURRENT_REQUESTS = 1
# Destination directory for FilesPipeline downloads (Windows path).
FILES_STORE = "G:/Eclipse_p/scrapy/gtshe/gtp_forum"

#COOKIES_DEBUG=True
#LOG_LEVEL="WARNING"

# NOTE: the original file defined DEFAULT_REQUEST_HEADERS twice with the
# same value; the duplicate has been removed.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

DOWNLOADER_MIDDLEWARES = {
    'gtshe.middlewares.GtsheDownloaderMiddleware': 543,
}

ITEM_PIPELINES = {
    'gtshe.pipelines.GtshePipeline': 300,
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'gtshe (+http://www.yourdomain.com)'

# Pool of User-Agent strings; GtsheDownloaderMiddleware picks one at random
# per request.
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Opera/8.0 (Windows NT 5.1; U; en)",
    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
]

3.item文件

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class GtsheItem(scrapy.Item):
    """One guitar tab scraped from jitashe.org."""
    name = scrapy.Field()  # tab title, from the listing page link text
    url = scrapy.Field()   # absolute download URL of the GuitarPro file
    cat = scrapy.Field()   # listing category the tab came from: "new" or "hot"

4.下载中间件

# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random


class GtsheDownloaderMiddleware(object):
    """Rotate the User-Agent header on every outgoing request.

    Picks a random entry from the USER_AGENT_LIST setting so consecutive
    requests do not all present the same browser fingerprint.
    """

    def process_request(self, request, spider):
        # Scrapy's Headers container normalises key case, so the
        # all-caps key still sets the standard User-Agent header.
        request.headers['USER-AGENT'] = random.choice(
            spider.settings.get('USER_AGENT_LIST'))

5.管道文件

# -*- coding: utf-8 -*-
"""Pipeline that downloads each tab's GuitarPro file via FilesPipeline.

Files are stored under FILES_STORE/<cat>/<server-supplied filename>; the
filename is only known at download time from the Content-Disposition
response header, hence the file_downloaded override.
"""
import os

import scrapy
from scrapy.pipelines.files import FilesPipeline
from scrapy.utils.misc import md5sum

try:
    from cStringIO import StringIO as BytesIO  # Python 2
except ImportError:
    from io import BytesIO  # Python 3


class GtshePipeline(FilesPipeline):

    def get_media_requests(self, item, spider):
        """Request the file URL, carrying the item for the path hooks below."""
        yield scrapy.Request(item['url'], meta={'item': item})

    def file_path(self, request, response=None, info=None):
        """Return only the category directory ("new/" or "hot/").

        The filename cannot be derived from the URL; file_downloaded
        appends it from the response headers.
        """
        item = request.meta['item']
        return item['cat'] + "/"

    def file_downloaded(self, response, request, info):
        """Persist the downloaded body under the header-supplied filename."""
        path = self.file_path(request, response=response, info=info)
        file_name = response.headers.get('Content-Disposition')
        if file_name is None:
            # No Content-Disposition means the session cookie has expired
            # and the server returned a login page instead of the file.
            # Hard-exit the whole process so no bogus files are stored;
            # scrapy-redis keeps the queue in Redis, so the crawl resumes
            # from here after re-logging in.
            # NOTE(review): os._exit skips all Scrapy shutdown hooks —
            # raising CloseSpider would be the cleaner stop; confirm intent.
            print("爬虫关闭")
            os._exit(0)
        # Header looks like: attachment; filename="xxx.gp5" — take the text
        # between the first pair of double quotes.
        # NOTE(review): raises IndexError if the header has no quotes.
        path = path + str(file_name, 'utf-8').split("\"")[1]
        buf = BytesIO(response.body)
        checksum = md5sum(buf)
        buf.seek(0)
        self.store.persist_file(path, buf, info)
        return checksum