爬取爱套图网图片:https://github.com/EExplode/scrapy_aitaotu
一.保存至MongoDB
import pymongoclass MongoPipeline(object):# 初始化参数def __init__(self, mongo_uri, mongo_db):self.mongo_uri = mongo_uriself.mongo_db = mongo_db# 以依赖注入的方式获取settings.py中的配置信息@classmethoddef from_crawler(cls, crawler):return cls(mongo_uri=crawler.settings.get('MONGO_URI'),mongo_db=crawler.settings.get('MONGO_DB'))# Spider开启时,初始化数据库连接def open_spider(self, spider):self.client = pymongo.MongoClient(self.mongo_uri)self.db = self.client[self.mongo_db]# 将item插入MongoDB,集合定义在item中def process_item(self, item, spider):self.db[item.collection].insert(dict(item))return item# Spider结束后,关闭数据库连接def close_spider(self, spider):self.client.close()
二.保存至MySQL
import pymysqlclass MysqlPipeline(object):def __init__(self, host, database, user, password, port):self.host = hostself.database = databaseself.user = userself.password = passwordself.port = port@classmethoddef from_crawler(cls, crawler):return cls(host=crawler.settings.get('MYSQL_HOST'),database=crawler.settings.get('MYSQL_DATABASE'),user=crawler.settings.get('MYSQL_USER'),password=crawler.settings.get('MYSQL_PASSWORD'),port=crawler.settings.get('MYSQL_PORT'),)def open_spider(self, spider):self.db = pymysql.connect(self.host, self.user, self.password, self.database, charset='utf8', port=self.port)self.cursor = self.db.cursor()def close_spider(self, spider):self.db.close()def process_item(self, item, spider):data = dict(item)keys = ', '.join(data.keys())values = ', '.join(['%s'] * len(data))sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)self.cursor.execute(sql, tuple(data.values()))self.db.commit()return item
三.图片下载管道
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request
import reclass ImagePipeline(ImagesPipeline):# 返回文件名及其相对路径# 'https://img.aitaotu.cc:8089/Pics/2020/0115/22/04.jpg'# '/Pics/2020/0115/22/04.jpg'def file_path(self, request, response=None, info=None):return re.search('(.*)8089(.*)', request.url).group(2)# 若下载失败,则抛出异常def item_completed(self, results, item, info):image_paths = [x['path'] for ok, x in results if ok]if not image_paths:raise DropItem('Image Downloaded Failed')return item# 设置请求头,下载图片def get_media_requests(self, item, info):headers = {"Referer": item['referer'],"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}yield Request(item['image'], headers=headers)
四.其他文件
items.py
import scrapyclass AitaotuItem(scrapy.Item):collection = table = 'images'# 图片链接image = scrapy.Field()# 图片链接请求头关键参数,不然下载出现404referer = scrapy.Field()# 图片名称image_name = scrapy.Field()
settings.py
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 0.5ITEM_PIPELINES = {'Aitaotu.pipelines.ImagePipeline': 300,'Aitaotu.pipelines.MongoPipeline': 301,'Aitaotu.pipelines.MysqlPipeline': 302,
}# 图片管道配置
IMAGES_STORE = './images'# mongodb配置
MONGO_URI = 'localhost'
MONGO_DB = 'aitaotu'# msyql配置
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
MYSQL_DATABASE = 'aitaotu'
五.Feed 导出
scrapy crawl aitaotu -o images.json
scrapy crawl aitaotu -o images.jsonlines
scrapy crawl aitaotu -o images.csv
scrapy crawl aitaotu -o images.xml