I saw that someone on a forum had uploaded some photo sets. Having just read about Scrapy's CrawlSpider, I figured I'd crawl all of this model's photo sets from the site.
CrawlSpider makes extracting the links really convenient.
spider file
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from mid.items import XiurenjiItem


class SpidernameSpider(CrawlSpider):
    name = 'spidername'
    # allowed_domains = ['www.xxx.com']
    # one of the search-result pages
    start_urls = ['https://www.xiurenji.cc/plus/search/index.asp?keyword=egg&searchtype=title&p=4']
    # link extractor: picks up the links to every page of the search results
    link = LinkExtractor(allow=r'title&p=\d+')
    rules = (
        Rule(link, callback='parse_item', follow=True),
    )
    # per-spider override for where the downloaded images are stored locally
    custom_settings = {
        'IMAGES_STORE': r'W:\HDTV\写真\女神egg'
    }

    # parse each page of the search results
    def parse_item(self, response):
        # the <a> tags of the photo sets on this result page
        title_tag = response.xpath('//div[@class="list"]//div[@class="title1"]/a')
        # loop over the <a> tags to get each set's title and url
        for title in title_tag:
            t = ''.join(title.xpath('.//text()').extract())
            u = f'https://www.xiurenji.cc/{title.xpath("./@href").extract_first()}'
            # issue the request manually and pass the current set's title along via meta
            yield scrapy.Request(url=u, callback=self.parse_page, meta={'title': t})

    # parse one photo set
    def parse_page(self, response):
        # the set's title passed in via meta
        title = response.meta['title']
        # a set is paginated too, with a few photos per page; grab the photos on this page
        p = response.xpath('//div[@class="img"]/p/img')
        # loop over the photos and build each image url
        for img in p:
            # the src is relative, so prepend the site root
            img_url = 'https://www.xiurenji.cc' + img.xpath('./@src').extract_first()
            # instantiate the item that gets handed to the pipeline
            item = XiurenjiItem()
            item['url'] = img_url
            # group images by title; scrapy creates the folders in the path automatically
            item['path'] = title + '\\' + img_url.split('/')[-1]
            # hand the item to the pipeline
            yield item
        # url of the next page within the set
        next_url = response.xpath('//div[@class="page"]/a[text()="后"]/@href').extract_first()
        # if there is one, call this same callback again
        if next_url:
            next_url = f'https://www.xiurenji.cc{next_url}'
            # remember to pass the title along again
            yield scrapy.Request(url=next_url, callback=self.parse_page, dont_filter=True, meta={'title': title})
pipelines file
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
import time
# module needed when downloading images/videos through a pipeline
from scrapy.pipelines.images import ImagesPipeline


class ImgDownPipeline(ImagesPipeline):
    # request the image url carried in the item
    def get_media_requests(self, item, info):
        # pass the item along via meta so file_path() can read it
        yield scrapy.Request(url=item['url'], meta={'item': item})

    # decide where each image is saved
    def file_path(self, request, response=None, info=None):
        # the file name (or relative path) carried in the item
        img_path = request.meta['item']['path']
        # returning a relative path is enough; any folders in it are created automatically
        return img_path

    # pass the item on to the next pipeline
    def item_completed(self, results, item, info):
        return item


class OpenDocPipeline(object):
    # `spider` is the running spider instance, so its class attributes can be accessed directly
    def open_spider(self, spider):
        self.start_time = time.perf_counter()
        print('crawl started'.center(50, '='))

    # handle each item that reaches this pipeline
    def process_item(self, item, spider):
        return item  # pass the item on to the next pipeline

    def close_spider(self, spider):
        print(f'finished in {time.perf_counter() - self.start_time:.1f} s')
        print('crawl finished'.center(50, '='))
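For reference, the relative path returned by file_path() ends up joined under IMAGES_STORE; a minimal illustration with hypothetical values (the title and file name below are made up):

import os

IMAGES_STORE = r'W:\HDTV\写真\女神egg'       # same value as in the settings
relative_path = 'some_set_title\\0001.jpg'   # hypothetical value returned by file_path()
# the images pipeline joins the two, creating the folder if needed:
print(os.path.join(IMAGES_STORE, relative_path))
# -> W:\HDTV\写真\女神egg\some_set_title\0001.jpg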
middlewares file
import random


# a custom downloader middleware that randomises the User-Agent
class UseragentrandomDownloaderMiddleware(object):
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2919.83 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36',
        'Mozilla/5.0 (X11; Ubuntu; Linux i686 on x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2820.59 Safari/537.36'
    ]

    def process_request(self, request, spider):
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
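If you'd rather not hard-code the UA list in the middleware, the same idea can read it from settings via from_crawler. This is just a sketch, not part of the original project; USER_AGENT_LIST is an assumed custom setting name, not a Scrapy built-in:

import random


class SettingsUserAgentMiddleware(object):
    # sketch only: same random-UA idea, but the list lives in settings.py
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # USER_AGENT_LIST is a hypothetical custom setting holding the UA strings
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)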
settings file (enable the options shown below)
# don't obey robots.txt
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
IMAGES_STORE = r'W:\HDTV\写真\女神egg'
# redirects must be allowed for media requests, otherwise the images won't download
MEDIA_ALLOW_REDIRECTS = True
DOWNLOADER_MIDDLEWARES = {
    'mid.middlewares.UseragentrandomDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'mid.pipelines.ImgDownPipeline': 300,
    'mid.pipelines.OpenDocPipeline': 500
}
items file
import scrapy


class XiurenjiItem(scrapy.Item):
    url = scrapy.Field()
    path = scrapy.Field()
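With the files above in place, the crawl can be started from the project root with `scrapy crawl spidername`, or from a small launcher script. The sketch below assumes the project name is mid and the spider file is mid/spiders/spidername.py; adjust the import to match the actual module path:

# run.py, placed at the Scrapy project root (sketch only)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from mid.spiders.spidername import SpidernameSpider  # assumed module path

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl(SpidernameSpider)
    process.start()  # blocks until the crawl finishes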