settings
# Enable the image pipeline
ITEM_PIPELINES = {
    'mztu.pipelines.ImagesPipelinse': 300,
}
# Root directory for downloads -- note: IMAGES_STORE is mandatory when downloading images!
IMAGES_STORE = r"E:\study\Python\scrapy\mztu\imges"
# Expiry in days: images fetched within the last 90 days are not re-downloaded
IMAGES_EXPIRES = 90
# Thumbnail generation (optional)
#IMAGES_THUMBS = {
#    'small': (50, 50),
#    'big': (270, 270),
#}
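For context on where files land (and why the pipeline below has to move them afterwards): Scrapy's ImagesPipeline saves every image under a "full" subdirectory of IMAGES_STORE, named after the SHA1 hash of the image URL. A minimal sketch of the resulting path (the URL here is made up, for illustration only):

import hashlib
import os

IMAGES_STORE = r"E:\study\Python\scrapy\mztu\imges"
image_url = "http://example.com/sample.jpg"  # hypothetical URL, for illustration

# Scrapy's default naming scheme: full/<sha1-of-url>.<ext>
file_name = hashlib.sha1(image_url.encode("utf-8")).hexdigest() + ".jpg"
print(os.path.join(IMAGES_STORE, "full", file_name))
# e.g. E:\study\Python\scrapy\mztu\imges\full\<40 hex chars>.jpg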
spider directory
# -*- coding: utf-8 -*-
import scrapy
from mztu.items import MztuItem


class ZimdgSpider(scrapy.Spider):
    name = 'zimdg'
    allowed_domains = ['mzitu.com']
    # Build the list of list-page URLs to start from
    start_urls = ['http://www.mzitu.com/xinggan/page/{}/'.format(str(x)) for x in range(118)]

    def parse(self, response):
        # Extract the album links from the list page
        set_li = response.xpath("//div[@class='postlist']/ul/li")
        for each in set_li:
            ed = each.xpath('./a/@href').extract()
            # Follow each album link for the second round of parsing
            yield scrapy.Request(ed[0], callback=self.parse_item)

    def parse_item(self, response):
        # Read the album's total page count from the pagination bar
        offset = int(response.xpath('//div[@class="pagenavi"]/a/span/text()')[4].extract())
        # Generate and visit the URL of every page in the album
        for i in [response.url + "/{}".format(str(x)) for x in range(1, offset + 1)]:
            # Create a fresh item per request, so concurrent requests
            # don't overwrite each other's Referer
            item = MztuItem()
            item['Referer'] = i
            # Hand the item to the next callback via meta
            yield scrapy.Request(item['Referer'], meta={'meta_1': item}, callback=self.parse_ponse)

    def parse_ponse(self, response):
        # Recover the item from meta
        item = response.meta['meta_1']
        # Image URL
        imgs = response.xpath('//div[@class="main-image"]/p/a/img/@src')[0].extract()
        # Album title, used as the directory name
        title = response.xpath('//div[@class="main-image"]/p/a/img/@alt')[0].extract()
        item["title"] = title
        item["imge_url"] = imgs
        yield item
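Both XPaths are easy to sanity-check in an interactive session before launching the full crawl; a quick sketch (the album URL is just a placeholder):

# scrapy shell "http://www.mzitu.com/xinggan/"
response.xpath("//div[@class='postlist']/ul/li/a/@href").extract()[:3]
# -> first few album links from the list page

# scrapy shell "<any album URL from the list above>"
response.xpath('//div[@class="pagenavi"]/a/span/text()').extract()
# -> pagination labels; parse_item() assumes index 4 holds the page count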
items
import scrapy


class MztuItem(scrapy.Item):
    # Album title (used as the directory name)
    title = scrapy.Field()
    # Image URL
    imge_url = scrapy.Field()
    # Referer request header (anti-hotlinking)
    Referer = scrapy.Field()
    # Final path the image is saved to
    image_Path = scrapy.Field()
pipelines
[Python] 纯文本查看 复制代码 # -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# 导入这个包为了移动文件
import shutil
#此包不解释
import scrapy
# 导入项目设置
from scrapy.utils.project import get_project_settings
# 导入scrapy框架的图片下载类
from scrapy.pipelines.images import ImagesPipeline
#此包不解释
import os
class ImagesPipelinse(ImagesPipeline):
#def process_item(self, item, spider):
# return item
# 获取settings文件里设置的变量值
IMAGES_STORE = get_project_settings().get("IMAGES_STORE")
# 重写ImagesPipeline类的此方法
# 发送图片下载请求
def get_media_requests(self, item, info):
image_url = item["imge_url"]
#headers是请求头主要是防反爬虫
yield scrapy.Request(image_url,headers={'Referer':item['Referer']})
def item_completed(self, result, item, info):
image_path = [x["path"] for ok, x in result if ok]
# 定义分类保存的路径
img_path = "%s\%s" % (self.IMAGES_STORE, item['title'])
# 目录不存在则创建目录
if os.path.exists(img_path) == False:
os.mkdir(img_path)
# 将文件从默认下路路径移动到指定路径下
shutil.move(self.IMAGES_STORE + "\\" +image_path[0], img_path + "\\" +image_path[0][image_path[0].find("full\\")+6:])
item['image_Path'] = img_path + "\\" + image_path[0][image_path[0].find("full\\")+6:]
return item
This is the part that sorts the images into separate directories: the key call is shutil.move(), which relocates each image from the default download path into the album-specific directory.
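As a standalone illustration of that final step, this is roughly what the move amounts to (file and album names here are hypothetical):

import os
import shutil

store = r"E:\study\Python\scrapy\mztu\imges"            # IMAGES_STORE
downloaded = os.path.join(store, "full", "abc123.jpg")  # where ImagesPipeline left the file
album_dir = os.path.join(store, "some-album-title")     # target directory from item['title']

os.makedirs(album_dir, exist_ok=True)
# Only the directory changes; the file name is preserved
shutil.move(downloaded, os.path.join(album_dir, "abc123.jpg"))

With all four files in place, the crawl is started from the project root with "scrapy crawl zimdg".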