# ---- BeautifulPic.py ----
#coding=utf-8
'''
author 小丑
'''
from scrapy import Request
from scrapy.spiders import Spider
from ..items import BeautifulItem
import re
class DoubanProject(Spider):
    """Spider that crawls meitulu.com category listing pages, visits each
    gallery ("item") page, and yields items carrying image URLs plus the
    alt-text names used later by the image pipeline as file names."""
    name = 'pics'

    def start_requests(self):
        # Pages to crawl: currently only page 2 of the category listing;
        # widen the range to crawl more pages.
        urls = ["https://www.meitulu.com/t/nvshen/{}.html".format(i)
                for i in range(2, 3)]
        for url in urls:
            yield Request(url)

    def parse(self, response):
        """Parse a listing page: extract every gallery's item code and
        request its detail page."""
        info_url = response.xpath("//ul[@class='img']/li/a/@href").extract()
        # Item codes embedded in the hrefs, e.g. "item/1234.html" -> "1234".
        codes = re.findall(r"item/(.*?)\.html", str(info_url))
        # BUG FIX: the original hard-coded range(60), which raises IndexError
        # whenever a page lists fewer than 60 galleries; iterate over what was
        # actually found. Also printed the wrong variable (the whole href list
        # instead of the URL being requested), and passed an empty item via
        # meta that parse_detail never used.
        for code in codes:
            detail_url = "https://www.meitulu.com/item/" + code + ".html"
            print(detail_url)
            yield Request(detail_url, callback=self.parse_detail)

    def parse_detail(self, response):
        """Parse one gallery page: yield an item with the image URLs and
        their alt-text names, then follow the pager links."""
        item = BeautifulItem()
        # Absolute XPaths kept from the original page layout;
        # NOTE(review): assumes the gallery images live under body/div[4] —
        # confirm against the live page structure.
        item["pic"] = response.xpath(
            "/html/body/div[4]/center/img/@src").extract()
        item["name"] = response.xpath(
            "/html/body/div[4]/center/img/@alt").extract()
        yield item
        # BUG FIX: the original followed a hard-coded range(3) of pager links,
        # crashing when fewer exist. Follow every pager link instead; Scrapy's
        # built-in duplicate filter prevents re-visiting pages.
        pager = response.xpath("//div[@id='pages']/a/@href").extract()
        for code in re.findall(r"item/(.*?)\.html", str(pager)):
            next_url = "https://www.meitulu.com/item/" + code + ".html"
            yield Request(next_url, callback=self.parse_detail)
# ---- items.py ----
import scrapy
class BeautifulItem(scrapy.Item):
    """Container for one gallery's scraped data."""
    # Picture title(s), taken from the <img> alt text; the image pipeline
    # uses these as the saved file names.
    name = scrapy.Field()
    # Image URL(s) to download.
    pic = scrapy.Field()
# ---- pipelines.py ----
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
from scrapy.exceptions import DropItem
import logging
# Module-level logger for the image pipeline.
# NOTE(review): the pipeline below calls logging.debug (the root logger)
# directly instead of this logger — confirm which is intended.
log=logging.getLogger("SavepicturePipeline")
class BeautifulPipeline(object):
    """Pass-through pipeline: forwards every item unchanged."""

    def process_item(self, item, spider):
        # No transformation needed here; the image pipeline does the work.
        return item
class SavepicturePipeline(ImagesPipeline):
    """Downloads each item's images and names every saved file after the
    alt-text at the same position in item["name"]."""

    def get_media_requests(self, item, info):
        # BUG FIX: the original used item["pic"].index(img_url), which returns
        # the FIRST occurrence — duplicate URLs all mapped to one name (and it
        # rescanned the list per request). enumerate yields the true position.
        for index, img_url in enumerate(item["pic"]):
            yield Request(url=img_url, meta={'item': item, 'index': index})

    def item_completed(self, results, item, info):
        """Drop the item unless every image download succeeded."""
        # BUG FIX: the original checked only results[0][0], ignoring failures
        # of later images and raising IndexError when results was empty.
        if not all(ok for ok, _ in results):
            raise DropItem("下载失败")
        # Use the module logger instead of the root logger.
        log.debug("下载成功")
        return item

    def file_path(self, request, response=None, info=None):
        """Return the relative save path: "<alt-text>.jpg" for this image."""
        item = request.meta["item"]
        index = request.meta["index"]
        # Assumes item["name"] is parallel to item["pic"] (both scraped from
        # the same <img> tags).
        return item["name"][index] + ".jpg"
# ---- settings.py ----
# Project identity and spider-module discovery.
BOT_NAME = 'Beautiful'
SPIDER_MODULES = ['Beautiful.spiders']
NEWSPIDER_MODULE = 'Beautiful.spiders'

# Directory where ImagesPipeline writes the downloaded files.
IMAGES_STORE = "D:\\picture"

# Default headers sent with every request.
# BUG FIX: 'USER_AGENT' is not a real HTTP header name — browsers send the
# UA string under 'User-Agent' (it can also be set via the top-level
# USER_AGENT setting). The Referer value also needs a scheme to be a valid
# absolute URL.
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Referer': 'https://www.google.com',
}

# Pipeline order: pass-through pipeline first, then the image downloader.
ITEM_PIPELINES = {
    'Beautiful.pipelines.BeautifulPipeline': 300,
    'Beautiful.pipelines.SavepicturePipeline': 400,
}
可以设置很多请求头或者随机ip,但是我感觉没太大必要,就练个手。
大家看个乐就好,我是初学的,如果有不足的地方我很乐意接受批评,敬请指正。