A few things worth noting about this example:
- It enables Scrapy's built-in HTTP/2 support.
- It uses Scrapy's images pipeline to download the pictures.
What it does:
1. Starting from a given wallpaper-category URL, it automatically enters each wallpaper's detail page and follows the "next page" links so every image in a set is crawled.
Full source code is attached: umeiwallpaper.zip (6.6 KB)
Images are saved under the project's Downloads directory, using the title scraped from the site as the subdirectory: Downloads/<wallpaper title>/xxx
(The model photos are a bit too risqué, so the demo only downloads the other wallpaper categories.)
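For illustration only (the titles here are hypothetical; the directory names come from the scraped page titles and the file names from the detail-page URLs, as the pipeline and spider code below show), a finished run ends up looking roughly like:

Downloads/
    <wallpaper title A>/
        235660.jpg
        235660_2.jpg
    <wallpaper title B>/
        ...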
Below is the code for the core files:
items.py
import scrapy


class UMeiItem(scrapy.Item):
    name = scrapy.Field()
    # To use the images pipeline this field must be named exactly "image_urls";
    # otherwise you would have to subclass the pipeline and override the relevant
    # methods. The download URLs of the images go into this field.
    image_urls = scrapy.Field()
    # "images" receives the download results written back by the pipeline
    images = scrapy.Field()
    dirname = scrapy.Field()
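A side note that is not in the original post: after the downloads succeed, the images pipeline writes a list of result dicts back into the `images` field. The shape is roughly as below (values made up for illustration; recent Scrapy versions also add the "status" key):

# item["images"] after the pipeline has run -- illustrative values only
# [{"url": "https://.../235660.jpg",
#   "path": "<wallpaper title>/235660.jpg",
#   "checksum": "<md5 of the downloaded file>",
#   "status": "downloaded"}]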
pipelines.py
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline


class MyImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # The returned path is relative to IMAGES_STORE (set in settings.py)
        adapter = ItemAdapter(item)
        img_name = adapter.get("name")
        dirname = adapter.get("dirname")
        # print(f"{img_name=}")
        return f"{dirname}/{img_name}.jpg"
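One optional hardening step, not in the original code: titles scraped from the page may contain characters that are not legal in file paths (slashes, colons, and so on), so a variant of file_path could sanitize the directory name first. A minimal sketch, assuming the same item fields as above:

import re

from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline


class MyImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        adapter = ItemAdapter(item)
        img_name = adapter.get("name")
        # Replace path-hostile characters before using the title as a directory name
        dirname = re.sub(r'[\\/:*?"<>|]', "_", adapter.get("dirname") or "untitled")
        return f"{dirname}/{img_name}.jpg"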
settings.py
import os.path
BOT_NAME = "umeiwallpaper"
SPIDER_MODULES = ["umeiwallpaper.spiders"]
NEWSPIDER_MODULE = "umeiwallpaper.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 0.2
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Route https requests through Scrapy's HTTP/2 download handler (see the note after this file)
DOWNLOAD_HANDLERS = {
    "https": "scrapy.core.downloader.handlers.http2.H2DownloadHandler",
}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "umeiwallpaper.pipelines.MyImagePipeline": 300,
}
# Root directory where the images pipeline stores its downloads
BASE_DIR = os.path.dirname(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(BASE_DIR, "Downloads")
# These two field names can be customized, but the values below are already the defaults
IMAGES_URLS_FIELD = "image_urls"
IMAGES_RESULT_FIELD = "images"
# Allow redirects for media downloads (some http URLs redirect to https;
# without this the download just fails with a 301 status code)
MEDIA_ALLOW_REDIRECTS = True
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 1
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 5
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
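A note on the HTTP/2 handler configured above (not from the original post): Scrapy's H2DownloadHandler relies on Twisted's optional HTTP/2 dependencies, which are typically installed with pip install "Twisted[http2]". Also, since only the https scheme is mapped in DOWNLOAD_HANDLERS, plain http:// requests still go through the default HTTP/1.1 handler.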
The spider, umei.py
import scrapy
from scrapy.http import HtmlResponse

from umeiwallpaper.items import UMeiItem


class UmeiSpider(scrapy.Spider):
    name = "umei"
    allowed_domains = ["umei.cc"]
    # start_urls = ["https://www.umei.cc/meinvtupian/meinvmote/235660.htm"]
    start_urls = ["https://www.umei.cc/meinvtupian/meinvmote/"]

    def parse(self, response: HtmlResponse, **kwargs):
        # Category page: collect the link to every wallpaper set
        lists_xpath = "//div[@class='item masonry_brick']/div/div[@class='img']/a"
        selector_list = response.xpath(lists_xpath)
        for selector in selector_list:
            url = selector.xpath("./@href").get()
            # title = selector.xpath("./img/@alt").get()
            print(f"list page: {url=}")
            # print(f"{title=}")
            yield scrapy.Request("https://www.umei.cc" + url, callback=self.parse_item, dont_filter=False)

    def parse_item(self, response: HtmlResponse):
        # A set is paginated as xxx.htm, xxx_2.htm, ..., xxx_N.htm; read N from the
        # "尾页" (last page) link, or fall back to a single page if there is none.
        last_page = response.xpath('//a[contains(text(),"尾页")]/@href').get()
        if last_page:
            count = int(last_page.split("/")[-1].rsplit(".", 1)[0].split("_")[1])
        else:
            count = 1
        # print(f"{count=}")
        for i in range(count):
            if i == 0:
                # url = self.start_urls[0]
                url = response.url
            else:
                url = f"{response.url.rsplit('.', 1)[0]}_{i + 1}.htm"
            print(f"{url=}")
            yield scrapy.Request(url, callback=self.parse_detail, dont_filter=True)

    def parse_detail(self, response: HtmlResponse):
        # Detail page: one full-size image per page
        img_url = response.xpath("//div[@class='big-pic']/a/img/@src").get()
        name = response.url.split("/")[-1].split(".")[0]
        title = response.xpath('//div[contains(@class, "imgtitle")]/h1/text()').get()
        u_mei_item = UMeiItem()
        u_mei_item["name"] = name
        u_mei_item["dirname"] = title
        u_mei_item["image_urls"] = [img_url, ]
        # print(u_mei_item)
        yield u_mei_item
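To start the crawl, run scrapy crawl umei from the project root. Alternatively, a small runner script can launch it programmatically; a minimal sketch (the file name run.py is just a suggestion, not part of the original project):

# run.py - start the "umei" spider without the scrapy CLI
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())  # picks up the settings.py shown above
    process.crawl("umei")  # the spider name defined in UmeiSpider.name
    process.start()  # blocks until the crawl finishes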