pip install scrapy
# 创建项目
scrapy startproject bian
# 在项目下创建爬虫文件
scrapy genspider -t crawl bian_pic https://pic.netbian.com
# settings.py
# Browser-like User-Agent string sent with the crawler's requests.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)

# Do not consult the target site's robots.txt before requesting pages.
ROBOTSTXT_OBEY = False

# Keep console output quiet: only ERROR and above are logged.
LOG_LEVEL = "ERROR"

# Upper bound on the number of requests in flight at once.
CONCURRENT_REQUESTS = 32

# Enabled item pipelines, keyed by import path, with their order value.
ITEM_PIPELINES = {"bian.pipelines.BianPipeline": 300}
# items.py
class BianItem(scrapy.Item):
    """Fields collected for a single wallpaper."""

    # Link to the wallpaper's detail page, as found on a listing page.
    href = scrapy.Field()

    # Title attribute of the wallpaper image on the detail page.
    title = scrapy.Field()

    # URL of the wallpaper image on the detail page.
    src = scrapy.Field()
# bian_pic.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from bian.items import BianItem
class BianPicSpider(CrawlSpider):
    """Crawl wallpaper category listings and yield one ``BianItem`` per image.

    The spider starts from the category pages in ``start_urls``, follows the
    pagination links found inside the element with class ``page``, opens every
    wallpaper detail page linked from a listing, and emits an item holding the
    detail link, the image title and the image URL.
    """

    name = "bian_pic"
    # allowed_domains = ["pic.netbian.com"]
    base_url = "https://pic.netbian.com"
    start_urls = [
        "https://pic.netbian.com/4kdongman",
        "https://pic.netbian.com/4kyouxi",
        "https://pic.netbian.com/4kmeinv",
        "https://pic.netbian.com/4kfengjing",
        "https://pic.netbian.com/4kyingshi",
        "https://pic.netbian.com/4kqiche",
        "https://pic.netbian.com/4krenwu",
        "https://pic.netbian.com/4kdongwu",
        "https://pic.netbian.com/4kzongjiao",
        "https://pic.netbian.com/4kbeijing",
        "https://pic.netbian.com/pingban",
        "https://pic.netbian.com/shoujibizhi",
    ]

    # Only pagination anchors are followed; each followed page is handed to
    # ``parse_item`` and is itself scanned for further pagination links.
    link = LinkExtractor(restrict_xpaths='//*[@class="page"]/a')
    rules = (Rule(link, callback="parse_item", follow=True),)

    def parse_start_url(self, response, **kwargs):
        """Parse the listing on each start URL as well.

        CrawlSpider routes responses for ``start_urls`` to this hook rather
        than to the rule callback, so without it the first page of every
        category would only be used for link extraction. Detail requests
        still pass through Scrapy's duplicate filter, so a listing reached
        twice does not produce duplicate items.
        """
        return self.parse_item(response)

    def parse_item(self, response):
        """Yield a request for every wallpaper detail page on a listing page.

        Args:
            response: Response for a category listing page.

        Yields:
            Requests whose callback is ``parse_detail``; the original
            ``href`` value is forwarded through ``cb_kwargs``.
        """
        for anchor in response.xpath('//*[@class="slist"]/ul/li/a'):
            # Only anchors carrying a ``target`` attribute are treated as
            # wallpaper links; the others are skipped.
            if not anchor.xpath("./@target").get():
                continue
            href = anchor.xpath("./@href").get()
            if not href:
                continue
            # ``follow`` resolves ``href`` against the current page URL, so
            # both relative and absolute links work.
            yield response.follow(
                href,
                callback=self.parse_detail,
                cb_kwargs={"href": href},
            )

    def parse_detail(self, response, href=None):
        """Build the item for one wallpaper detail page.

        Args:
            response: Response for a wallpaper detail page.
            href: The link value the detail page was reached through, when
                supplied by ``parse_item``.

        Yields:
            A ``BianItem`` with ``src`` and ``title`` set, plus ``href`` when
            it was provided. Nothing is yielded if the page has no image.
        """
        src = response.xpath('//*[@id="img"]/img/@src').get()
        if not src:
            # Without an image URL there is nothing useful to record.
            self.logger.warning("No image found on %s", response.url)
            return

        title = response.xpath('//*[@id="img"]/img/@title').get()

        item = BianItem()
        if href is not None:
            item["href"] = href
        item["src"] = response.urljoin(src)
        # A missing title becomes an empty string so downstream string
        # formatting does not fail on ``None``.
        item["title"] = title or ""
        yield item
# pipelines.py
class BianPipeline:
    """Write one ``title | src`` line per item to ``pic.txt``."""

    # Output file handle; opened in ``open_spider`` and closed in
    # ``close_spider``.
    fp = None

    def open_spider(self, spider):
        """Open (and truncate) ``pic.txt`` in the current working directory."""
        print("开始写入爬虫文件")
        self.fp = open("pic.txt", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Append the item's title and image URL as one line.

        Args:
            item: Mapping-like item; ``title`` and ``src`` are read from it.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        # A missing or ``None`` field is written as an empty string instead
        # of raising during string formatting.
        title = item.get("title") or ""
        src = item.get("src") or ""
        self.fp.write(f"{title} | {src}\n")
        return item

    def close_spider(self, spider):
        """Close the output file if it was opened."""
        print("写入爬虫完成结束")
        if self.fp is not None:
            self.fp.close()
            self.fp = None
因为是在公司无聊时写的,所以爬到的数据直接写到了文件中,没敢下载图片,怕流量异常。有兴趣的可以在 pipelines 中实现下载图片的方法。