Scrapy
Scrapy official website:
Scrapy | A Fast and Powerful Scraping and Web Crawling Framework https://scrapy.org/
A Scrapy-related PDF from Tsinghua University Press, recommended as reference material:
http://www.tup.tsinghua.edu.cn/upload/books/yz/094461-01.pdf
Result
Step
1. Create a virtual environment
Create a dedicated Python virtual environment for the crawler. Any Python version works; I went with Python 3.10 and use Anaconda to manage the environment.
conda create -n Crawler python=3.10 scrapy
This command creates a virtual environment named Crawler with Python 3.10 and installs the scrapy dependency at the same time.
Of course, you can also create the virtual environment with Python's built-in venv instead; look it up if you're curious, or see the quick sketch below.
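A rough venv-based equivalent of the conda command, using the standard library's venv module (the pip path assumes Linux/macOS; on Windows it would be Crawler\Scripts\pip.exe):
import subprocess
import venv

# Create ./Crawler with pip available, then install scrapy into it
venv.create("Crawler", with_pip=True)
subprocess.run(["Crawler/bin/pip", "install", "scrapy"], check=True)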
2. Pick any directory you like, open a terminal (cmd), activate the Crawler environment, and generate the project with these commands
# Activate the virtual environment
conda activate Crawler
# Create the Scrapy project
scrapy startproject NetBiAnSpider
# Enter the project directory
cd NetBiAnSpider
# Generate a spider, specifying its name and the domain it is allowed to crawl
scrapy genspider netbian target_domain
3. Code
The generated directory structure is shown in the figure below:
items.py
import scrapy


class NetbianspiderItem(scrapy.Item):
    src = scrapy.Field()
    alt = scrapy.Field()
spiders/netbian.py
import logging
import scrapy
from NetBiAnSpider.items import NetbianspiderItem
class NetbianSpider(scrapy.Spider):
    name = "netbian"
    # Domain of the *an mobile wallpaper site
    allowed_domains = [""]
    # Generic paginated listing URL of the *an mobile wallpaper site
    start_urls = [""]
    finish_category = []  # categories that have already been fully crawled
    current_category = "动漫"
    current_page = 0

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.classes = {
            # '全部': 2,
            '动漫': 24, '游戏': 25, '美女': 26, '风景': 33, '动物': 35, '文字': 51, '帅哥': 44,
            '汽车': 34, '新年': 36, '鬼刀': 37, '集原美': 38, '英雄联盟': 39, '王者荣耀': 41, '影视': 42, '苹果': 43,
            '女生专用': 45, '宗教': 46, '节日': 47, '可爱': 48, '正能量': 49, '系统': 50, '情侣': 52, '简约': 53,
            # '独家': 56,  # the 独家 (exclusive) category is gone
        }
        self.keys = self.classes.keys()
        self.log(f"All category keys: {self.keys}", logging.INFO)

    def parse(self, response):
        if len(response.body) == 0:
            # An empty body means we have paged past the last page of the current category
            self.logger.info(f"Category {self.current_category} finished, {self.current_page - 1} pages in total")
            self.finish_category.append(self.current_category)
            self.current_page = 0
            for key in self.keys:
                if key not in self.finish_category:
                    self.current_category = key
                    # next() is a generator, so delegate with `yield from` (ChatGPT's tip),
                    # otherwise the request it produces never reaches the engine
                    yield from self.next()
                    self.logger.info(f"Category {self.current_category}, page {self.current_page}")
                    break
        else:
            self.current_page += 1
            # Same here: `yield from` lets the follow-up request keep the crawl going
            yield from self.next()
            self.logger.info(f"Category {self.current_category}, page {self.current_page}")
            images = response.xpath('//li/a/img')
            for image in images:
                # Build a fresh item per image instead of reusing a single instance
                item = NetbianspiderItem()
                item['src'] = image.xpath('@src').get()
                item['alt'] = image.xpath('@alt').get()
                yield item

    def next(self):
        # Generic paginated listing URL of the *an mobile wallpaper site
        url = f""
        self.logger.info(url)
        yield scrapy.Request(url, callback=self.parse)
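A quick note on the `yield from` in parse(): next() is itself a generator (it yields a scrapy.Request), so merely calling it only creates a generator object and the request is never handed to the engine; `yield from` re-yields everything it produces. A minimal plain-Python illustration, no Scrapy involved:
def inner():
    yield "request-1"
    yield "request-2"

def parse_without_delegation():
    inner()              # generator object is created and thrown away
    yield "item"         # only the item comes out; the "requests" are lost

def parse_with_delegation():
    yield from inner()   # the "requests" are re-yielded to the caller
    yield "item"

print(list(parse_without_delegation()))  # ['item']
print(list(parse_with_delegation()))     # ['request-1', 'request-2', 'item']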
pipelines.py
import datetime
import logging
import re
import pymysql
from pymysql import IntegrityError, OperationalError
from NetBiAnSpider import settings
# useful for handling different item types with a single interface
class NetbianspiderPipeline:
    def __init__(self):
        self.db_conn = None
        self.db_cur = None
        try:
            # Connect to MySQL using the values defined in settings.py
            self.db_conn = pymysql.connect(host=settings.HOST, port=settings.PORT, user=settings.USERNAME,
                                           password=settings.PASSWORD, database=settings.DATABASE,
                                           charset='utf8', use_unicode=True)
            self.db_cur = self.db_conn.cursor()  # create a cursor
            logging.info("Database connection established")
        except OperationalError as e:
            logging.error(e)

    def process_item(self, item, spider):
        if not self.db_cur:  # skip persistence if we never got a database cursor
            return item
        preview = item['src']
        # Strip ".jpg" plus the 10 characters in front of it, re-append ".jpg",
        # then drop the "small" marker to get the full-size image URL
        u = preview[:-14] + ".jpg"
        img_url = u.replace("small", "")
        # Extract the year/month/day embedded in the URL path
        match = re.search(r"/(\d{4})/(\d{2})(\d{2})/", u)
        year, month, day = match.group(1), match.group(2), match.group(3)
        dt = datetime.datetime(year=int(year), month=int(month), day=int(day))  # convert to a datetime object
        values = (item['alt'], preview, img_url, dt)
        # Column order must match the table definition below
        sql = "insert into wallpaper(title,preview,url,create_at) values (%s,%s,%s,%s)"
        try:
            self.db_cur.execute(sql, values)
            self.db_conn.commit()
        except IntegrityError:
            logging.info(f"Record already exists: {preview}")
        return item

    def close_spider(self, spider):
        if self.db_cur:
            self.db_cur.close()  # close the cursor
        if self.db_conn:
            self.db_conn.close()  # close the database connection
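To make the URL surgery in process_item() concrete, here is a standalone check against a made-up preview URL (the real host and path pattern are not shown in this post, so everything about the URL below is illustrative only):
import datetime
import re

# Hypothetical thumbnail URL: "small" prefix plus a 10-digit suffix before ".jpg"
preview = "https://example.com/file/2023/0115/smallabcdef1673745600.jpg"

u = preview[:-14] + ".jpg"        # drop ".jpg" and the 10 characters before it, re-append ".jpg"
img_url = u.replace("small", "")  # drop the "small" thumbnail marker

print(u)        # https://example.com/file/2023/0115/smallabcdef.jpg
print(img_url)  # https://example.com/file/2023/0115/abcdef.jpg

match = re.search(r"/(\d{4})/(\d{2})(\d{2})/", u)
dt = datetime.datetime(int(match.group(1)), int(match.group(2)), int(match.group(3)))
print(dt)       # 2023-01-15 00:00:00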
settings.py
import datetime

# Enable the pipeline that stores scraped items in the database
ITEM_PIPELINES = {
"NetBiAnSpider.pipelines.NetbianspiderPipeline": 300,
}
# Log
LOG_LEVEL = 'INFO'
today = datetime.datetime.now()
log_file_path = f'log/scrapy_{today.year}_{today.month}_{today.day}.log'  # the log/ directory must already exist
LOG_FILE = log_file_path
# Database
HOST = "localhost"
PORT = 3306
USERNAME = "root"
PASSWORD = "root"
DATABASE = "wallpaper"
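A side note on the settings import: reading HOST/PORT/etc. straight from settings.py works, but Scrapy also offers a from_crawler hook that hands the loaded project settings to the pipeline, which avoids the hard import. A minimal sketch of that variant, assuming the same keys:
import pymysql

class NetbianspiderPipeline:
    def __init__(self, host, port, user, password, database):
        self.db_conn = pymysql.connect(host=host, port=port, user=user,
                                       password=password, database=database, charset='utf8')
        self.db_cur = self.db_conn.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this hook and passes in the project settings
        s = crawler.settings
        return cls(s.get("HOST"), s.getint("PORT"), s.get("USERNAME"),
                   s.get("PASSWORD"), s.get("DATABASE"))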
wallpaper.wallpaper DDL
CREATE TABLE `wallpaper` (
`id` bigint(20) NOT NULL AUTO_INCREMENT,
`title` varchar(100) CHARACTER SET utf8 COLLATE utf8_unicode_ci DEFAULT NULL,
`preview` varchar(255) CHARACTER SET utf8 COLLATE utf8_unicode_ci DEFAULT NULL,
`url` varchar(255) CHARACTER SET utf8 COLLATE utf8_unicode_ci DEFAULT NULL,
`create_at` date DEFAULT NULL,
PRIMARY KEY (`id`),
UNIQUE KEY `wallpaper_unique` (`preview`)
) ENGINE=MyISAM AUTO_INCREMENT=8555 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
4. Run the spider
scrapy crawl netbian
If nothing major goes wrong, you should see output similar to this.
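If you would rather launch the spider from a plain Python script (handy for the scheduled-execution idea in the TODO list), a minimal sketch using Scrapy's CrawlerProcess could look like this; the script is assumed to sit in the project root next to scrapy.cfg:
# run.py -- assumed to live in the project root, next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from NetBiAnSpider.spiders.netbian import NetbianSpider

process = CrawlerProcess(get_project_settings())  # picks up the project's settings.py
process.crawl(NetbianSpider)
process.start()  # blocks until the crawl finishes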
TODO
- [ ] Incremental crawling (a rough idea is sketched right after this list)
- [ ] Write key information to the log
- [ ] Email notifications
- [ ] Scheduled execution
- [ ] Improve the project and add spiders for other similar sites
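For the incremental-crawling item, one rough idea is an extra pipeline that loads the preview URLs already stored in MySQL when the spider starts and drops anything seen before. This is only a sketch: the table and column names follow the DDL above, the connection values mirror settings.py, and DedupPipeline would still need to be registered in ITEM_PIPELINES.
import pymysql
from scrapy.exceptions import DropItem

class DedupPipeline:
    def open_spider(self, spider):
        # Load every preview URL that is already in the database
        conn = pymysql.connect(host="localhost", port=3306, user="root",
                               password="root", database="wallpaper", charset="utf8")
        with conn.cursor() as cur:
            cur.execute("select preview from wallpaper")
            self.seen = {row[0] for row in cur.fetchall()}
        conn.close()

    def process_item(self, item, spider):
        if item['src'] in self.seen:
            raise DropItem(f"already stored: {item['src']}")
        self.seen.add(item['src'])
        return item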
End
My Python crawling learning path went from extracting data with regular expressions (the re library), to BeautifulSoup (pip install beautifulsoup4), to lxml, and finally to crawler frameworks. Frameworks suit larger projects; day to day I mostly use requests + lxml. For random request headers you can use fake_useragent (pip install fake_useragent), and for proxies there is a nice open-source proxy pool on GitHub that works well. Finally, I hope this project helps someone. (Honestly I'm also here to pad my post count on 52pojie; my old account got wiped, and when registration reopened I signed up again, so I'll take it easy this time.)
GitHub - jhao104/proxy_pool: Python ProxyPool for web spiders: https://github.com/jhao104/proxy_pool
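Since I mentioned the requests + lxml + fake_useragent combo, here is a tiny illustrative sketch of that workflow (example.com stands in for a real page; the XPath matches the one used in the spider above):
import requests
from fake_useragent import UserAgent
from lxml import etree

headers = {"User-Agent": UserAgent().random}  # random browser User-Agent
resp = requests.get("https://example.com", headers=headers, timeout=10)
tree = etree.HTML(resp.text)
for img in tree.xpath("//li/a/img"):
    print(img.get("src"), img.get("alt"))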