本帖最后由 情绪666 于 2021-11-26 12:07 编辑
爬取表情包
一、爬取页面: https://fabiaoqing.com/biaoqing
- Fix:
2021/11/26: 1. 根据图片内容动态判断图片类型后再保存到本地 2. 文件名过长会导致无法保存到本地,因此只截取前 120 个字符
二、能学到什么(本人并非 Python 专业,如有错误请指正,看到后会及时调整修改;本文比较适合爬虫入门,基本上不是前后端分离的网站都可以使用)
效果展示
代码区域
# -*- coding: utf-8 -*-
# Time : 2021/11/23 10:39
# Author : Melon
# Site :
# Note :
# File : biaoqingbao.py
# Software: PyCharm
import asyncio
import imghdr
import os
import re
import time
import requests
from lxml import etree
from tqdm import tqdm
# Failure counter for the image currently being downloaded.
# get_img() resets it to 0 before each image; saveImg() increments it on every
# failed attempt and stops retrying once it reaches 3.
err_num = 0
def _image_extension(content, image_url):
    """Return a file extension (without dot) for downloaded image bytes."""
    # imghdr.what() returns None when it does not recognise the header
    # (this happens for some valid JPEGs), so it cannot be concatenated blindly.
    # NOTE(review): imghdr is deprecated since Python 3.11 and removed in 3.13.
    detected = imghdr.what(None, content)
    if detected:
        return detected
    # Fall back to the extension in the URL (query string stripped).
    last_segment = image_url.split("?", 1)[0].rsplit("/", 1)[-1]
    url_ext = os.path.splitext(last_segment)[1].lstrip(".")
    # Only trust a plain alphanumeric suffix; otherwise guess the most common format.
    return url_ext if url_ext.isalnum() else "jpg"


async def saveImg(image_url, file_name):
    """Download one image and store it under ``./表情包``.

    The extension is derived from the downloaded bytes, falling back to the
    extension in the URL and finally to ``jpg``.

    Failures are never raised to the caller. Each failed attempt increments the
    module-level ``err_num`` counter and prints the error; the download is
    retried after a 3-second pause until the counter reaches 3, at which point
    the image is skipped.

    Args:
        image_url: Address of the image to download.
        file_name: Desired file name without extension (e.g. the image title).
    """
    global err_num
    # Windows forbids these characters in file names; replace each run with "_".
    file_name = re.sub(r'[\\/:*?"<>|\r\n]+', "_", file_name)
    # Overlong names cannot be written to disk, so keep the first 120 characters.
    file_name = file_name[:120]
    # Target directory name.
    dir_name = './表情包'
    while True:
        try:
            # Create the directory if needed; exist_ok avoids a check-then-create race.
            os.makedirs(dir_name, exist_ok=True)
            # Throttle requests without blocking the event loop.
            await asyncio.sleep(0.5)
            # NOTE: requests itself is blocking; the timeout keeps a stalled
            # connection from hanging the crawl forever.
            result = requests.get(image_url, timeout=10)
            # Treat HTTP errors as failures instead of saving an error page.
            result.raise_for_status()
            extension = _image_extension(result.content, image_url)
            target = os.path.join(dir_name, file_name + '.' + extension)
            with open(target, "wb") as f:
                f.write(result.content)
            return
        except Exception as e:
            # Deliberate best-effort boundary: one bad image must not abort the
            # crawl. Log, then retry after 3 s unless the error budget is used up.
            err_num += 1
            print("\nError Total %s Sleep 3 saveImg:%s Message: %s" % (err_num, image_url, e))
            if err_num >= 3:
                return
            await asyncio.sleep(3)
async def get_img(start_page, end_page):
    """Crawl listing pages and download every image found on them.

    Pages ``start_page`` .. ``end_page - 1`` of
    ``https://fabiaoqing.com/biaoqing/lists/page/<n>.html`` are fetched in
    order. A page that cannot be fetched is reported and skipped so that a
    single network error does not end the whole crawl.

    Args:
        start_page: First page number to crawl (inclusive).
        end_page: Page number at which to stop (exclusive).
    """
    global err_num
    for i in range(start_page, end_page):
        url = "https://fabiaoqing.com/biaoqing/lists/page/{}.html".format(i)
        try:
            response = requests.get(url=url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print("\nSkip page %s: %s" % (i, e))
            continue
        # Parse with XPath; etree.HTML() may return None for an empty body.
        html = etree.HTML(response.content)
        divs = html.xpath("//div[@class='tagbqppdiv']") if html is not None else []
        # One progress bar per page, labelled with the page number.
        pbar = tqdm(total=len(divs))
        try:
            pbar.set_description("Processing 第%s页" % i)
            for index, div in enumerate(divs):
                titles = div.xpath("./a/img/@title")
                sources = div.xpath("./a/img/@data-original")
                # A tile without an image URL has nothing to download.
                if sources:
                    # Fall back to a positional name when the title is missing.
                    title = titles[0] if titles else "page%s_%s" % (i, index)
                    # Show the current image title after the bar.
                    pbar.set_postfix_str(title)
                    # Each image gets a fresh retry budget in saveImg().
                    err_num = 0
                    await saveImg(sources[0], title)
                pbar.update(1)
        finally:
            # Always release the bar, even if an unexpected error escapes.
            pbar.close()
if __name__ == '__main__':
    # asyncio.run() creates the event loop, runs the crawl to completion and
    # always closes the loop afterwards. Unlike asyncio.wait() on a task list,
    # it re-raises an exception from get_img() instead of silently storing it
    # on the task, so a crashed crawl is visible to the user.
    asyncio.run(get_img(21, 201))
|