Python爬表情包
本帖最后由 Traitor 于 2022-3-8 13:42 编辑
网站:https://fabiaoqing.com/
新手练手写的,一共9000张表情包,没有使用多线程,url是直接写入到文本,然后再读出来保存的,其实也可以不用这样,写的不好的地方大家多多交流!
#
import requests
import random
import time
from lxml import etree
import os
import re
# Pool of desktop User-Agent strings used to make each request look like it
# comes from a different browser.  NOTE: the forum scrape had corrupted every
# entry by splicing ".html" into the version numbers (e.g. "Mozilla/5.0.html",
# "Chrome/39.0.html.2171.71"); the valid strings are restored here.
user_agent_pc = [
    # Chrome
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
    # Firefox
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    # Opera
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    # QQ Browser
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    # Sogou Browser
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    # 360 Browser
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    # UC Browser
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
]


def getUserAgent():
    """Build request headers with a randomly chosen User-Agent.

    Returns:
        dict: ``User-Agent`` picked at random from ``user_agent_pc`` plus the
        fixed ``Host`` header for fabiaoqing.com.
    """
    return {
        "User-Agent": random.choice(user_agent_pc),
        "Host": "fabiaoqing.com",
    }
def handleResponse(n):
    """Fetch list page *n* of fabiaoqing.com and return its HTML text.

    Sleeps 2 seconds before the request to avoid hammering the site.
    """
    time.sleep(2)
    target = f"https://fabiaoqing.com/biaoqing/lists/page/{n}.html"
    page_text = requests.get(target, headers=getUserAgent()).text
    print("休眠2秒")
    return page_text
def createFile(base_dir=None):
    """Reset the scrape workspace: remove a stale url.txt, ensure the image dir.

    Args:
        base_dir: Directory that will hold the downloaded images.  Defaults to
            the module-level ``path`` so the original ``createFile()`` call
            keeps working.
    """
    if base_dir is None:
        base_dir = path  # global set in the __main__ block
    # url.txt is written with mode "a+" (append); a leftover file from a
    # previous run would duplicate every URL, so start clean.
    if os.path.exists("url.txt"):
        os.remove("url.txt")
    # Replaces the exists()/pass/else/mkdir() dance; also creates any missing
    # intermediate directories and is a no-op when the dir already exists.
    os.makedirs(base_dir, exist_ok=True)
def saveImageDate(page_html=None):
    """Extract emoticon image URLs from one list page, append them to url.txt.

    Args:
        page_html: HTML text of a list page.  Defaults to the module-level
            ``data`` (set in the __main__ loop) so the original no-argument
            call keeps working.
    """
    if page_html is None:
        page_html = data  # global set in the __main__ loop
    html = etree.HTML(page_html)
    # The real image URL is in the lazy-load attribute, not in src.
    image_url = html.xpath('//div[@class="tagbqppdiv"]//img//@data-original')
    # Open the file once per page; the original reopened it for every single
    # URL inside the loop.
    with open("url.txt", "a+", encoding="UTF-8") as f:
        for j in image_url:
            print(f"正在写入{j}")
            f.write(j + "\n")
def downloadImage():
    """Read url.txt and download every image into ``path``, numbered from 0.

    Fixes vs. the original:
      * the headers dict was passed as the second POSITIONAL argument of
        ``requests.get`` — that slot is ``params``, so the random User-Agent
        was never actually sent; it is now passed as ``headers=``.
      * url.txt is read via a context manager instead of a leaked handle.
      * each URL is stripped of its trailing newline before the request.
      * a line with no recognized image suffix is skipped instead of raising
        AttributeError on ``.group()``.
    """
    # re.IGNORECASE replaces the explicit upper/lower alternations; the
    # matched suffix keeps its original case.
    regex = re.compile(r'\.(png|jpeg|jpg|gif)$', re.IGNORECASE)
    with open("url.txt", "r", encoding="UTF-8") as url_file:
        urls = url_file.readlines()
    for x, line in enumerate(urls):
        m = line.strip()
        found = regex.search(m)
        if found is None:
            continue  # not an image URL — skip rather than crash
        suffix = found.group()
        time.sleep(2)
        print("休眠2秒")
        print(f"正在下载第{x}张图片!")
        download = requests.get(m, headers=getUserAgent()).content
        with open(os.path.join(path, str(x) + suffix), "wb") as f:
            f.write(download)
    print("图片下载完成")
if __name__ == '__main__':
    # 200 list pages, ~45 images per page (~9000 images total).
    page = 200
    path = "images"
    createFile()
    # Pages are numbered 1..200 (see the comment above and the URL scheme in
    # handleResponse); the original looped range(0, page), requesting a
    # nonexistent page 0 and skipping page 200.
    for i in range(1, page + 1):
        data = handleResponse(i)
        saveImageDate()
    print("写入完成")
    downloadImage()
支持一下,最近也在学习 user_agent不错,借用了 网址不错,先拿了 感谢分享 收下了 感谢楼主分享 不错不错,学到了,有空实践一下python
支持一下,小白也想学习 感谢分享 想试试
页:
[1]