Python爬表情包

Traitor 发表于 2022-3-8 13:40

本帖最后由 Traitor 于 2022-3-8 13:42 编辑

网站： https://fabiaoqing.com/
新手练手写的，一共9000张表情包，没有使用多线程。URL 是先直接写入到文本文件，然后再读出来下载保存的，其实也可以不用这样。写得不好的地方请大家多多交流！
#
import requests
import random
import time
from lxml import etree
import os
import re

# Pool of desktop browser User-Agent strings; one is picked at random per request.
# The version tokens previously carried a stray ".html" (e.g. "Mozilla/5.0.html"),
# which made every string a malformed User-Agent; they are restored here.
user_agent_pc = [
    # Google Chrome
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
    # Firefox
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    # Opera
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    # QQ Browser
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    # Sogou Browser
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    # 360 Browser
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    # UC Browser
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
]

def getUserAgent():
    """Build request headers with a randomly chosen desktop User-Agent.

    Returns:
        dict: A fresh dict with two entries: "User-Agent", picked at random
        from ``user_agent_pc``, and "Host", fixed to "fabiaoqing.com".
    """
    return {
        "User-Agent": random.choice(user_agent_pc),
        "Host": "fabiaoqing.com",
    }

def handleResponse(n, timeout=10):
    """Fetch the HTML of listing page ``n`` after a 2-second pause.

    Args:
        n: Page number substituted into the listing URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the whole crawl.

    Returns:
        str: The decoded response body.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout elapses.
    """
    # Throttle: wait before every page request.
    time.sleep(2)
    url = f"https://fabiaoqing.com/biaoqing/lists/page/{n}.html"
    response = requests.get(url, headers=getUserAgent(), timeout=timeout).text
    print("休眠2秒")
    return response

def createFile():
    """Prepare the working files for a fresh run.

    Deletes a leftover "url.txt" if one exists and creates the output
    directory named by the module-level ``path`` when it is missing.
    """
    url_list = "url.txt"
    # Start from an empty URL list: later writes append to this file.
    if os.path.exists(url_list):
        os.remove(url_list)
    if not os.path.exists(path):
        os.mkdir(path)

def saveImageDate():
    """Extract image URLs from the current page and append them to url.txt.

    Reads the page HTML from the module-level ``data`` variable, collects the
    ``data-original`` attribute of every ``<img>`` under a
    ``div.tagbqppdiv`` element, and appends one URL per line to "url.txt".
    """
    html = etree.HTML(data)
    image_urls = html.xpath('//div[@class="tagbqppdiv"]//img//@data-original')
    if not image_urls:
        # Nothing found on this page; leave url.txt untouched.
        return
    # Open the file once per page instead of once per URL.
    with open("url.txt", "a", encoding="UTF-8") as f:
        for image_url in image_urls:
            print(f"正在写入{image_url}")
            f.write(image_url + "\n")

def downloadImage():
    """Download every image listed in url.txt into the ``path`` directory.

    Each non-empty line of "url.txt" is treated as one image URL. Files are
    saved as ``<index><suffix>``, where ``index`` is the URL's position in
    the list (starting at 0) and ``suffix`` is the image extension found at
    the end of the URL. URLs without a recognised extension are skipped
    instead of aborting the whole run. The function sleeps 2 seconds before
    each download.

    Raises:
        FileNotFoundError: If "url.txt" does not exist.
        requests.exceptions.RequestException: On connection errors or when a
            download exceeds the timeout.
    """
    suffix_pattern = re.compile(r'\.(png|jpeg|jpg|PNG|JPEG|JPG|gif|GIF)$')
    # Read the list inside a context manager so the handle is closed, and
    # strip the trailing newline so it is not sent as part of the URL.
    with open("url.txt", "r", encoding="UTF-8") as f:
        links = [line.strip() for line in f if line.strip()]

    for index, link in enumerate(links):
        match = suffix_pattern.search(link)
        if match is None:
            # No known image extension: skip rather than crash on .group().
            print(f"跳过无法识别后缀的链接：{link}")
            continue
        time.sleep(2)
        print("休眠2秒")
        print(f"正在下载第{index}张图片！")
        headers = getUserAgent()
        # The "Host" header is pinned to fabiaoqing.com; drop it so requests
        # derives Host from the image URL itself.
        # NOTE(review): image URLs may live on another host - confirm.
        headers.pop("Host", None)
        # Headers must be passed by keyword: the second positional argument
        # of requests.get is ``params`` (the query string), not headers.
        content = requests.get(link, headers=headers, timeout=10).content
        with open(os.path.join(path, str(index) + match.group()), "wb") as out:
            out.write(content)
    print("图片下载完成")

if __name__ == '__main__':
    # Listing pages are numbered 1 - 200.
    # Each page holds 45 images.
    page = 200
    path = "images"
    createFile()
    # Iterate 1..page inclusive: range(0, page) requested page 0 and never
    # reached the last page.
    for i in range(1, page + 1):
        # ``data`` is a module-level variable read by saveImageDate().
        data = handleResponse(i)
        saveImageDate()
    print("写入完成")
    downloadImage()

小不点吃鱼 发表于 2022-3-8 13:51

支持一下，最近也在学习

junjie0927 发表于 2022-3-8 15:42

user_agent不错，借用了

FIzz001 发表于 2022-3-8 15:53

网址不错，先拿了

狂笑一君 发表于 2022-3-8 16:02

感谢分享

bluemood4 发表于 2022-3-8 18:05

收下了感谢楼主分享

Wisdom_xiaogui 发表于 2022-3-8 19:42

不错不错，学到了，有空实践一下python

矢志不渝 发表于 2022-3-13 20:55

支持一下，小白也想学习

emmali 发表于 2022-5-16 09:48

感谢分享想试试

页: [1]

吾爱破解 - 52pojie.cn's Archiver

Python爬表情包