Traitor 发表于 2022-3-8 13:40

Python爬表情包

本帖最后由 Traitor 于 2022-3-8 13:42 编辑

网站 : https://fabiaoqing.com/
新手练手写的,一共9000张表情包,没有使用多线程,url是直接写入到文本,然后再读出来保存的,其实也可以不用这样,写的不好的地方大家多多交流!
#
import requests
import random
import time
from lxml import etree
import os
import re

# Pool of desktop browser User-Agent strings; one is picked at random per
# request.  The version numbers previously carried a stray ".html" fragment
# (e.g. "Mozilla/5.0.html"), which made every entry an invalid User-Agent.
user_agent_pc = [
    # Chrome
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
    # Firefox
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    # Opera
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    # QQ Browser
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    # Sogou Browser
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    # 360 Browser
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    # UC Browser
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
]


def getUserAgent():
    """Build HTTP request headers with a randomly chosen User-Agent.

    Returns:
        dict: a fresh mapping with a "User-Agent" picked at random from
        ``user_agent_pc`` and a fixed "Host" of "fabiaoqing.com".
    """
    return {
        "User-Agent": random.choice(user_agent_pc),
        "Host": "fabiaoqing.com",
    }


def handleResponse(n, timeout=10):
    """Fetch one listing page of the site and return its HTML text.

    Sleeps 2 seconds before the request to throttle the crawl.

    Args:
        n: page number substituted into the listing URL.
        timeout: seconds to wait for the server before giving up.  Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        str: the decoded body of the response.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout elapses.
    """
    time.sleep(2)
    url = f"https://fabiaoqing.com/biaoqing/lists/page/{n}.html"
    response = requests.get(url, headers=getUserAgent(), timeout=timeout).text
    print("休眠2秒")
    return response


def createFile():
    """Reset the working files for a fresh run.

    Deletes a leftover "url.txt" from a previous run (URLs are appended to
    it, so stale content would be downloaded again) and makes sure the
    output directory named by the module-level ``path`` exists.
    """
    # EAFP: just try to remove the file; a missing file is not an error.
    try:
        os.remove("url.txt")
    except FileNotFoundError:
        pass
    # makedirs also creates missing parent directories and, with
    # exist_ok=True, is a no-op when the directory is already there.
    os.makedirs(path, exist_ok=True)


def saveImageDate():
    """Extract image URLs from the current page and append them to url.txt.

    Parses the module-level ``data`` (HTML text of one listing page), reads
    the ``data-original`` attribute of every ``<img>`` under a
    ``div.tagbqppdiv`` element, and appends one URL per line to "url.txt".
    """
    html = etree.HTML(data)
    image_url = html.xpath('//div[@class="tagbqppdiv"]//img//@data-original')
    if not image_url:
        # Nothing matched on this page; leave url.txt untouched.
        return
    # Open the file once per page instead of once per URL.
    with open("url.txt", "a", encoding="UTF-8") as f:
        for j in image_url:
            print(f"正在写入{j}")
            f.write(j + "\n")


def downloadImage():
    """Download every image URL listed in url.txt into the output directory.

    Reads "url.txt" (one URL per line, blank lines ignored), waits 2 seconds
    before each request, and saves each image as ``<index><suffix>`` inside
    the directory named by the module-level ``path``.  A URL whose download
    fails is reported and skipped so one bad link does not abort the run.
    """
    # Close the file promptly and strip the trailing newline that
    # readlines() would otherwise leave inside every URL.
    with open("url.txt", "r", encoding="UTF-8") as url_file:
        urls = [line.strip() for line in url_file if line.strip()]

    regex = re.compile(r'\.(png|jpeg|jpg|PNG|JPEG|JPG|gif|GIF)$')
    for x, m in enumerate(urls):
        match = regex.search(m)
        # Fall back to ".jpg" when the URL has no recognised extension so
        # the image is still saved instead of raising AttributeError.
        suffix = match.group() if match else ".jpg"
        time.sleep(2)
        print("休眠2秒")
        print(f"正在下载第{x}张图片!")
        headers = getUserAgent()
        # The fixed Host header targets the listing site; image URLs may be
        # served from a different host, so let requests derive it.
        headers.pop("Host", None)
        try:
            # headers must be passed by keyword: the second positional
            # argument of requests.get is `params` (the query string).
            response = requests.get(m, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as err:
            print(f"下载失败,已跳过: {m} ({err})")
            continue
        with open(os.path.join(path, str(x) + suffix), "wb") as f:
            f.write(response.content)
    if urls:
        print("图片下载完成")


if __name__ == '__main__':
    # The site lists pages 1 - 200, with 45 images per page.
    page = 200
    # Output directory; read as a module-level global by createFile() and
    # downloadImage().
    path = "images"
    createFile()
    # Page numbers are 1-based, so iterate 1..page inclusive; range(0, page)
    # requested page 0 and never reached the last page.
    for i in range(1, page + 1):
        # `data` is a module-level global consumed by saveImageDate().
        data = handleResponse(i)
        saveImageDate()
    print("写入完成")
    downloadImage()

小不点吃鱼 发表于 2022-3-8 13:51

支持一下,最近也在学习

junjie0927 发表于 2022-3-8 15:42

user_agent不错,借用了

FIzz001 发表于 2022-3-8 15:53

网址不错,先拿了

狂笑一君 发表于 2022-3-8 16:02

感谢分享

bluemood4 发表于 2022-3-8 18:05

收下了 感谢楼主分享

Wisdom_xiaogui 发表于 2022-3-8 19:42

不错不错,学到了,有空实践一下python

矢志不渝 发表于 2022-3-13 20:55


支持一下,小白也想学习

emmali 发表于 2022-5-16 09:48

感谢分享   想试试
页: [1]
查看完整版本: Python爬表情包