yuan981667802 发表于 2020-9-22 12:26

不咳嗽的图片他来了 加速版

昨天看到这位@w411024大佬写的抓取脚本,手痒痒想自己加速一下,第一次写python,从安装环境到运行成功查了好久的资料也参考了大佬们的源码,不好的地方请大佬指教,感谢@w411024提供的源码   地址懂得都懂

import threading
import random
import time
import re
import requests
import os
from bs4 import BeautifulSoup

# Base URL of the target site (placeholder).  NOTE(review): the value has
# no 'http://' scheme — requests needs a full URL; confirm the real host.
url = 'www.xxx.com'


# Headers attached to every request; the Cookie was copied from a live
# browser session and will stop working once it expires.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Cookie': 'UM_distinctid=174a9141007f29-0697ac1aa46086-3971095d-19fd10-174a9141008cbb; CNZZDATA1279234273=1349150808-1600561283-%7C1600561283'
}

def getHtml(url):
    """Fetch *url* with the module-level ``headers`` and return a parsed tree.

    The body is decoded as UTF-8 regardless of the declared charset.
    Returns a ``BeautifulSoup`` object built with the ``lxml`` parser.
    """
    # timeout added: requests.get without one can hang forever on a
    # stalled connection.  (Also dropped a stray module-level ``pass``.)
    response = requests.get(url, headers=headers, timeout=30)
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'lxml')


def getmainname(head):
    """Scrape the home page and return the six main categories.

    Returns a 2-D list ``[names, urls]`` where ``names[i]`` is the i-th
    category title and ``urls[i]`` its relative link, matching how
    ``entername`` indexes the result.
    """
    r = requests.get(url + '/home/index.html', headers=head, timeout=30)
    try:
        r.raise_for_status()
    except requests.HTTPError:  # was a bare except — catch only HTTP errors
        print('r.code!=200')
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    # <li> tags 1..6 hold the six category links.
    mainname = soup.find_all('li')[1:7]

    urlls = [tag.find('a')['href'] for tag in mainname]
    # was ``mainname.string`` on the whole list (AttributeError) and a
    # syntax-broken append — take each tag's text instead.
    namels = [tag.string for tag in mainname]
    return [namels, urlls]


def entername(namelist):
    """Ask the user to pick a category, scrape it, and start the download.

    ``namelist`` is ``[names, urls]`` as returned by ``getmainname``.
    Returns ``[album_names, album_urls]`` (first 20 albums) after handing
    the list to ``getpics``.
    """
    names, urls = namelist[0], namelist[1]
    for i in range(6):
        print('{}--{}'.format(i, names[i]))
    # int() instead of eval(): eval on raw user input is a code-injection
    # hole and crashes on any non-expression answer.
    n = int(input('请输入要查找的对象数字0——5之间(按CTRL+C可强制退出程序):'))
    # Bug fix: the original ignored ``n`` and always used index 0.
    print('已选择{}'.format(names[n]))
    print(urls[n])
    aa = url + urls[n]
    soup = getHtml(aa)
    print(aa)
    imgList_name = soup.find_all('div', class_='vodname')
    imgList_url = soup.find_all('div', class_='listpic')

    suburlls = [url + tag.find('a')['href'] for tag in imgList_url]
    subnamels = [tag.get_text() for tag in imgList_name]
    # Keep only the first 20 albums, as the original intended.
    subnamelist = [subnamels[:20], suburlls[:20]]
    getpics(subnamelist)
    return subnamelist


def getpics(subnamelist):
    """Download every album in *subnamelist* (``[names, urls]``).

    Creates one folder per album under ``D:\\pics\\`` and spawns one
    thread per image via ``write``.
    """
    names, urls = subnamelist[0], subnamelist[1]
    if not os.path.exists('D:\\pics\\'):
        os.mkdir('D:\\pics\\')
    print('此分类下共有{}个套图'.format(len(names)))
    for i in range(len(names)):  # one iteration per album
        # Bug fix: the original always used element 0 instead of i.
        print('\t正在抓取第{}个套图:{}'.format(i + 1, names[i]))
        dirName = names[i]
        # Windows rejects \W characters in folder names; substitute a
        # numbered placeholder instead of letting mkdir fail.
        if re.search(r"\W", dirName) is None:
            print("没有没有真没有特殊字符")
        else:
            dirName = '有特殊符号' + str(i + 1)
        if not os.path.exists('D:\\pics\\' + dirName):
            os.mkdir('D:\\pics\\' + dirName)
        # ``_`` avoids rebinding (shadowing) the module-level ``url``.
        urllist, _ = getdetails(urls[i])
        name = 'D:\\pics\\' + dirName + '\\'
        for j in range(len(urllist)):
            print('\t正在抓取第{}张'.format(j + 1))
            # Bug fix: .get('src') must be called on each <img> tag,
            # not on the list returned by select().
            threading.Thread(target=write, args=(urllist[j].get('src'), name, j)).start()
            if j == len(urllist) - 1:
                print('此套图抓取完毕')
    print('已抓取全部')


def write(urlls, name, j):
    """Download one image from *urlls* and save it as ``<name><j>.jpg``."""
    # Bug fix: headers must be passed by keyword — the second positional
    # argument of requests.get is ``params``, so the headers (and the
    # session Cookie) were silently never sent.
    r = requests.get(urlls, headers=headers, timeout=30)
    # 'wb' instead of 'ab': append mode duplicated bytes on re-runs;
    # ``with`` guarantees the handle is closed even on error.
    with open(name + str(j) + '.jpg', 'wb') as f:
        f.write(r.content)


def getdetails(url):
    """Return ``(img_tags, url)`` for the album page at *url*.

    The CSS selector targets the image container of the site's album
    pages; each element of the returned list is an ``<img>`` tag.
    """
    soup = getHtml(url)
    return soup.select('#hellobox > div.newsbody > div.nbodys > img'), url


def main(namelist):
    """Run one interactive round: pick a category and download its albums."""
    return entername(namelist)


# Headers passed to getmainname (duplicates the module-level ``headers``;
# kept as-is for interface compatibility).
head = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Cookie': 'UM_distinctid=174a9141007f29-0697ac1aa46086-3971095d-19fd10-174a9141008cbb; CNZZDATA1279234273=1349150808-1600561283-%7C1600561283'
}
namelist = getmainname(head)

while True:
    main(namelist)
    # Compare the raw string: eval() on user input is a code-injection
    # hole and crashes on any answer that is not a Python expression.
    key_loop = input('是否继续?是请按1:')
    if key_loop != '1':
        print('程序结束')
        break

tan19890718 发表于 2020-10-8 00:49

楼主你的代码我运行不了,自己改了一下
import threading
import random
import time
import re
import requests
import os
from bs4 import BeautifulSoup

# Base URL of the target site (placeholder).  NOTE(review): must include
# the 'http://' scheme for requests to accept it; confirm the real host.
url = 'xxxxxx'


# Headers attached to every request; the Cookie was copied from a live
# browser session and will stop working once it expires.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Cookie': 'UM_distinctid=174a9141007f29-0697ac1aa46086-3971095d-19fd10-174a9141008cbb; CNZZDATA1279234273=1349150808-1600561283-%7C1600561283'
}

def getHtml(url):
    """Fetch *url* with the module-level ``headers`` and return a parsed tree.

    The body is decoded as UTF-8 regardless of the declared charset.
    Returns a ``BeautifulSoup`` object built with the ``lxml`` parser.
    """
    # timeout added: requests.get without one can hang forever on a
    # stalled connection.  (Also dropped a stray module-level ``pass``.)
    response = requests.get(url, headers=headers, timeout=30)
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'lxml')


def getmainname(head):
    """Scrape the home page and return the site's categories.

    Returns a 2-D list ``[names, urls]`` where ``names[i]`` is the i-th
    category title and ``urls[i]`` its relative link.  This variant keeps
    every <li> tag (no [1:7] slice).
    """
    r = requests.get(url + '/home/index.html', headers=head, timeout=30)
    try:
        r.raise_for_status()
    except requests.HTTPError:  # was a bare except — catch only HTTP errors
        print('r.code!=200')
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    mainname = soup.find_all('li')  # every <li>; this variant does not slice

    urlls = [tag.find('a')['href'] for tag in mainname]
    # Bug fix: ``mainname.string`` was called on the list itself
    # (AttributeError) — take each tag's text instead.
    namels = [tag.string for tag in mainname]

    return [namels, urlls]


def entername(namelist):
    """Ask the user to pick a category, scrape it, and start the download.

    ``namelist`` is ``[names, urls]`` as returned by ``getmainname``.
    Returns ``[album_names, album_urls]`` after handing the list to
    ``getpics``.
    """
    names, urls = namelist[0], namelist[1]
    for i in range(6):
        # Bug fix: print the i-th name, not the whole 2-D list.
        print('{}--{}'.format(i, names[i]))
    # int() instead of eval(): eval on raw user input is a code-injection
    # hole and crashes on any non-expression answer.
    n = int(input('请输入要查找的对象数字0——5之间(按CTRL+C可强制退出程序):'))
    print('已选择{}'.format(names[n]))

    # Bug fix: ``url + namelist`` concatenated a str with a list
    # (TypeError) — index the chosen category's relative link.
    aa = url + urls[n]
    soup = getHtml(aa)
    print(aa)
    imgList_name = soup.find_all('div', class_='vodname')
    imgList_url = soup.find_all('div', class_='listpic')
    suburlls = [url + tag.find('a')['href'] for tag in imgList_url]
    subnamels = [tag.get_text() for tag in imgList_name]
    subnamelist = [subnamels, suburlls]
    print(subnamelist)
    print(suburlls)
    getpics(subnamelist)
    return subnamelist


def getpics(subnamelist):
    """Download every album in *subnamelist* (``[names, urls]``).

    Creates one folder per album under ``D:\\pics\\`` and spawns one
    thread per image via ``write``.
    """
    names, urls = subnamelist[0], subnamelist[1]
    if not os.path.exists('D:\\pics\\'):
        os.mkdir('D:\\pics\\')
    # Bug fix: len(subnamelist) is always 2 — count the albums instead.
    print('此分类下共有{}个套图'.format(len(names)))
    for i in range(len(names)):  # one iteration per album
        print('\t正在抓取第{}个套图:{}'.format(i + 1, names[i]))
        # Bug fix: dirName was the whole list, making re.search raise.
        dirName = names[i]
        # Windows rejects \W characters in folder names; substitute a
        # numbered placeholder instead of letting mkdir fail.
        if re.search(r"\W", dirName) is None:
            print("没有没有真没有特殊字符")
        else:
            dirName = '有特殊符号' + str(i + 1)
        if not os.path.exists('D:\\pics\\' + dirName):
            os.mkdir('D:\\pics\\' + dirName)
        # ``_`` avoids rebinding (shadowing) the module-level ``url``.
        urllist, _ = getdetails(urls[i])
        name = 'D:\\pics\\' + dirName + '\\'
        for j in range(len(urllist)):
            print('\t正在抓取第{}张'.format(j + 1))
            # Bug fix: .get('src') must be called on each <img> tag,
            # not on the list returned by select().
            threading.Thread(target=write, args=(urllist[j].get('src'), name, j)).start()
            if j == len(urllist) - 1:
                print('此套图抓取完毕')
    print('已抓取全部')


def write(urlls, name, j):
    """Download one image from *urlls* and save it as ``<name><j>.jpg``."""
    # Bug fix: headers must be passed by keyword — the second positional
    # argument of requests.get is ``params``, so the headers (and the
    # session Cookie) were silently never sent.
    r = requests.get(urlls, headers=headers, timeout=30)
    # 'wb' instead of 'ab': append mode duplicated bytes on re-runs;
    # ``with`` guarantees the handle is closed even on error.
    with open(name + str(j) + '.jpg', 'wb') as f:
        f.write(r.content)


def getdetails(url):
    """Return ``(img_tags, url)`` for the album page at *url*.

    The CSS selector targets the image container of the site's album
    pages; each element of the returned list is an ``<img>`` tag.
    """
    soup = getHtml(url)
    return soup.select('#hellobox > div.newsbody > div.nbodys > img'), url


def main(namelist):
    """Run one interactive round: pick a category and download its albums."""
    return entername(namelist)


# Headers passed to getmainname (duplicates the module-level ``headers``;
# kept as-is for interface compatibility).
head = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Cookie': 'UM_distinctid=174a9141007f29-0697ac1aa46086-3971095d-19fd10-174a9141008cbb; CNZZDATA1279234273=1349150808-1600561283-%7C1600561283'
}
namelist = getmainname(head)

while True:
    entername(namelist)
    # Compare the raw string: eval() on user input is a code-injection
    # hole and crashes on any answer that is not a Python expression.
    key_loop = input('是否继续?是请按1:')
    if key_loop != '1':
        print('程序结束')
        break

yuan981667802 发表于 2020-9-22 12:49

没发过几次帖子,没放出大佬的帖子,这是大佬的原贴 @w411024
不咳嗽的图片他来了
https://www.52pojie.cn/thread-1270998-1-1.html
(出处: 吾爱破解论坛)

rud 发表于 2020-9-22 12:40

看不懂!也点个赞

xiaojian1999 发表于 2020-9-22 12:47

看不懂,好像用不了

撒呀嘎 发表于 2020-9-22 12:56

看不懂!也点个赞

c03xp 发表于 2020-9-22 12:57

发帖的时候有个“<>”按钮,弹出对话框,选择指定语言,把代码粘贴到里面

mscsky 发表于 2020-9-22 13:06

我有100多套这个系列,没意思

mscsky 发表于 2020-9-22 13:08

另外请不要使用多线程,服务器搞崩了没的玩

不乖啊 发表于 2020-9-22 13:12

这个会收到律师函吗

yuan981667802 发表于 2020-9-22 13:21

mscsky 发表于 2020-9-22 13:08
另外请不要使用多线程,服务器搞崩了没的玩

好的,谢谢指导
页: [1] 2
查看完整版本: 不咳嗽的图片他来了 加速版