吾爱破解 - 52pojie.cn

[Python Repost] The "no coughing" pictures are here, accelerated edition

yuan981667802 posted on 2020-9-22 12:26
Yesterday I saw the scraping script written by @w411024, and my hands got itchy, so I wanted to speed it up myself. This is my first time writing Python; from setting up the environment to getting it running I spent a long time looking things up, and I also referred to other people's source code. Please point out anything done poorly. Thanks to @w411024 for providing the source. As for the address: those who know, know.
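(Environment note: besides the standard library, the script below needs the third-party packages requests, beautifulsoup4, and lxml, all installable with pip.)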

import threading
import re
import requests
import os
from bs4 import BeautifulSoup

url = 'www.xxx.com'  # base URL redacted by the author


headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Cookie': 'UM_distinctid=174a9141007f29-0697ac1aa46086-3971095d-19fd10-174a9141008cbb; CNZZDATA1279234273=1349150808-1600561283-%7C1600561283'
}

def getHtml(url):  # fetch a page and return a parsed BeautifulSoup tree
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    html = BeautifulSoup(response.text, 'lxml')
    return html


def getmainname(head):  # fetch the six top-level categories
    r = requests.get(url + '/home/index.html', headers=head)
    namels = []
    urlls = []
    namelist = []
    try:
        r.raise_for_status()
    except:
        print('r.code!=200')
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    mainname = soup.find_all('li')
    mainname = mainname[1:7]  # mainname is a list of tags

    for i in mainname:
        aurl = i.find('a')['href']
        urlls.append(aurl)
    for i in range(len(mainname)):
        namels.append(mainname.string)

    for i in range(len(mainname)):
        namelist.append([namels, urlls])
    return namelist  # return the six category names and their URLs as a 2-D list


def entername(namelist):  # prompt for the category to browse
    subnamelist = []
    subnamels = []
    suburlls = []
    for i in range(6):
        print('{}--{}'.format(i, namelist[0]))
    n = eval(input('Enter a number from 0 to 5 (Ctrl+C to force quit): '))
    print('Selected {}'.format(namelist[n][0]))
    print(namelist[n][1])
    aa = url + namelist[n][1]
    soup = getHtml(aa)
    print(aa)
    imgList_name = soup.find_all("div", class_="vodname")
    imgList_url = soup.find_all('div', class_='listpic')

    for i in imgList_url:
        suburlls.append(url + i.find('a')['href'])
    for i in range(len(imgList_name)):
        subnamels.append(imgList_name.get_text())
    for i in range(len(imgList_name)):
        subnamelist.append([subnamels, suburlls])
    subnamelist = subnamelist[0:20]
    getpics(subnamelist)
    return subnamelist  # return the name and URL of every gallery in this category as a 2-D list


def getpics(subnamelist):  # walk through the galleries and download them
    global namelist, head

    if not os.path.exists('D:\\pics\\'):
        os.mkdir('D:\\pics\\')
    print('This category has {} galleries'.format(len(subnamelist)))
    for i in range(len(subnamelist)):  # iterate over every gallery
        print('\tFetching gallery {}: {}'.format(i + 1, subnamelist[0]))
        dirName = subnamelist[0]
        test_str = re.search(r"\W", dirName)
        if test_str is None:
            print("No special characters, really none")
        else:
            dirName = 'has_special_chars' + str(i + 1)
        if not os.path.exists('D:\\pics\\' + dirName):
            os.mkdir('D:\\pics\\' + dirName)  # create a folder named after the gallery
        urllist, url = getdetails(subnamelist[1])  # urllist holds the URL of every image in the gallery
        name = 'D:\\pics\\' + dirName + '\\'
        for j in range(len(urllist)):  # index by position
            print('\tFetching image {}'.format(j + 1))
            threading.Thread(target=write, args=(urllist[j].get('src'), name, j)).start()  # one thread per image to speed things up
            if j == len(urllist) - 1:
                print('Gallery finished')
    print('All galleries fetched')


def write(urlls, name, j):  # write a single image to disk
    f = open(name + str(j) + '.jpg', 'ab')
    r = requests.get(urlls, headers)
    f.write(r.content)
    f.close()


def getdetails(url):  # return the URL of every image in a gallery as a list
    imgList_html = getHtml(url)
    urllist = imgList_html.select('#hellobox > div.newsbody > div.nbodys > img')
    return urllist, url


def main(namelist):
    subnamelist = entername(namelist)
    return subnamelist


head = headers  # identical headers, reused for getmainname
namelist = getmainname(head)

while True:
    main(namelist)
    key_loop = eval(input('Continue? Enter 1 for yes: '))
    if not key_loop:
        print('Program finished')
        break


tan19890718 posted on 2020-10-8 00:49
OP, your code wouldn't run for me, so I reworked it a bit:
import threading
import re
import requests
import os
from bs4 import BeautifulSoup

url = 'xxxxxx'  # base URL redacted by the author


headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Cookie': 'UM_distinctid=174a9141007f29-0697ac1aa46086-3971095d-19fd10-174a9141008cbb; CNZZDATA1279234273=1349150808-1600561283-%7C1600561283'
}

def getHtml(url):  # fetch a page and return a parsed BeautifulSoup tree
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    html = BeautifulSoup(response.text, 'lxml')
    return html


def getmainname(head):  # fetch the six top-level categories
    r = requests.get(url + '/home/index.html', headers=head)
    namels = []
    urlls = []
    namelist = []
    try:
        r.raise_for_status()
    except requests.HTTPError:
        print('status code != 200')
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    mainname = soup.find_all('li')
    mainname = mainname[1:7]  # mainname is a list of tags
    for i in mainname:
        aurl = i.find('a')['href']
        urlls.append(aurl)
    for i in range(len(mainname)):
        namels.append(mainname[i].string)

    namelist.append(namels)
    namelist.append(urlls)
    return namelist  # return [names, urls] for the six categories as a 2-D list


def entername(namelist):  # prompt for the category to browse
    subnamels = []
    suburlls = []
    for i in range(6):
        print('{}--{}'.format(i, namelist[0][i]))
    n = int(input('Enter a number from 0 to 5 (Ctrl+C to force quit): '))
    print('Selected {}'.format(namelist[0][n]))

    aa = url + namelist[1][n]
    soup = getHtml(aa)
    print(aa)
    imgList_name = soup.find_all("div", class_="vodname")
    imgList_url = soup.find_all('div', class_='listpic')
    for i in imgList_url:
        suburlls.append(url + i.find('a')['href'])
    for i in imgList_name:
        subnamels.append(i.get_text())
    subnamelist = [subnamels[:20], suburlls[:20]]  # cap at 20 galleries per run
    getpics(subnamelist)
    return subnamelist  # return [names, urls] for the galleries in this category


def getpics(subnamelist):  # walk through the galleries and download them
    if not os.path.exists('D:\\pics\\'):
        os.mkdir('D:\\pics\\')
    print('This category has {} galleries'.format(len(subnamelist[0])))
    for i in range(len(subnamelist[0])):  # iterate over every gallery
        print('\tFetching gallery {}: {}'.format(i + 1, subnamelist[0][i]))
        dirName = subnamelist[0][i]
        test_str = re.search(r"\W", dirName)
        if test_str is None:
            print("No special characters, really none")
        else:
            dirName = 'has_special_chars' + str(i + 1)  # fall back to a safe folder name
        if not os.path.exists('D:\\pics\\' + dirName):
            os.mkdir('D:\\pics\\' + dirName)  # create a folder named after the gallery
        urllist, _ = getdetails(subnamelist[1][i])  # urllist holds the URL of every image in the gallery
        name = 'D:\\pics\\' + dirName + '\\'
        for j in range(len(urllist)):  # index by position
            print('\tFetching image {}'.format(j + 1))
            threading.Thread(target=write, args=(urllist[j].get('src'), name, j)).start()  # one thread per image to speed things up
            if j == len(urllist) - 1:
                print('Gallery finished')
    print('All galleries fetched')


def write(urlls, name, j):  # write a single image to disk
    r = requests.get(urlls, headers=headers)  # headers must be passed by keyword, or they get sent as query params
    with open(name + str(j) + '.jpg', 'wb') as f:  # 'wb' so a re-run overwrites instead of appending
        f.write(r.content)


def getdetails(url):  # return the URL of every image in a gallery as a list
    imgList_html = getHtml(url)
    urllist = imgList_html.select('#hellobox > div.newsbody > div.nbodys > img')
    return urllist, url


def main(namelist):
    subnamelist = entername(namelist)
    return subnamelist


head = headers  # identical headers, reused for getmainname
namelist = getmainname(head)

while True:
    main(namelist)
    key_loop = int(input('Continue? Enter 1 for yes, 0 for no: '))
    if not key_loop:
        print('Program finished')
        break
OP | yuan981667802 posted on 2020-9-22 12:49
I haven't posted much before and forgot to link the original thread. Here it is, by @w411024:
The "no coughing" pictures are here
https://www.52pojie.cn/thread-1270998-1-1.html
(Source: 52pojie.cn forum)
rud posted on 2020-9-22 12:40
xiaojian1999 posted on 2020-9-22 12:47
I can't follow it, and it doesn't seem to work.
撒呀嘎 posted on 2020-9-22 12:56
I don't understand it, but here's a like anyway!
c03xp posted on 2020-9-22 12:57
When posting, there is a "<>" button; it opens a dialog where you pick the language and paste your code in.
mscsky posted on 2020-9-22 13:06
I already have 100+ galleries from this series; it's nothing special.
mscsky posted on 2020-9-22 13:08
Also, please don't use multithreading. If the server gets knocked over, nobody gets to play.
不乖啊 发表于 2020-9-22 13:12
这个会收到律师函吗
OP | yuan981667802 posted on 2020-9-22 13:21
Quote (mscsky, 2020-9-22 13:08): Also, please don't use multithreading. If the server gets knocked over, nobody gets to play.

Got it, thanks for the pointer.
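For what it's worth, a minimal sketch of how the thread-per-image part could be throttled: a small worker pool plus a pause between requests, instead of one thread per image. The fetch_one and fetch_gallery helpers, the pool size of 3, and the 2-second pause are illustrative assumptions, not values from the script above.

import time
from concurrent.futures import ThreadPoolExecutor

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # placeholder; reuse the script's real headers dict here

def fetch_one(img_url, path):
    # Hypothetical per-image downloader: binary write plus a polite pause.
    r = requests.get(img_url, headers=HEADERS, timeout=10)
    with open(path, 'wb') as f:
        f.write(r.content)
    time.sleep(2)  # assumed delay between requests; tune to what the site tolerates

def fetch_gallery(urls, dest_dir):
    # Cap concurrency at 3 workers instead of starting one thread per image;
    # the context manager waits for all downloads to finish before returning.
    with ThreadPoolExecutor(max_workers=3) as pool:
        for j, img_url in enumerate(urls):
            pool.submit(fetch_one, img_url, dest_dir + str(j) + '.jpg')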