不咳嗽的图片他来了 加速版
昨天看到这位@w411024大佬写的抓取脚本,手痒痒想自己加速一下。第一次写python,从安装环境到运行成功查了好久的资料,也参考了大佬们的源码,不好的地方请大佬指教。感谢@w411024提供的源码,地址懂得都懂。
import threading
import random
import time
import re
import requests
import os
from bs4 import BeautifulSoup
# Base URL of the target site (placeholder — the real host is omitted in the post).
url = 'www.xxx.com'
# Shared request headers: a desktop-Chrome User-Agent plus a session cookie so
# the site serves normal pages instead of blocking the scraper.
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Cookie': 'UM_distinctid=174a9141007f29-0697ac1aa46086-3971095d-19fd10-174a9141008cbb; CNZZDATA1279234273=1349150808-1600561283-%7C1600561283'
}
def getHtml(url):
    """GET *url* with the shared browser headers and return the parsed page.

    Returns a BeautifulSoup tree built with the lxml parser.
    """
    # timeout so a dead server cannot hang the scraper forever
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = 'utf-8'
    # (removed the unreachable `pass` that followed the return)
    return BeautifulSoup(response.text, 'lxml')
def getmainname(head):
    """Fetch the six top-level categories from the site's index page.

    Returns a 2-D list ``[[name, href], ...]`` — one pair per category.
    (The forum paste had stripped the ``[i]`` subscripts; reconstructed here.)
    """
    r = requests.get(url + '/home/index.html', headers=head, timeout=10)
    namels = []
    urlls = []
    namelist = []
    try:
        r.raise_for_status()
    except requests.HTTPError:  # was a bare except — catch only the HTTP failure
        print('r.code!=200')
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    # items 1..6 of the <li> list are the six category entries
    mainname = soup.find_all('li')[1:7]
    for tag in mainname:
        link = tag.find('a')
        if link is None:  # skip stray <li> without a link instead of crashing
            continue
        urlls.append(link['href'])
        # .string belongs to the individual tag, not the ResultSet
        namels.append(tag.string)
    for i in range(len(namels)):
        namelist.append([namels[i], urlls[i]])
    return namelist
def entername(namelist):
    """Ask the user to pick one of the six categories and scrape it.

    namelist: ``[[name, href], ...]`` as returned by getmainname().
    Returns ``[[album_name, album_url], ...]`` for the first 20 albums, after
    handing them to getpics() for download.
    """
    subnamelist = []
    subnamels = []
    suburlls = []
    for i in range(6):
        print('{}--{}'.format(i, namelist[i][0]))
    # int() instead of eval(): never eval raw user input
    n = int(input('请输入要查找的对象数字0——5之间(按CTRL+C可强制退出程序):'))
    print('已选择{}'.format(namelist[n][0]))
    print(namelist[n][1])
    aa = url + namelist[n][1]
    soup = getHtml(aa)
    print(aa)
    imgList_name = soup.find_all("div", class_="vodname")
    imgList_url = soup.find_all('div', class_='listpic')
    for tag in imgList_url:
        suburlls.append(url + tag.find('a')['href'])
    for tag in imgList_name:
        # get_text() must be called per tag, not on the ResultSet
        subnamels.append(tag.get_text())
    for i in range(len(subnamels)):
        subnamelist.append([subnamels[i], suburlls[i]])
    subnamelist = subnamelist[0:20]  # cap at 20 albums per run
    getpics(subnamelist)
    return subnamelist
def getpics(subnamelist):
    """Download every album in *subnamelist* into ``D:\\pics\\<album>\\``.

    subnamelist: ``[[album_name, album_url], ...]``.
    Each image is fetched on its own thread via write().
    """
    # (removed unused `global namelist, head` — neither is referenced here)
    if not os.path.exists('D:\\pics\\'):
        os.mkdir('D:\\pics\\')
    print('此分类下共有{}个套图'.format(len(subnamelist)))
    for i in range(len(subnamelist)):  # one iteration per album
        print('\t正在抓取第{}个套图:{}'.format(i + 1, subnamelist[i][0]))
        dirName = subnamelist[i][0]
        # names containing non-word characters cannot be used as folder names
        if re.search(r"\W", dirName) is None:
            print("没有没有真没有特殊字符")
        else:
            dirName = '有特殊符号' + str(i + 1)
        if not os.path.exists('D:\\pics\\' + dirName):
            os.mkdir('D:\\pics\\' + dirName)  # folder named after the album
        # renamed the second result: the original rebinding shadowed the global `url`
        urllist, detail_url = getdetails(subnamelist[i][1])
        name = 'D:\\pics\\' + dirName + '\\'
        for j in range(len(urllist)):
            print('\t正在抓取第{}张'.format(j + 1))
            # one thread per image; index the tag list before reading its src
            threading.Thread(target=write, args=(urllist[j].get('src'), name, j)).start()
        print('此套图抓取完毕')
    print('已抓取全部')
def write(urlls, name, j):
    """Download one image URL and save it as ``<name><j>.jpg``.

    Runs on a worker thread — one call per image.
    """
    # headers must be a keyword argument: the second positional parameter of
    # requests.get is `params`, so the original sent the headers as a query string.
    r = requests.get(urlls, headers=headers, timeout=10)
    # 'wb' instead of 'ab' so re-running never appends a second copy of the
    # JPEG to an existing file; `with` guarantees the handle is closed.
    with open(name + str(j) + '.jpg', 'wb') as f:
        f.write(r.content)
def getdetails(url):
    """Return (list of <img> tags in the album body, the album url itself)."""
    soup = getHtml(url)
    return soup.select('#hellobox > div.newsbody > div.nbodys > img'), url
def main( namelist):
    """Interactive driver: let the user pick a category and scrape it."""
    return entername(namelist)
# Duplicate of the module-level `headers` dict, passed explicitly to
# getmainname(); kept separate in the original script.
head = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Cookie': 'UM_distinctid=174a9141007f29-0697ac1aa46086-3971095d-19fd10-174a9141008cbb; CNZZDATA1279234273=1349150808-1600561283-%7C1600561283'
}
# Category list fetched once at startup; reused for every loop below.
namelist = getmainname(head)
# Interactive main loop: scrape one category per pass until the user quits.
while True:
    main(namelist)
    # Compare the raw text instead of eval()'ing it — eval on untrusted
    # input can execute arbitrary code, and crashes on empty input.
    key_loop = input('是否继续?是请按1:')
    if key_loop != '1':
        print('程序结束')
        break
楼主你的代码我运行不了,自己改了一下
import threading
import random
import time
import re
import requests
import os
from bs4 import BeautifulSoup
# Base URL of the target site (redacted placeholder in the post).
url = 'xxxxxx'
# Shared request headers: desktop-Chrome User-Agent plus a session cookie so
# the site serves normal pages to the scraper.
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Cookie': 'UM_distinctid=174a9141007f29-0697ac1aa46086-3971095d-19fd10-174a9141008cbb; CNZZDATA1279234273=1349150808-1600561283-%7C1600561283'
}
def getHtml(url):
    """GET *url* with the shared browser headers and return the parsed page.

    Returns a BeautifulSoup tree built with the lxml parser.
    """
    # timeout so a dead server cannot hang the scraper forever
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = 'utf-8'
    # (removed the unreachable `pass` that followed the return)
    return BeautifulSoup(response.text, 'lxml')
def getmainname(head):
    """Fetch category names and links from the site's index page.

    Returns ``[namels, urlls]`` — two parallel lists: display names and hrefs
    (this revision of the script keeps them as parallel lists, not pairs).
    """
    r = requests.get(url + '/home/index.html', headers=head, timeout=10)
    namels = []
    urlls = []
    namelist = []
    try:
        r.raise_for_status()
    except requests.HTTPError:  # was a bare except — catch only the HTTP failure
        print('r.code!=200')
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    mainname = soup.find_all('li')  # every <li> on the page, categories included
    for tag in mainname:
        link = tag.find('a')
        if link is None:  # skip stray <li> without a link instead of crashing
            continue
        urlls.append(link['href'])
        # .string belongs to the individual tag, not the ResultSet
        namels.append(tag.string)
    namelist.append(namels)
    namelist.append(urlls)
    return namelist
def entername(namelist):
    """Ask the user to pick one of the six categories and scrape it.

    namelist: ``[names, hrefs]`` parallel lists from getmainname().
    Returns ``[album_names, album_urls]`` (parallel lists) after handing them
    to getpics() for download. (The forum paste had stripped the ``[0][i]``
    subscripts; reconstructed here.)
    """
    subnamelist = []
    subnamels = []
    suburlls = []
    for i in range(6):
        print('{}--{}'.format(i, namelist[0][i]))
    # int() instead of eval(): never eval raw user input
    n = int(input('请输入要查找的对象数字0——5之间(按CTRL+C可强制退出程序):'))
    print('已选择{}'.format(namelist[0][n]))
    aa = url + namelist[1][n]
    soup = getHtml(aa)
    print(aa)
    imgList_name = soup.find_all("div", class_="vodname")
    imgList_url = soup.find_all('div', class_='listpic')
    for tag in imgList_url:
        suburlls.append(url + tag.find('a')['href'])
    for tag in imgList_name:
        subnamels.append(tag.get_text())
    subnamelist.append(subnamels)
    subnamelist.append(suburlls)
    print(subnamelist)
    print(suburlls)
    # (removed the no-op `subnamelist = subnamelist`)
    getpics(subnamelist)
    return subnamelist
def getpics(subnamelist):
    """Download every album in *subnamelist* into ``D:\\pics\\<album>\\``.

    subnamelist: ``[album_names, album_urls]`` parallel lists.
    Each image is fetched on its own thread via write().
    """
    names, urls = subnamelist[0], subnamelist[1]
    if not os.path.exists('D:\\pics\\'):
        os.mkdir('D:\\pics\\')
    # count the albums, not the two container lists
    print('此分类下共有{}个套图'.format(len(names)))
    for i in range(len(names)):  # one iteration per album
        print('\t正在抓取第{}个套图:{}'.format(i + 1, names[i]))
        dirName = names[i]
        # names containing non-word characters cannot be used as folder names
        if re.search(r"\W", dirName) is None:
            print("没有没有真没有特殊字符")
        else:
            dirName = '有特殊符号' + str(i + 1)
        if not os.path.exists('D:\\pics\\' + dirName):
            os.mkdir('D:\\pics\\' + dirName)  # folder named after the album
        # renamed the second result: the original rebinding shadowed the global `url`
        urllist, detail_url = getdetails(urls[i])
        name = 'D:\\pics\\' + dirName + '\\'
        for j in range(len(urllist)):
            print('\t正在抓取第{}张'.format(j + 1))
            # one thread per image; index the tag list before reading its src
            threading.Thread(target=write, args=(urllist[j].get('src'), name, j)).start()
        print('此套图抓取完毕')
    print('已抓取全部')
def write(urlls, name, j):
    """Download one image URL and save it as ``<name><j>.jpg``.

    Runs on a worker thread — one call per image.
    """
    # headers must be a keyword argument: the second positional parameter of
    # requests.get is `params`, so the original sent the headers as a query string.
    r = requests.get(urlls, headers=headers, timeout=10)
    # 'wb' instead of 'ab' so re-running never appends a second copy of the
    # JPEG to an existing file; `with` guarantees the handle is closed.
    with open(name + str(j) + '.jpg', 'wb') as f:
        f.write(r.content)
def getdetails(url):
    """Return (list of <img> tags in the album body, the album url itself)."""
    soup = getHtml(url)
    return soup.select('#hellobox > div.newsbody > div.nbodys > img'), url
def main(namelist):
    """Interactive driver: let the user pick a category and scrape it."""
    return entername(namelist)
# Duplicate of the module-level `headers` dict, passed explicitly to
# getmainname(); kept separate in the original script.
head = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Cookie': 'UM_distinctid=174a9141007f29-0697ac1aa46086-3971095d-19fd10-174a9141008cbb; CNZZDATA1279234273=1349150808-1600561283-%7C1600561283'
}
# Category lists fetched once at startup; reused for every loop below.
namelist = getmainname(head)
# Interactive main loop: scrape one category per pass until the user quits.
while True:
    entername(namelist)
    # Compare the raw text instead of eval()'ing it — eval on untrusted
    # input can execute arbitrary code, and crashes on empty input.
    key_loop = input('是否继续?是请按1:')
    if key_loop != '1':
        print('程序结束')
        break
没发过几次帖子,没放出大佬的帖子,这是大佬的原贴 @w411024
不咳嗽的图片他来了
https://www.52pojie.cn/thread-1270998-1-1.html
(出处: 吾爱破解论坛)
看不懂!也点个赞
看不懂,好像用不了
看不懂!也点个赞
发帖的时候有个“<>”按钮,弹出对话框,选择指定语言,把代码粘贴到里面
我有100多套这个系列,没意思
另外请不要使用多线程,服务器搞崩了没的玩
这个会收到律师函吗
mscsky 发表于 2020-9-22 13:08
另外请不要使用多线程,服务器搞崩了没的玩
好的,谢谢指导
页:
[1]
2