Python crawler: batch-download PPT templates with multiprocessing [updated 2020/11/4]
Last edited by We. on 2020-11-6 22:13.
Changelog
# 2020/11/6
# Mystery solved: the code is fine; the problem was with the resource links themselves.
#
# 2020/11/4
# The previous version only ran four workers and only got inter-process communication working, so there was still room to improve the speed.
# The final download list is now handed to a process pool so the downloads run concurrently, which is much faster. The code works, but there are still issues to iron out; I'll keep digging into multiprocessing until it's solid.
# Renamed the functions and added comments to make things easier to follow.
# As usual, the improved code is at the end.
#
# 2020/10/28
# After grinding on it after work yesterday, the improved version is here! Code and screenshots are attached at the end!
# Downloads now run with multiprocessing (using a Queue), and the speed improved quite a bit. I think it can be even faster, so I'll keep studying.
# Reworked some of the logic and removed a few unnecessary steps.
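In short, the final layout (full code at the end) is: one process fills a Queue with category pages, downstream processes turn those into (name, url) entries, and the finished list goes to a multiprocessing Pool for the actual downloads. A minimal sketch of just that shape, with placeholder stage functions (produce, consume and download_one are illustrative names, not taken from the code below):

from multiprocessing import Pool, Process, Queue

def produce(q):
    # Stage 1: push work items (here just one category page URL) into the queue
    q.put('http://www.ypppt.com/moban/shuzhi/')

def consume(q, out_q):
    # Stage 2: drain the queue and emit (name, url) download entries
    while not q.empty():
        page = q.get()
        out_q.put(('placeholder-name', page))  # the real code parses the page here

def download_one(entry):
    # Stage 3: Pool worker, one call per (name, url) tuple
    name, url = entry
    print('downloading', name, 'from', url)

if __name__ == '__main__':
    q, out_q = Queue(), Queue()
    p = Process(target=produce, args=(q,))
    p.start(); p.join()                      # simplest case: producer finishes first
    c = Process(target=consume, args=(q, out_q))
    c.start(); c.join()
    entries = []
    while not out_q.empty():
        entries.append(out_q.get())
    with Pool(4) as pool:                    # hand the final download list to a process pool
        pool.map(download_one, entries)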
=================================================
PPT template site: http://www.ypppt.com/moban/ (a free PPT template site).
A couple of days ago I was downloading job-report (述职) templates for a friend; if you need a different category, just change the URL accordingly.
Corrections and suggestions welcome.
This is the first, single-threaded version.
Code attached:
import time
import requests
from scrapy.selector import Selector
def RequestsDX(url):
    # Send a GET request with browser-like headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.51',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3;q=0.9'
    }
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response
def SelectorDX(response):
    # Wrap the response text in a scrapy Selector
    selector = Selector(text=response.text)
    return selector
def url_page():
    # Find how many pages exist under this template category
    ppt_url = 'http://www.ypppt.com/moban/shuzhi/'
    url_list = [ppt_url]
    for i in range(2, 1000):
        ppt_url2 = 'http://www.ypppt.com/moban/shuzhi/list-%s.html' % i
        response = RequestsDX(ppt_url2)
        if response.status_code == 200:
            url_list.append(ppt_url2)
        else:
            break
    return url_list
def url_page_ppt_id(selector):
    # Get the id of each PPT template on the page, to be combined into the download link later
    id = []
    page_id_sum = len(selector.xpath('/html/body/div/ul/li'))
    for i in range(1, page_id_sum):
        tag = selector.xpath('/html/body/div/ul/li[%s]/a' % i).extract_first()
        # A clumsy bit of string filtering to pull out the PPT id (and the variable names are a mess):
        # the id is the part of the href between the last '/' and '.html'
        y = tag.split('<')[1]
        a = y.find('.html')
        b = y.rfind('/')
        # print(y)
        id.append(y[b + 1:a])
    return id
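# Note: as the comment above admits, the split/find filtering is clumsy. If the
# anchor's href really ends in "<aid>.html" (an assumption about the page markup),
# a regex expresses the same idea more directly. extract_aid is an illustrative
# helper, not part of the original script.
import re

def extract_aid(tag):
    # Pull the numeric id out of an anchor whose href ends in "<aid>.html",
    # e.g. '<a href="/article/2020/1234.html">...' -> '1234'
    match = re.search(r'/(\d+)\.html', tag)
    return match.group(1) if match else None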
def download_page():
    # Build the download-page link for every template
    url_list = url_page()
    url = 'http://www.ypppt.com/p/d.php?aid='
    url_download = []
    for page in url_list:
        response = RequestsDX(page)
        selector = SelectorDX(response)
        id = url_page_ppt_id(selector)
        for aid in id:
            url_download.append(url + aid)
    return url_download
def download_url():
    # Parse each download page for the real download link and the file name
    url_list = []
    name_list = []
    url = download_page()
    xpath = '/html/body/div/div/ul/li/a'
    filename_xpath = '/html/body/div/div/div/div/h1'
    for i in url:
        response = RequestsDX(i)
        selector = SelectorDX(response)
        url_download = selector.xpath(xpath).extract_first()
        file_name = selector.xpath(filename_xpath).extract_first()
        # strip the ' - 下载页' (download page) suffix and the <h1> tags from the title
        name1 = file_name.replace(' - 下载页', '')
        name2 = name1.replace('<h1>', '')
        name = name2.replace('</h1>', '')
        # the download link sits between the first and last double quote of the <a> tag
        a = url_download.find('"')
        b = url_download.rfind('"')
        url_list.append(url_download[a + 1:b])
        name_list.append(name)
    download_list = [url_list, name_list]
    return download_list
def download():
    # Download every file on the list
    start_time = time.time()
    download_list = download_url()
    url = download_list[0]
    name = download_list[1]
    for i in range(len(url)):
        response = RequestsDX(url[i])
        print('=' * 100)
        print('Downloading', name[i])
        with open(r'D:\ppt\%s.zip' % name[i], 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                f.write(chunk)
        print('Download complete')
    end_time = time.time()
    cost = end_time - start_time
    print(cost)


download()
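One note on the download step: iter_content only avoids loading the whole file into memory if the request is made with stream=True; as written, requests.get has already fetched the entire zip before the loop runs. A small standalone sketch of a streaming download helper (download_file is a name introduced here for illustration, not from the original code):

import requests

def download_file(url, path):
    # Stream the response so large templates are written out in chunks
    # instead of being loaded fully into memory first.
    response = requests.get(url, stream=True)
    with open(path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)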
Finally, a screenshot of the result:
===========================================
Improved code:
from time import perf_counter
import requests
from scrapy.selector import Selector
from multiprocessing import Queue,Process,Pool
def RequestsDX(url):  # Wrap requests.get so it can be reused below
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.51',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3;q=0.9'
    }
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    return response
def SelectorDX(url):  # Fetch a url and wrap it in a Selector for reuse below
    response = RequestsDX(url)
    selector = Selector(text=response.text)
    return selector
def category_page(q):  # Collect the link of every page under the category
    q.put('http://www.ypppt.com/moban/shuzhi/')
    args = list(range(2, 100))
    for i in args:
        ppt_url2 = 'http://www.ypppt.com/moban/shuzhi/list-%s.html' % i
        response = RequestsDX(ppt_url2)
        if response.status_code == 200:
            print(ppt_url2)
            q.put(ppt_url2)
        else:
            break
def download_page_parse(q, url_q):  # Parse the aid on each page and build each PPT's download-page link
    while q.empty() is not True:
        selector = SelectorDX(q.get())
        page_id_sum = len(selector.xpath('/html/body/div/ul/li'))
        for i in range(1, page_id_sum):
            tag = selector.xpath('/html/body/div/ul/li[%s]/a' % i).extract_first()
            # pull the numeric aid out of the anchor's href (the part between the last '/' and '.html')
            y = tag.split('<')[1]
            a = y.find('.html')
            b = y.rfind('/')
            # print(y)
            id_url = 'http://www.ypppt.com/p/d.php?aid=' + y[b + 1:a]
            print(id_url)
            url_q.put(id_url)
def download_url_parse(url_q, download_q):  # Parse each download page for the real download link and the PPT name
    download_list = []
    while url_q.empty() is not True:
        selector = SelectorDX(url_q.get())
        xpath = '/html/body/div/div/ul/li/a'
        filename_xpath = '/html/body/div/div/div/div/h1'
        url_download = selector.xpath(xpath).extract_first()
        file_name = selector.xpath(filename_xpath).extract_first()
        # strip the ' - 下载页' (download page) suffix and the <h1> tags from the title
        name1 = file_name.replace(' - 下载页', '')
        name2 = name1.replace('<h1>', '')
        name = name2.replace('</h1>', '')
        # the download link sits between the first and last double quote of the <a> tag
        a = url_download.find('"')
        b = url_download.rfind('"')
        download_list.append((name, url_download[a + 1:b]))
    print(download_list)
    download_q.put(download_list)
def down_load(download_list):  # Download one (name, url) entry
    response = RequestsDX(download_list[1])
    print('=' * 100)
    print('Downloading', download_list[0])
    with open(r'D:\ppt\%s.zip' % download_list[0], 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            f.write(chunk)
    print('Download complete')
if __name__ == '__main__':
    t = perf_counter()
    q = Queue()
    url_q = Queue()
    download_q = Queue()
    p1 = Process(target=category_page, args=(q,))
    p2 = Process(target=download_page_parse, args=(q, url_q,))
    p3 = Process(target=download_url_parse, args=(url_q, download_q,))
    p_l = [p1, p2, p3]
    for i in p_l:
        i.start()
        i.join()
    download_list = download_q.get()
    pool = Pool(10)
    pool.map(down_load, download_list)
    t1 = perf_counter()
    cost = t1 - t
    print(cost, 's')
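As admitted in the changelog, this version still has a rough edge: calling i.start() and i.join() inside the same loop runs the three stages strictly one after another rather than overlapping them. Starting them all first only works if the "while not q.empty()" loops get a different stop condition, because a downstream queue can be momentarily empty before the upstream stage has finished. A minimal sketch of one common fix, using a sentinel value to mark the end of work (stage, SENTINEL, and the queue names here are illustrative, not from the code above):

from multiprocessing import Process, Queue

SENTINEL = None  # end-of-work marker; not part of the original code

def stage(in_q, out_q):
    # Generic pipeline stage: work until the sentinel arrives, then pass it on
    # so the downstream stage also knows when to stop.
    while True:
        item = in_q.get()
        if item is SENTINEL:
            out_q.put(SENTINEL)
            break
        out_q.put(item)  # the real code would transform the item here

if __name__ == '__main__':
    q1, q2, q3 = Queue(), Queue(), Queue()
    procs = [Process(target=stage, args=(q1, q2)),
             Process(target=stage, args=(q2, q3))]
    for p in procs:          # start every stage first...
        p.start()
    for x in ['item-1', 'item-2', SENTINEL]:
        q1.put(x)            # feed the pipeline, ending with the sentinel
    for p in procs:          # ...then wait for all stages to finish
        p.join()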
Screenshot after the second round of improvements:
Four workers with the Queue added; the speed is noticeably better.
Screenshot of the final version:
10 processes, done in 48 seconds!
One final serious note: I scraped this PPT site purely so we can learn and share; please don't do anything malicious to their website.

We. posted on 2020-10-28 15:43:
"I still don't quite follow this one, could you give me some pointers? Walk me through the idea."

I've added some comments for you; you can use it as a multithreading template.
import queue
import threading
# Parser thread class
class Parse(threading.Thread):
    def __init__(self, number, data_list, req_thread):
        super(Parse, self).__init__()
        self.number = number
        self.data_list = data_list
        self.req_thread = req_thread
        self.is_parse = True  # whether to keep pulling data from the data queue

    def run(self):
        print('Starting parser thread %d' % self.number)
        while True:
            # Stop condition: only exit once every crawl thread has finished
            # (for-else: the else branch runs when no crawl thread is still alive)
            # and the data queue is empty.
            for t in self.req_thread:
                if t.is_alive():
                    break
            else:
                if self.data_list.qsize() == 0:
                    self.is_parse = False
            if self.is_parse:  # parse
                try:
                    data = self.data_list.get(timeout=3)
                except Exception as e:
                    data = None
                if data is not None:
                    self.parse(data)
            else:
                break
        print('Parser thread %d exiting' % self.number)

    # Page-parsing function
    def parse(self, data):
        # download the file here
        pass
# Crawler thread class
class Crawl(threading.Thread):
    def __init__(self, number, req_list, data_list):
        super(Crawl, self).__init__()
        self.number = number
        self.req_list = req_list
        self.data_list = data_list
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'
        }

    def run(self):
        print('Starting crawler thread %d' % self.number)
        while self.req_list.qsize() > 0:
            url = self.req_list.get()
            print('Thread %d crawling: %s' % (self.number, url))
            # time.sleep(random.randint(1, 3))
            self.data_list.put("fill in the detail-page link here")  # append to the data queue
def main():
    concurrent = 3
    conparse = 3
    # create the request queue
    req_list = queue.Queue()
    # create the data queue
    data_list = queue.Queue()
    # fill the request queue
    for i in range(1, 13 + 1):
        base_url = 'https://www.baidu.com/{}.html'.format(i)
        req_list.put(base_url)
    # spawn N crawler threads
    req_thread = []
    for i in range(concurrent):
        t = Crawl(i + 1, req_list, data_list)  # create a crawler thread
        t.start()
        req_thread.append(t)
    # spawn N parser threads
    parse_thread = []
    for i in range(conparse):
        t = Parse(i + 1, data_list, req_thread)  # create a parser thread
        t.start()
        parse_thread.append(t)
    for t in req_thread:
        t.join()
    for t in parse_thread:
        t.join()


if __name__ == '__main__':
    main()
Here's a full multithreaded version for you:
import os
import queue
import threading
import requests
from lxml import etree
class Parse(threading.Thread):
    def __init__(self, number, data_list, req_thread):
        super(Parse, self).__init__()
        self.number = number
        self.data_list = data_list
        self.req_thread = req_thread
        self.is_parse = True
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'
        }

    def run(self):
        while True:
            # for-else: the else branch runs only when no crawl thread is still alive
            for t in self.req_thread:
                if t.is_alive():
                    break
            else:
                if self.data_list.qsize() == 0:
                    self.is_parse = False
            if self.is_parse:  # parse
                try:
                    item = self.data_list.get(timeout=3)
                except Exception as e:
                    item = None
                if item is not None:
                    self.parse(item)
            else:
                break

    def parse(self, item):
        id = item["id"]
        title = item["title"]
        url = 'http://www.ypppt.com/p/d.php?aid={}'.format(id)
        resp = requests.get(url, headers=self.headers)
        parser = etree.HTML(resp.text)
        down_url = parser.xpath("//ul[@class='down clear']/li/a/@href")
        print(down_url)
        file_name = "down/" + title + ".rar"
        with open(file_name, 'wb') as f:
            f.write(requests.get(down_url[0]).content)
        print('Download complete:', file_name)
class Crawl(threading.Thread):
    def __init__(self, number, req_list, data_list):
        super(Crawl, self).__init__()
        self.number = number
        self.req_list = req_list
        self.data_list = data_list
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'
        }

    def run(self):
        while self.req_list.qsize() > 0:
            url = self.req_list.get()
            resp = requests.get(url, headers=self.headers)
            resp.encoding = "utf-8"
            parser = etree.HTML(resp.text)
            a_list = parser.xpath("//ul[@class='posts clear']/li/a[@class='p-title']")
            for a in a_list:
                link = a.xpath("./@href")[0]
                # the detail-page href ends in "<aid>.html"; strip the path and extension to get the aid
                id = link.split('/')[-1].split('.')[0]
                title = a.xpath("./text()")[0]
                item = {
                    "id": id,
                    "title": title
                }
                print(item)
                self.data_list.put(item)
def main():
    path = "down"
    if not os.path.exists(path):
        os.makedirs(path)
    concurrent = 2
    conparse = 10
    req_list = queue.Queue()
    data_list = queue.Queue()
    req_list.put('http://www.ypppt.com/moban/shuzhi/')
    all_page = 3
    for i in range(2, all_page + 1):
        ppt_url = 'http://www.ypppt.com/moban/shuzhi/list-%s.html' % i
        req_list.put(ppt_url)
    req_thread = []
    for i in range(concurrent):
        t = Crawl(i + 1, req_list, data_list)
        t.start()
        req_thread.append(t)
    parse_thread = []
    for i in range(conparse):
        t = Parse(i + 1, data_list, req_thread)
        t.start()
        parse_thread.append(t)
    for t in req_thread:
        t.join()
    for t in parse_thread:
        t.join()


if __name__ == '__main__':
    main()
The final download step could be done with multiprocessing; a plain for loop downloading one file at a time is too slow. Use the multiprocessing module.

Support!!!!

Good stuff, thanks for sharing.

fanvalen posted on 2020-10-27 12:17:
"The final download step could be done with multiprocessing; a plain for loop downloading one file at a time is too slow. Use the multiprocessing module."

May I ask why not use multithreading instead? Also, the only multiprocessing approach I can think of is splitting the work into several chunks and running them concurrently. Is there a way to create a task pool instead, where the processes just go and grab tasks themselves? I haven't figured out how to implement that yet.
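On the "task pool" question above: multiprocessing.Pool already works that way. The worker processes pull tasks from an internal queue as they become free, so the task list doesn't have to be split into fixed chunks by hand. A minimal sketch (download_one and the URL list are placeholders, not from the original code):

from multiprocessing import Pool

def download_one(url):
    # Placeholder worker: each idle process pulls the next url from the pool's
    # internal task queue on its own, no manual chunking needed.
    print('processing', url)
    return url

if __name__ == '__main__':
    tasks = ['http://www.ypppt.com/p/d.php?aid=%d' % i for i in range(1, 21)]
    with Pool(4) as pool:
        # imap_unordered hands tasks out one at a time and yields results
        # in whatever order the workers finish them.
        for result in pool.imap_unordered(download_one, tasks):
            pass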
Support, support!

xilidexiao posted on 2020-10-27 14:32:
"Here's a full multithreaded version for you:
import os
import queue
..."

Thanks a lot. I'm a bit lost reading it; I need to brush up on multithreading and classes.
xilidexiao posted on 2020-10-27 14:32:
"Here's a full multithreaded version for you:
import os
import queue
..."

I still don't quite follow this one, could you give me some pointers? Walk me through the idea.

Can it also crawl some PPT course materials?