Changelog
[Python]
# 2020/11/6
# Case closed: the code was fine, the problem was with the resource links.
#
# 2020/11/4
# The previous version only ran four processes and merely implemented inter-process communication, so there was still room to improve the speed.
# The final download list is now handed to a process pool, so the downloads run concurrently across processes and finish noticeably faster. The code works, but a few issues remain; I'll keep digging into multiprocessing and polish it.
# To make it easier to follow, I renamed the functions and added comments.
# As usual, the improved code is attached at the end.
#
# 2020/10/28
# After grinding at it after work yesterday, the improved version is here! The code and result screenshot are attached at the end!
# It now downloads with multiple processes (communicating through a Queue), and the speed improved quite a bit. I think it can still be faster, so I'll keep looking into it.
# The logic was tweaked a little and some unnecessary steps were removed.
=================================================
PPT template site: http://www.ypppt.com/moban/ (a free PPT template site).
A couple of days ago I helped a friend download work-report templates; to grab any other category, just change the category part of the URL.
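Since only the category segment of the URL changes, a small helper can build the page URLs for any category. This is just a sketch, not part of the original script: the helper name and the slug parameter are my own, and it assumes every category follows the same list-<n>.html paging pattern as the shuzhi category used in the code below.
[Python]
# Hypothetical helper: build a listing-page URL for any category slug,
# assuming the same paging pattern as /moban/shuzhi/.
def category_page_url(slug, page=1):
    base = 'http://www.ypppt.com/moban/%s/' % slug
    return base if page == 1 else base + 'list-%s.html' % page

# Example: category_page_url('shuzhi', 3) -> 'http://www.ypppt.com/moban/shuzhi/list-3.html'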
Corrections and suggestions are welcome.
This is the first, single-threaded version.
Code attached:
[Python]
import time
import requests
from scrapy.selector import Selector

def RequestsDX(url):
    # Send a GET request with browser-like headers and return the response
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.51',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3;q=0.9'
    }
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response
def SelectorDX(response):
    # Build a scrapy Selector from the response body
    selector = Selector(text=response.text)
    return selector

def url_page():
    # Find out how many listing pages the category has
    ppt_url = 'http://www.ypppt.com/moban/shuzhi/'
    url_list = [ppt_url]
    for i in range(2, 1000):
        ppt_url2 = 'http://www.ypppt.com/moban/shuzhi/list-%s.html' % i
        response = RequestsDX(ppt_url2)
        if response.status_code == 200:
            url_list.append(ppt_url2)
        else:
            break
    return url_list
def url_page_ppt_id(selector):
    # Collect the template ids on one listing page and build the list of ids
    id = []
    page_id_sum = len(selector.xpath('/html/body/div[2]/ul/li'))
    for i in range(1, page_id_sum):
        tag = selector.xpath('/html/body/div[2]/ul/li[%s]/a' % i).extract_first()
        # Clumsy string filtering to pull the id out of the <a> tag (the variable names are a mess)
        y = tag.split('<')
        a = y[1].find('.html')
        b = y[1].rfind('/')
        # print(y[1][b+1:a])
        id.append(y[1][b + 1:a])
    return id
def download_page():
    # Build the download-page URL for every template
    url_list = url_page()
    url = 'http://www.ypppt.com/p/d.php?aid='
    url_download = []
    for page_url in url_list:
        response = RequestsDX(page_url)
        selector = SelectorDX(response)
        id = url_page_ppt_id(selector)
        for aid in id:
            url_download.append(url + aid)
    return url_download
def download_url():
    # Parse each download page for the real file link and the template name
    url_list = []
    name_list = []
    url = download_page()
    xpath = '/html/body/div/div/ul/li[1]/a'
    filename_xpath = '/html/body/div/div/div[2]/div[2]/h1'
    for i in url:
        response = RequestsDX(i)
        selector = SelectorDX(response)
        url_download = selector.xpath(xpath).extract_first()
        file_name = selector.xpath(filename_xpath).extract_first()
        name1 = file_name.replace(' - 下载页', '')
        name2 = name1.replace('<h1>', '')
        name = name2.replace('</h1>', '')
        a = url_download.find('"')
        b = url_download.rfind('"')
        url_list.append(url_download[a + 1:b])
        name_list.append(name)
    download_list = [name_list, url_list]
    return download_list
def download():
    # Download every file and time the whole run
    start_time = time.time()
    download_list = download_url()
    url = download_list[1]
    name = download_list[0]
    for i in range(len(url)):
        response = RequestsDX(url[i])
        print('=' * 100)
        print('Downloading', name[i])
        with open(r'D:\ppt\%s.zip' % name[i], 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                f.write(chunk)
        print('Download finished')
    end_time = time.time()
    cost = end_time - start_time
    print(cost)

download()
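The id and link extraction above slices raw tag strings, which the post itself calls a clumsy workaround. A Selector can also return attribute values directly with @href and text(), which avoids the string splitting. The sketch below is only an illustration: it reuses the same XPath paths as the script and assumes each listing href ends in <id>.html, which I have not re-verified against the live site.
[Python]
from scrapy.selector import Selector

def ppt_ids_from_listing(html_text):
    # Grab the first anchor's href in every listing item and keep the part
    # between the last '/' and '.html' (assumed to be the template id).
    selector = Selector(text=html_text)
    hrefs = selector.xpath('/html/body/div[2]/ul/li/a[1]/@href').extract()
    return [href.rsplit('/', 1)[-1].replace('.html', '') for href in hrefs]

def download_link_and_name(html_text):
    # Read the file link and the page title without slicing quotes out of the tag string.
    selector = Selector(text=html_text)
    link = selector.xpath('/html/body/div/div/ul/li[1]/a/@href').extract_first()
    title = selector.xpath('/html/body/div/div/div[2]/div[2]/h1/text()').extract_first()
    # Strip the ' - 下载页' suffix the site appends to download-page titles.
    name = title.replace(' - 下载页', '') if title else title
    return name, link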
Finally, the result screenshot:
===========================================
Improved code:
[Python]
from time import perf_counter
import requests
from scrapy.selector import Selector
from multiprocessing import Queue, Process, Pool

def RequestsDX(url):
    # Send a GET request with browser-like headers and return the response
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.51',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3;q=0.9'
    }
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    return response

def SelectorDX(url):
    # Fetch the page and return a scrapy Selector for it
    response = RequestsDX(url)
    selector = Selector(text=response.text)
    return selector
def category_page(q):
    # Producer: push the URL of every listing page in the category onto q
    q.put('http://www.ypppt.com/moban/shuzhi/')
    args = list(range(2, 100))
    for i in args:
        ppt_url2 = 'http://www.ypppt.com/moban/shuzhi/list-%s.html' % i
        response = RequestsDX(ppt_url2)
        if response.status_code == 200:
            print(ppt_url2)
            q.put(ppt_url2)
        else:
            break
def download_page_parse(q, url_q):
    # Parse each listing page for template aids and push the download-page URLs onto url_q
    while q.empty() is not True:
        selector = SelectorDX(q.get())
        page_id_sum = len(selector.xpath('/html/body/div[2]/ul/li'))
        for i in range(1, page_id_sum):
            tag = selector.xpath('/html/body/div[2]/ul/li[%s]/a' % i).extract_first()
            y = tag.split('<')
            a = y[1].find('.html')
            b = y[1].rfind('/')
            # print(y[1][b+1:a])
            id_url = 'http://www.ypppt.com/p/d.php?aid=' + y[1][b + 1:a]
            print(id_url)
            url_q.put(id_url)
def download_url_parse(url_q, download_q):
    # Parse each download page for the real file link and the template name
    download_list = []
    while url_q.empty() is not True:
        selector = SelectorDX(url_q.get())
        xpath = '/html/body/div/div/ul/li[1]/a'
        filename_xpath = '/html/body/div/div/div[2]/div[2]/h1'
        url_download = selector.xpath(xpath).extract_first()
        file_name = selector.xpath(filename_xpath).extract_first()
        name1 = file_name.replace(' - 下载页', '')
        name2 = name1.replace('<h1>', '')
        name = name2.replace('</h1>', '')
        a = url_download.find('"')
        b = url_download.rfind('"')
        download_list.append((name, url_download[a + 1:b]))
    print(download_list)
    download_q.put(download_list)
def down_load(download_list):
    # Download a single (name, url) pair
    response = RequestsDX(download_list[1])
    print('=' * 100)
    print('Downloading', download_list[0])
    with open(r'D:\ppt\%s.zip' % download_list[0], 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            f.write(chunk)
    print('Download finished')
if __name__ == '__main__':
    t = perf_counter()
    q = Queue()
    url_q = Queue()
    download_q = Queue()
    p1 = Process(target=category_page, args=(q,))
    p2 = Process(target=download_page_parse, args=(q, url_q))
    p3 = Process(target=download_url_parse, args=(url_q, download_q))
    p_l = [p1, p2, p3]
    for i in p_l:
        i.start()
        i.join()
    download_list = download_q.get()
    pool = Pool(10)
    pool.map(down_load, download_list)
    t1 = perf_counter()
    cost = t1 - t
    print(cost, 's')
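One more tweak worth trying: the zip files can be several megabytes each, and RequestsDX() downloads the whole body into memory before iter_content() runs. Passing stream=True to requests.get writes each chunk to disk as it arrives instead. The function below is only a sketch with a made-up name; it takes the same (name, url) tuple that down_load() receives from the pool, and the browser headers from RequestsDX() could be passed along as well.
[Python]
import requests

def down_load_streamed(item):
    # Stream one zip to disk chunk by chunk instead of holding it fully in memory.
    name, url = item
    with requests.get(url, stream=True, timeout=30) as response:
        with open(r'D:\ppt\%s.zip' % name, 'wb') as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)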
Result screenshot of the second improvement:
Four processes plus a Queue; the speed improved noticeably.
Result screenshot of the final version:
A pool of 10 processes finished the whole job in 48 seconds!
One last, serious note: this PPT site was scraped purely for learning and discussion. Please do not abuse or attack the site.