Finally got a 52pojie account. Stepping into the world of the big shots and following in their footsteps, starting with a small tool.
Practicing coroutines....
But it feels a bit odd..
Async sandwiched with sync.... a homemade, countryside version of coroutines (one way to get the blocking call off the event loop is sketched after the listing).
No idea how to download files with aiohttp... so I just call IDM to do the downloading (a rough aiohttp alternative is sketched right below).
It isn't written well; pointers are very welcome.
Crawl target, the 爱盘 download site: https://down.52pojie.cn/
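By the way, for anyone who wants to skip IDM: something along these lines should roughly work for downloading straight with aiohttp. This is only a sketch; the helper name download_file, the chunk size and the example url/path are placeholders I made up and are not part of the script below. The full script follows after it.

import os
import asyncio
import aiohttp

async def download_file(url, dest_path):
    # make sure the target folder exists before writing
    folder = os.path.dirname(dest_path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            with open(dest_path, "wb") as f:
                # stream the body in chunks instead of reading it all into memory
                async for chunk in resp.content.iter_chunked(64 * 1024):
                    f.write(chunk)

# usage (placeholder url and path):
# asyncio.get_event_loop().run_until_complete(
#     download_file("https://down.52pojie.cn/some/file.zip", r"爱盘\file.zip"))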
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = "mxrain"

import re
import os
import asyncio
import aiohttp
from urllib.parse import urlparse, unquote

seen_urls = set()  # urls already seen, for de-duplication
sem = asyncio.Semaphore(3)  # limit concurrency to 3 requests
waitting_urls = []  # directory urls waiting to be crawled
stopping = False
start_url = "https://down.52pojie.cn/"  # start url
main_path = "{}\\爱盘".format(os.path.dirname(os.path.abspath(__file__)))  # download directory

def check_file_path(url):
    '''
    Map a url to a local folder path:
    decode the url, split off the file name,
    and return the directory the file should be stored in.
    '''
    url = unquote(url)
    up = urlparse(url)
    paths = up.path.replace("/", "\\").rsplit('\\', 1)
    if paths[0] != "":
        folder_path = main_path + paths[0]
    else:
        folder_path = main_path
    return folder_path

def get_file(url):
    '''
    Push the file into the IDM download queue.
    '''
    folder_path = check_file_path(url)
    # IDM command line: /d url, /p save folder, /a add to queue only, /n silent mode
    command = "IDMan.exe /d {} /p {} /a /n".format(url, folder_path)
    os.system(command)
    print('[Queued] : {}'.format(url))

async def fetch(url, session):
    '''
    Request a url and return the html from the server.
    '''
    async with sem:
        # await asyncio.sleep(0.5)
        try:
            async with session.get(url) as resp:
                print('[{}] url: {}'.format(resp.status, url))
                if resp.status in [200, 201]:
                    data = await resp.text(encoding="utf-8")
                    return data
        except Exception as e:
            print("[{}] get url: {}".format(e, url))

async def extract_url(url, session):
    '''
    Pull directory urls out of the page and queue them for crawling;
    pull file urls out of the page and hand them to IDM.
    '''
    html = await fetch(url, session)
    if html is None:  # request failed or non-200 status
        return
    regex_mo = 'href="(.*?)"'
    regex_com = re.compile(regex_mo)
    href_list = regex_com.findall(html)
    for href in href_list:
        urls = url + href
        # classify the href
        if href.endswith("/") and not href.startswith("htt") and not href.startswith("."):
            if urls in seen_urls:  # skip directories we have already queued
                continue
            seen_urls.add(urls)
            waitting_urls.append(urls)
        elif href.endswith(('.zip', '.txt', '.7z', '.rar', '.mp3', '.mp4', '.jpg', '.png', '.md', '.exe')):
            get_file(urls)
        else:
            pass

async def consumer():
    '''
    Keep checking whether waitting_urls has any url left
    and schedule a coroutine on the event loop for each one.
    '''
    async with aiohttp.ClientSession() as session:
        while not stopping:
            if len(waitting_urls) == 0:
                await asyncio.sleep(0.5)
                continue
            url = waitting_urls.pop()
            print("start get url: {}".format(url))
            asyncio.ensure_future(extract_url(url, session))  # throw the coroutine onto the event loop

async def main():
    seen_urls.add(start_url)
    waitting_urls.append(start_url)
    asyncio.ensure_future(consumer())


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(main())
    loop.run_forever()
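About the "async mixed with sync" part: os.system blocks the whole event loop while IDM is being invoked. One rough idea, an untested sketch that assumes get_file above is replaced with this coroutine and that extract_url is changed to await get_file(urls), is to push the blocking call into the default thread pool with run_in_executor:

async def get_file(url):
    folder_path = check_file_path(url)
    command = "IDMan.exe /d {} /p {} /a /n".format(url, folder_path)
    loop = asyncio.get_event_loop()
    # run os.system in the default thread pool so the event loop keeps running
    await loop.run_in_executor(None, os.system, command)
    print('[Queued] : {}'.format(url))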
One last question