本帖最后由 话痨司机啊 于 2022-6-12 13:40 编辑
兄弟萌,我切菜手切坏了,打个字都费劲,给点赞吧。最近上班忙,抽空看看异步,觉得好强大,当然底层的话这个就是消息循环调用。
同步与异步耗时对比:
只是测了50张的图片,耗时对比竟然就高达近 15 秒,如果是百万张,可想差距有多大,如果是多线程,量小差距不大。
上代码!!
异步代码:
[Python] 纯文本查看 复制代码
import aiohttp
import asyncio
import os
import aiofiles
from loguru import logger as logs
from fake_useragent import UserAgent
from tqdm import tqdm
import time
# Configure a loguru file sink for the async downloader's log.
# enqueue=True routes records through a queue, making logging safe
# when called from many concurrent tasks; backtrace=True captures
# extended tracebacks for logs.exception().
logs.add('download_pic.log',level='INFO',format="{time:YYYY-MM-DD HH:mm:ss} {level} {message}",enqueue=True,encoding='utf-8',backtrace=True)
async def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    Uses ``exist_ok=True`` instead of the check-then-create pattern, so a
    concurrent task creating the same directory between the existence
    check and ``makedirs`` can no longer raise ``FileExistsError``.
    """
    # os.makedirs is blocking, but directory creation is fast enough that
    # offloading it to a thread is not worth the overhead here.
    os.makedirs(path, exist_ok=True)
async def request(url, sem, img=False):
    """Fetch *url*; return raw bytes when *img* is true, parsed JSON otherwise.

    Concurrency is bounded by the semaphore *sem*.  A fresh session is
    created per call to preserve the original call signature; for very
    large batches a shared ClientSession would be cheaper — TODO confirm
    whether callers could pass one in.
    """
    headers = {'User-Agent': UserAgent().random,
               'Referer': 'https://photo.ihansen.org'}
    # Acquire the semaphore before creating the session so connection
    # setup is throttled together with the transfer itself.
    async with sem:
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url, verify_ssl=False) as response:
                # raise_for_status() raises ClientResponseError on 4xx/5xx
                # and, unlike the original `assert`, is not stripped under
                # `python -O`.
                response.raise_for_status()
                if img:
                    return await response.read()
                return await response.json(encoding='utf8')
async def save_image(res, path, sem):
    """Asynchronously write the downloaded bytes *res* to *path*.

    The semaphore *sem* caps how many files are open concurrently.
    """
    async with sem:
        handle = await aiofiles.open(path, 'wb')
        try:
            await handle.write(res)
        finally:
            await handle.close()
async def get_json_info(sem, start_page, end_page):
    """Fetch the JSON listing for every page in [start_page, end_page]
    concurrently and return the list of parsed responses.
    """
    template = 'https://api.ihansen.org/img/detail?page={num}&perPage=50&index=&orderBy=today'
    coros = []
    for page in range(start_page, end_page + 1):
        coros.append(request(template.format(num=page), sem, img=False))
    return await asyncio.gather(*coros)
async def get_img_content_and_downloads(sem, urls):
    """Download every image in *urls* and save each one as <index>.jpg
    under ./img next to this script.

    Two fixes over the original: the target directory is created *before*
    any network work (a failing mkdir is caught early), and every image is
    written to disk as soon as it arrives instead of first buffering the
    bytes of all images in one in-memory list.
    """
    base_dir = os.path.normpath(os.path.dirname(__file__) + '/img')
    await mkdir(base_dir)

    async def _fetch_and_store(index, url):
        # One task per image: download, then persist immediately.
        content = await request(url, sem, img=True)
        await save_image(content, os.path.join(base_dir, str(index) + '.jpg'), sem)

    futures = [asyncio.ensure_future(_fetch_and_store(i, u))
               for i, u in enumerate(urls)]
    # as_completed lets the progress bar tick as each image finishes.
    for fut in tqdm(asyncio.as_completed(futures),
                    total=len(futures), desc='正在下载图片'):
        await fut
async def async_main():
    """Async driver: read the page range from stdin, fetch the listings,
    collect the image URLs and download them, logging total elapsed time.
    """
    start = time.time()
    start_page = int(input('请输入起始页:'))
    end_page = int(input('请输入结束页:'))
    # Cap concurrency at 10; larger values risk a temporary IP ban.
    sem = asyncio.Semaphore(10)
    try:
        # Fetch the per-page download listings.
        json_list = await get_json_info(sem, start_page, end_page)
        print('正在获取图片链接...')
        # Flatten the per-page listings; each entry's 'raw' field holds
        # the image URL (the original copied json_list through a no-op
        # comprehension before looping).
        url_list = [item.get('raw') for page in json_list for item in page]
        await get_img_content_and_downloads(sem, url_list)
        logs.info(f'从第{start_page}页到第{end_page}页下载耗时:{time.time()-start}')
    except Exception as e:
        # Top-level boundary: log the full traceback rather than crash.
        logs.exception(e)
if __name__ == '__main__':
    # asyncio.run() creates, runs and closes the event loop properly;
    # get_event_loop().run_until_complete() outside a running loop is
    # deprecated since Python 3.10.
    asyncio.run(async_main())
同步代码:
[Python] 纯文本查看 复制代码 from asyncio import constants
import requests
import os
from loguru import logger as logs
from fake_useragent import UserAgent
from alive_progress import alive_bar
import time
from urllib3 import disable_warnings
disable_warnings()
# Configure a loguru file sink for the synchronous downloader's log.
logs.add('download_sync_pic.log',level='INFO',format="{time:YYYY-MM-DD HH:mm:ss} {level} {message}",encoding='utf-8')
def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    ``exist_ok=True`` removes the race in the original
    ``if not os.path.exists(path): os.makedirs(path)`` pattern, where a
    directory created between the check and the call raised an error.
    """
    os.makedirs(path, exist_ok=True)
def get_html(url, img=False, bar=None):
    """GET *url*; return raw bytes when *img* is true, parsed JSON otherwise.

    When downloading an image, the optional *bar* callable (an alive_bar
    tick) is invoked once to advance the progress bar.
    """
    headers = {'User-Agent': UserAgent().random,
               'Referer': 'https://photo.ihansen.org'}
    res = requests.get(url, headers=headers, verify=False)
    # raise_for_status() raises HTTPError on 4xx/5xx and, unlike the
    # original `assert`, survives `python -O`.
    res.raise_for_status()
    if img:
        if bar is not None:
            # Guard the tick: `bar` defaults to None, and the original
            # crashed with TypeError when img=True was passed without it.
            bar()
        return res.content
    return res.json()
def save_img(res, path):
    """Persist the downloaded bytes *res* to the file at *path*."""
    with open(path, mode='wb') as handle:
        handle.write(res)
def get_json_info(start_page, end_page):
    """Return the parsed JSON listing for each page in
    [start_page, end_page], fetched sequentially.
    """
    template = 'https://api.ihansen.org/img/detail?page={num}&perPage=50&index=&orderBy=today'
    pages = []
    for page in range(start_page, end_page + 1):
        pages.append(get_html(template.format(num=page), img=False))
    return pages
def get_img_info(urls):
    """Download each URL in *urls* into ./sync_img/<index>.jpg.

    Fixes over the original: a plain loop replaces the list comprehension
    that was used purely for side effects, and each image is written to
    disk right after it downloads instead of first collecting the bytes
    of every image in memory.
    """
    base_dir = os.path.normpath(os.path.dirname(__file__) + '/sync_img')
    mkdir(base_dir)
    with alive_bar(len(urls)) as bar:
        for index, url in enumerate(urls):
            content = get_html(url, img=True, bar=bar)
            save_img(content, os.path.join(base_dir, str(index) + '.jpg'))
def main():
    """Synchronous driver: read the page range from stdin, fetch the
    listings, collect the image URLs, download them and log the elapsed
    time.
    """
    start = time.time()
    start_page = int(input('请输入起始页:'))
    end_page = int(input('请输入结束页:'))
    json_list = get_json_info(start_page, end_page)
    # Flatten the per-page listings; each entry's 'raw' field holds the
    # image URL (the original copied json_list through a no-op
    # comprehension before looping).
    url_list = [item.get('raw') for page in json_list for item in page]
    get_img_info(url_list)
    logs.info(f'从第{start_page}页到第{end_page}页下载耗时:{time.time()-start}')
if __name__ == "__main__":
main()
|