多任务异步爬取实战之wallhere
本贴主要练习记录异步爬取之功效,对网页图片未作详细研究。望各位吧友见谅
# -*- coding: utf-8 -*-
# @Time : 2021/3/9 20:45
# @Author : wuqi
# @File : wallhere爬取(多任务异步版).py
# @Software : PyCharm
import requests
import re
import os
import asyncio
import aiohttp
import time
#定义正则
findimg='data-src="(.*?)"'
#爬取图片源码
url='https://wallhere.com/zh/wallpapers?order=popular'
header={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.1071 SLBChan/33'
}
req=requests.get(url=url,headers=header,timeout=3).text
img_list=re.findall(findimg,req)
print(len(img_list))
#创建文件夹储存图片
if not os.path.exists('./bizhi2'):
os.mkdir('./bizhi2')
i=0
#储存图片
async def saveimg(imgurl):
async with aiohttp.ClientSession() as sess: #注意with前需要加上async
async with await sess.get(url=imgurl,headers=header) as response:
img_text=await response.read() #阻塞操作需加await,否则会报错RuntimeWarning: coroutine 'ClientResponse.read' was never awaited
global i
i=i+1
filename='bizhi2/'+str(i)+'.jpg'
withopen(filename,'wb') as fb:
fb.write(img_text)
if __name__ == '__main__':
start=time.time()
tasks=[]
for img in img_list:
#创建一个协程对象
c=saveimg(img)
#将协程对象封装成任务对象
task=asyncio.ensure_future(c)
tasks.append(task)
#创建事件循环对象
loop=asyncio.get_event_loop()
# 将任务对象注册到循环对象中开启循环
loop.run_until_complete(asyncio.wait(tasks))
print('over...time: ',time.time()-start) 看起来不错 哥们,你这下载得都是缩略图啊
页:
[1]