好友
阅读权限 10
听众
最后登录 1970-1-1
本帖最后由 AnWenpython 于 2021-11-18 19:43 编辑
先看同步方法的代码
[Python] 纯文本查看 复制代码
# Synchronous ivsky.com wallpaper scraper.
# 1. Input the gallery URL, e.g. https://www.ivsky.com/bizhi/moraine_lake_v48781/
#    First picture page: https://www.ivsky.com/bizhi/moraine_lake_v48781/pic_769150.html
#    Signed token example: wyAmZ1NxpQ3/ljl4PhA7CKEXM04HGkR+mKNM8hKFgkdWRVMaz+QZp1vq+FUCxrkeCcDJYkP4UIlYC5Qh2/w1vTGXChZo
#    Original-image host: https://img-picdown.ivsky.com/
#    e.g. img/downloadpic/download/fmAmcgttgHinljl4PhA7CKEXM04HGkRmgK9M8hKFgkdWRVMaz+QZp1vq+FUCxpUeFcDBdkP8OI15F4R1O
#    Token endpoint: https://m.ivsky.com/get_picinfo.php?tn=downloadpic&picurl=/img/bizhi/pic/201804/30/moraine_lake.jpg
#    Relative image path: /img/bizhi/pic/201804/30/moraine_lake.jpg
import requests
from lxml import etree
import re
import os
import json
import time

# A shared session would reuse connections, but plain requests works too:
# sessionp = requests.Session()
t1 = time.time()
if not os.path.exists('./极简壁纸爬取结果'):
    os.mkdir('./极简壁纸爬取结果')  # folder that receives the downloaded images

# 1. GET the gallery page and collect every picture page's relative href.
url = 'https://www.ivsky.com/bizhi/yangzi_v59469/'
html1 = requests.get(url=url)
d = etree.HTML(html1.text)
# hrefs look like "/bizhi/moraine_lake_v48781/pic_769150.html"
slist = d.xpath('/html/body/div[3]/div[4]/ul/li/div/a/@href')
html1.close()

# Regex pulling the relative image path out of the page's inline JavaScript.
ec = "var imgURL='(.*?)';var"

# 2. Request each picture page, exchange its image path for a signed
#    download token, then fetch and save the full-resolution image.
for li in slist:
    url2 = 'https://www.ivsky.com' + li
    html2 = requests.get(url=url2)
    data_url = 'https://www.ivsky.com/get_picinfo.php?'
    # The token endpoint refuses cookieless requests, so send the session cookie.
    hdes = {
        'cookie': '__yjs_duid=1_1bda7ffae32f47059d094aa036adcf651634380757728; Hm_lvt_a951b469f6e313457f2934c362ed30de=1636204999,1636277027,1636278256; statistics_clientid=me; Hm_lvt_862071acf8e9faf43a13fd4ea795ff8c=1636954225,1637036227,1637054815,1637123760; Hm_lpvt_c13cf8e9faf62071ac13fd4eafaf1acf=1637140041; Hm_lpvt_862071acf8e9faf43a13fd4ea795ff8c=1637140042'
    }
    data = {
        'tn': 'downloadpic',
        'picurl': re.findall(ec, html2.text)[0]  # e.g. /img/bizhi/pic/201804/30/moraine_lake.jpg
    }
    # 3. Ask get_picinfo.php for the signed download suffix (JSON, key "data").
    data_html = requests.get(url=data_url, headers=hdes, params=data)
    dit = data_html.json()
    h = dit['data']
    url3 = 'https://img-picdown.ivsky.com/img/downloadpic/download/' + h
    img_name = url3.split('/')[-1]  # last token segment doubles as the file name
    imgPath = './极简壁纸爬取结果/' + img_name + '.jpg'  # destination path
    with open(imgPath, 'wb') as fp:
        fp.write(requests.get(url=url3).content)
        print(img_name, '下载成功')
    print(img_name, '爬取成功!!!')
    html2.close()
    data_html.close()

t2 = time.time()
print(t2 - t1)
异步方法的代码
# Asynchronous version of the ivsky.com wallpaper scraper — setup phase.
# 1. Input the gallery URL, e.g. https://www.ivsky.com/bizhi/moraine_lake_v48781/
#    First picture page: https://www.ivsky.com/bizhi/moraine_lake_v48781/pic_769150.html
#    Original-image host: https://img-picdown.ivsky.com/
#    e.g. img/downloadpic/download/wyAmZ1NxpQ3/ljl4PhA7CKEXM04HGkR+mKNM8hKFgkdWRVMaz+QZp1vq+FUCxrkeCcDJYkP4UIlYC5Qh2/w1vTGXChZo
#    or   img/downloadpic/download/fmAmcgttgHinljl4PhA7CKEXM04HGkRmgK9M8hKFgkdWRVMaz+QZp1vq+FUCxpUeFcDBdkP8OI15F4R1O
#    Relative image path fed to the token endpoint:
#    /img/bizhi/pic/201804/30/moraine_lake.jpg
import requests
from lxml import etree
import re
import os
import json
import asyncio
import aiohttp
import aiofiles
import time

# A shared session would reuse connections, but plain requests works too:
# sessionp = requests.Session()

# Plan:
# 1. Synchronous phase: fetch the gallery page, collect every picture page
#    URL, and parse out the get_picinfo "data" URLs.
# 2. Asynchronous phase: resolve those data URLs to signed tokens and
#    download all images concurrently.
t1 = time.time()
if not os.path.exists('./极简壁纸爬取结果2'):
    os.mkdir('./极简壁纸爬取结果2')  # folder that receives the downloaded images

# 1. GET the gallery page and collect every picture page's relative href.
url = 'https://www.ivsky.com/bizhi/yangzi_v59469/'
html1 = requests.get(url=url)
d = etree.HTML(html1.text)
# hrefs look like "/bizhi/moraine_lake_v48781/pic_769150.html"
slist = d.xpath('/html/body/div[3]/div[4]/ul/li/div/a/@href')
html1.close()

# 2. Each picture page is fetched concurrently; the resulting
#    get_picinfo data URLs accumulate here.
lest = []
async def yyds(url):
    """Fetch one picture page, pull the relative image path out of its
    inline JS, and append the matching get_picinfo data URL to the
    module-level ``lest`` list."""
    url = 'https://www.ivsky.com' + url
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            html = await resp.text()
            # e.g. captures /img/bizhi/pic/201804/30/moraine_lake.jpg
            ec = "var imgURL='(.*?)';var"
            s = re.findall(ec, html)[0]
            data_url = f'https://www.ivsky.com/get_picinfo.php?tn=downloadpic&picurl={s}'
            lest.append(data_url)
            print(f'{s}获取成功')


async def mian():
    """Fan out one fetch task per picture page and wait for all of them."""
    await asyncio.gather(*(yyds(href) for href in slist))
    print(lest)


# asyncio.run creates and tears down a fresh event loop; the old
# get_event_loop()/run_until_complete() pairing is deprecated for this use.
asyncio.run(mian())
# --- asynchronous download phase ---
# Cookie sent to the token endpoint (it refuses cookieless requests).
hdrs = {
    'cookie': '__yjs_duid=1_1bda7ffae32f47059d094aa036adcf651634380757728; Hm_lvt_a951b469f6e313457f2934c362ed30de=1636204999,1636277027,1636278256; statistics_clientid=me; Hm_lvt_862071acf8e9faf43a13fd4ea795ff8c=1637036227,1637054815,1637123760,1637141749; Hm_lpvt_c13cf8e9faf62071ac13fd4eafaf1acf=1637141759; Hm_lpvt_862071acf8e9faf43a13fd4ea795ff8c=1637141760'
}


async def aiodownload(h):
    """Download one full-resolution image identified by its signed token ``h``."""
    url = 'https://img-picdown.ivsky.com/img/downloadpic/download/' + h
    img_name = url.split('/')[-1]  # last token segment doubles as the file name
    imgPath = './极简壁纸爬取结果2/' + img_name + '.jpg'  # destination path
    async with aiohttp.ClientSession() as session:
        # NOTE(review): the post reports the async-downloaded files won't
        # open. The original request sent no headers here, so the image host
        # may have rejected it and returned an error page that got saved as
        # .jpg; forwarding the same cookie used for the token request is a
        # plausible fix — TODO confirm against a live run.
        async with session.get(url=url, headers=hdrs) as resp:
            async with aiofiles.open(imgPath, mode='wb') as f:
                await f.write(await resp.content.read())
                print(img_name, "下载完成")
async def getCatalog(lest):
    """Resolve each data URL to a signed token, then download all images
    concurrently via :func:`aiodownload`."""
    tasks = []
    for item in lest:
        # NOTE(review): requests.get blocks the event loop; tolerable here
        # because no download task runs until the gather below, but an
        # aiohttp call would keep this phase fully non-blocking.
        data_json = requests.get(url=item, headers=hdrs).json()
        h = data_json['data']
        tasks.append(asyncio.create_task(aiodownload(h)))
    # Tokens resolved — start every download at once.
    print('开始下载任务')
    await asyncio.gather(*tasks)


# asyncio.run replaces the deprecated get_event_loop()/run_until_complete().
asyncio.run(getCatalog(lest))
t2 = time.time()
print(t2 - t1)
两者都是用 wb 模式写入二进制数据。
同步爬取的结果非常好,图片都能正常打开;
异步爬取保存下来的图片却打不开。
发帖前要善用【论坛搜索 】 功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。