This crawler is just an asynchronous rewrite of another crawler posted on this forum; the original thread is here: https://www.52pojie.cn/thread-1685010-1-1.html. In testing, the crawl occasionally stalls for a while, but the script itself raises no errors, so this is most likely the API endpoint being slow to return data; just wait it out (or see the timeout sketch after the code below).
Demo:
Crawler code:
import re
import os
import requests
import aiohttp
import aiofiles
import asyncio
import uvloop  # comment out this line on Windows (uvloop does not support Windows)
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())  # comment out this line on Windows
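# (Not in the original post: a commented-out alternative.) Instead of manually
# commenting the two uvloop lines out on Windows, the import could be made optional:
# try:
#     import uvloop
#     asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
# except ImportError:
#     pass  # uvloop unavailable (e.g. on Windows); keep the default asyncio event loop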
headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
}
quantity = 0  # running count of videos found
indexNum = 1  # index of the video currently being written to disk

async def start(session, sec_uid, max_cursor='0'):
    global quantity
    apiUrl = f'https://m.douyin.com/web/api/v2/aweme/post/?reflow_source=reflow_page&sec_uid={sec_uid}&count=21&max_cursor={max_cursor}'
    data = await getData(session, apiUrl)  # request one page of the author's videos
    # print(data)
    max_cursor = data['max_cursor']  # cursor for the next page
    aweme_list = data['aweme_list']  # list of videos on this page
    tasks = []
    for aweme in aweme_list:  # walk the video list
        quantity += 1
        video_name = aweme['desc']  # video title
        video_url = aweme['video']['play_addr']['url_list'][0]  # video download address
        nickname = aweme['author']['nickname']  # author nickname
        video_name = video_name.replace('\n', ' ')  # replace newlines with spaces
        video_name = re.sub(r'[\/:*?"<>|]', '-', video_name)  # strip characters that are illegal in file names
        # print(f'Downloading video {quantity}: {video_name}')
        if video_name == '':
            video_name = str(quantity) + nickname  # fall back to count + nickname when the title is empty
        task = asyncio.create_task(downVideo(session, video_url, video_name, nickname))
        tasks.append(task)
    if tasks:  # asyncio.wait() raises if given an empty collection
        await asyncio.wait(tasks)
    has_more = data['has_more']
    if not has_more:  # has_more is falsy once the last page has been fetched
        print(f'Download finished! {quantity} videos downloaded in total')
        return
    else:
        await start(session, sec_uid, max_cursor)  # fetch the next page

async def downVideo(session, url, videoname, nickname):
    global indexNum
    async with session.get(url, headers=headers) as resp:
        if resp.status == 200:
            videoContent = await resp.read()
            downloadPath = "/home/yin/download"
            if not os.path.exists(f'{downloadPath}/{nickname}'):  # create one folder per author
                os.mkdir(f'{downloadPath}/{nickname}')
            async with aiofiles.open(f'{downloadPath}/{nickname}/{videoname}.mp4', 'wb') as f:
                print(f'Downloading video {indexNum}: {videoname}')
                await f.write(videoContent)
                indexNum += 1

async def getData(session, url):
    async with session.get(url, headers=headers) as resp:
        if resp.status == 200 or resp.status == 302:
            apiJson = await resp.json()
            return apiJson

async def indexInfo(url):
    try:
        sec_uid = re.findall(r'user/(.*)\?', url)[0]  # extract sec_uid from the profile URL
    except IndexError:  # no query string in the URL
        sec_uid = re.findall(r'user/(.*)', url)[0]
    return sec_uid

async def main(offset):
    secUid = await indexInfo(offset)
    async with aiohttp.ClientSession() as session:
        await start(session, secUid)

if __name__ == '__main__':
    url = input('Enter the author profile link: ')  # e.g. https://www.douyin.com/user/MS4wLjABAAAAm-YgirNQo_9nm1B8TNynOD5ZrYBtesVrgBuaZaS2dzQ?vid=6907843457583205646
    url = requests.get(url, headers=headers).url  # follow redirects to get the final URL
    asyncio.run(main(url))
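About the stalls mentioned at the top: a rough, untested sketch of one way to soften them is to give each API call a timeout and retry a few times instead of waiting indefinitely. The getData variant below is only an illustration, not part of the original script; the 10-second timeout and 3 retries are arbitrary values, and it reuses the headers dict defined in the script above.

import asyncio
import aiohttp

async def getData(session, url, retries=3):
    # Sketch only: retry the same request a few times, giving up on any single
    # attempt after 10 seconds so a hung API call does not block the whole crawl.
    for attempt in range(retries):
        try:
            timeout = aiohttp.ClientTimeout(total=10)
            async with session.get(url, headers=headers, timeout=timeout) as resp:
                if resp.status == 200 or resp.status == 302:
                    return await resp.json()
        except asyncio.TimeoutError:
            print(f'Request timed out, retrying ({attempt + 1}/{retries})...')
    return None  # all attempts failed; the caller has to handle this

If it does end up returning None, start() would also need a small guard before indexing into data, which the original code does not have.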