爬小姐姐视频 为啥 异步开不起来 问题在哪里呢
本帖最后由 lihu5841314 于 2021-7-6 11:18 编辑#--------------------------------------------------
import requests, time, re,os
from bs4 import BeautifulSoup
from multiprocessing.dummy importPool
"""
1.通过抓包工具分析视频网址 视频音频一般在Media中找到视频url
https://huya-w10.huya.com/2119/505632739/1300/8bafda5b5bbace86a2edcb1cdb2da201.mp4
2.分析视频url的来源通过截取视频url的后半段数据用全局搜索去搜 例如:8bafda5b5bbace86a2edcb1cdb2da201
找到视频url的来源 url_1 = https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124001977014823594203_1625483619844&videoId=505632739&_=1625483619852
同过多个视频分析来源url的变化url_2 = https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_1625484034808&videoId=515515867&uid=&_=1625484034830
?后面是请求携带的参数通过分析可以知道=1625483619852是时间戳可以通过python的time模块实现
&videoId就是视频请求url的后半部分
jQuery112403179934484805591_1625484034808没搞明白 暂时不管试试
"""
# 1.从列表页响应中获得每个视频的播放页url
# 2.videoId+时间戳构建视频来源的url
# 3.从视频来源的url中提取出视频的url地址
# 4.请求视频url地址持久化存储
# 目标网站
url = 'https://v.huya.com/g/all?set_id=31&order=hot&page=1'
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}
# 构建请求函数
def get_response(url):
try:
resp = requests.get(url=url, headers=headers)
resp.encoding = resp.apparent_encoding
if resp.status_code == 200:
return resp
except:
print("请求失败")
# 解析每个视频的url
def parse_videoid(resp):
# 创建BeautifulSoup对象
soup = BeautifulSoup(resp.text, "lxml")# 不填写"lxml"会报警告
li_lis = soup.find('ul', "vhy-video-list w215 clearfix").findAll('li')
video_dics = []
for li in li_lis:
video_url = 'https://v.huya.com' + li.find('a').get('href')
video_name = li.find('a').get('title')
video_dic = {
'video_url': video_url,
'video_name': video_name
}
video_dics.append(video_dic)
return video_dics
# 构建来源url的时间戳是13位对时间戳进行构造
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124005236691164789575_1625490993380&videoId=525648071&_=1625490993386
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124040335487863006936_1625490632031&videoId=508624469&_=162549063203
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_1625484034808&videoId=515515867&_=1625484034830
def create_url(video_dics):
mov_dics = []
for video_dic in video_dics:
video_url = video_dic['video_url']
video_name = video_dic['video_name']
video_id = video_url.split('/')[-1].split('.')
url_time = int(time.time() * 1000)# 把时间戳变13位取整
mov_url = f'https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_{url_time}&videoId={video_id}&_={url_time}'
mov_dic = {
'video_url': mov_url,
'video_name': video_name
}
mov_dics.append(mov_dic)
return mov_dics
# 从来源url响应中提取出视频的url
def get_movie_url(mov_dic):
url = mov_dic['video_url']
video_name = mov_dic['video_name']
try:
resp = get_response(url)
movie_url = re.findall(r',"url":"(?P<movie_url>.*?)"', resp.text)
if len(movie_url) > 0:
movie_url = movie_url.split("?")
dic ={
'movie_url':movie_url,
'video_name':video_name,
}
return dic
except IndexError:
print('list index out of range')
def Down_movie(dic):
url = dic['movie_url']
print(url)
name =dic['video_name']
path = url.split('/')[-1].split('?')
path1 = 'video/'+path.replace(path.split('.'), name)
resp = requests.get(url=url,headers=headers)
print(name, '************正在下载**********')
with open(path1, 'wb') as f:
f.write(resp.content)
print(name, '下载完成')
time.sleep(2)
def main():
if not os.path.exists("video"):
os.makedirs("video")
start = time.time()
resp = get_response(url)
video_dics = parse_videoid(resp)
mov_dics = create_url(video_dics)
dics = []
for mov_dic in mov_dics:
dic = get_movie_url(mov_dic)
dics.append(dic)
pool.map(Down_movie,dics)
pool.close()
pool.join()
print('over',time.time()-start)
if __name__ == '__main__':
pool = Pool(6)
main()import requests,time,re
from bs4importBeautifulSoup
import aiohttp
import aiofiles
import asyncio
"""
1.通过抓包工具分析视频网址 视频音频一般在Media中找到视频url
https://huya-w10.huya.com/2119/505632739/1300/8bafda5b5bbace86a2edcb1cdb2da201.mp4
2.分析视频url的来源通过截取视频url的后半段数据用全局搜索去搜 例如:8bafda5b5bbace86a2edcb1cdb2da201
找到视频url的来源 url_1 = https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124001977014823594203_1625483619844&videoId=505632739&_=1625483619852
同过多个视频分析来源url的变化url_2 = https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_1625484034808&videoId=515515867&uid=&_=1625484034830
?后面是请求携带的参数通过分析可以知道=1625483619852是时间戳可以通过python的time模块实现
&videoId就是视频请求url的后半部分
jQuery112403179934484805591_1625484034808没搞明白 暂时不管试试
"""
#1.从列表页响应中获得每个视频的播放页url
#2.videoId+时间戳构建视频来源的url
#3.从视频来源的url中提取出视频的url地址
#4.请求视频url地址持久化存储
#目标网站
url = 'https://v.huya.com/g/all?set_id=31&order=hot&page=1'
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}
#构建请求函数
defget_response(url):
try:
resp =requests.get(url=url,headers=headers)
resp.encoding = resp.apparent_encoding
if resp.status_code == 200:
return resp
except:
print("请求失败")
#解析每个视频的url
def parse_videoid(resp):
#创建BeautifulSoup对象
soup = BeautifulSoup(resp.text,"lxml")#不填写"lxml"会报警告
li_lis = soup.find('ul',"vhy-video-list w215 clearfix").findAll('li')
video_dics = []
for li inli_lis:
video_url ='https://v.huya.com' + li.find('a').get('href')
video_name = li.find('a').get('title')
video_dic = {
'video_url':video_url,
'video_name':video_name
}
video_dics.append(video_dic)
return video_dics
#构建来源url的时间戳是13位对时间戳进行构造
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124005236691164789575_1625490993380&videoId=525648071&_=1625490993386
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124040335487863006936_1625490632031&videoId=508624469&_=162549063203
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_1625484034808&videoId=515515867&_=1625484034830
defcreate_url(video_dics):
mov_dics = []
for video_dic in video_dics:
video_url = video_dic['video_url']
video_name = video_dic['video_name']
video_id = video_url.split('/')[-1].split('.')
url_time = int(time.time() * 1000)# 把时间戳变13位取整
mov_url = f'https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_{url_time}&videoId={video_id}&_={url_time}'
mov_dic={
'video_url':mov_url,
'video_name': video_name
}
mov_dics.append(mov_dic)
return mov_dics
#从来源url响应中提取出视频的url
def get_movie_url(mov_dic):
url = mov_dic['video_url']
video_name = mov_dic['video_name']
try:
resp = get_response(url)
movie_url = re.findall(r',"url":"(?P<movie_url>.*?)"',resp.text)
iflen(movie_url) > 0:
movie_url = movie_url
return movie_url,video_name
except IndexError:
print('list index out of range')
async def Down_movie(url,name):
path = url.split('/')[-1].split('?')
path1 = path.replace(path.split('.'),name)
async with aiohttp.ClientSession()as session:
async with awaitsession.get(url,headers=headers)as resp:
resp = awaitresp.read()
print(name,'************正在下载**********')
async with aiofiles.open(path1,'wb') as f:
await f.write(resp)
print(name,'下载完成')
await asyncio.sleep(2)
defmain():
resp = get_response(url)
video_dics = parse_videoid(resp)
mov_dics = create_url(video_dics)
tasks = []
for mov_dic inmov_dics:
movie_url,video_name = get_movie_url(mov_dic)
task = asyncio.ensure_future(Down_movie(movie_url,video_name))
tasks.append(task)
loop.run_until_complete(asyncio.wait(tasks))
print('over')
if __name__ == '__main__':
loop = asyncio.get_event_loop()
main() 这种不用API输出json格式的的网站,一般称为LJ网站,看的兴趣都没有:lol 没有啥东西,不敢兴趣 視頻直接是用mp4存放傳輸的,
放到現在來說,
商用的話,
肯定是短視頻!
嗯,
短的我已經有很多了,
不感興趣了! 学习一下,没学懂 可惜了看不懂 换个网址试试{:301_997:} 这种不用API输出json格式的的网站,一般称为LJ网站,看的兴趣都没有{:301_998:} 看起来又是个不用API输出json格式的网站诶 不用开异步吧 我怎么感觉调用这个函数,loop没传进去啊
页:
[1]
2