爬小姐姐视频为啥异步开不起来问题在哪里呢

lihu5841314 发表于 2021-7-5 21:46

本帖最后由 lihu5841314 于 2021-7-6 11:18 编辑

#--------------------------------------------------
import requests, time, re,os
from bs4 import BeautifulSoup
from multiprocessing.dummy importPool

"""
1.通过抓包工具分析视频网址视频音频一般在Media中找到视频url
https://huya-w10.huya.com/2119/505632739/1300/8bafda5b5bbace86a2edcb1cdb2da201.mp4
2.分析视频url的来源通过截取视频url的后半段数据用全局搜索去搜例如：8bafda5b5bbace86a2edcb1cdb2da201
找到视频url的来源       url_1 = https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124001977014823594203_1625483619844&videoId=505632739&_=1625483619852
同过多个视频分析来源url的变化url_2 = https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_1625484034808&videoId=515515867&uid=&_=1625484034830
?后面是请求携带的参数通过分析可以知道=1625483619852是时间戳可以通过python的time模块实现
&videoId就是视频请求url的后半部分
jQuery112403179934484805591_1625484034808没搞明白暂时不管试试
"""
# 1.从列表页响应中获得每个视频的播放页url
# 2.videoId+时间戳构建视频来源的url
# 3.从视频来源的url中提取出视频的url地址
# 4.请求视频url地址持久化存储

# 目标网站
url = 'https://v.huya.com/g/all?set_id=31&order=hot&page=1'

headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}

# 构建请求函数
def get_response(url):
try:
   resp = requests.get(url=url, headers=headers)
   resp.encoding = resp.apparent_encoding
   if resp.status_code == 200:
         return resp
except:
   print("请求失败")

# 解析每个视频的url
def parse_videoid(resp):
# 创建BeautifulSoup对象
soup = BeautifulSoup(resp.text, "lxml")# 不填写"lxml"会报警告
li_lis = soup.find('ul', "vhy-video-list w215 clearfix").findAll('li')
video_dics = []
for li in li_lis:
   video_url = 'https://v.huya.com' + li.find('a').get('href')
   video_name = li.find('a').get('title')
   video_dic = {
         'video_url': video_url,
         'video_name': video_name
   }
   video_dics.append(video_dic)
return video_dics

# 构建来源url的时间戳是13位对时间戳进行构造
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124005236691164789575_1625490993380&videoId=525648071&_=1625490993386
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124040335487863006936_1625490632031&videoId=508624469&_=162549063203
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_1625484034808&videoId=515515867&_=1625484034830
def create_url(video_dics):
mov_dics = []
for video_dic in video_dics:
   video_url = video_dic['video_url']
   video_name = video_dic['video_name']
   video_id = video_url.split('/')[-1].split('.')
   url_time = int(time.time() * 1000)# 把时间戳变13位取整
   mov_url = f'https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_{url_time}&videoId={video_id}&_={url_time}'
   mov_dic = {
         'video_url': mov_url,
         'video_name': video_name
   }
   mov_dics.append(mov_dic)
return mov_dics

# 从来源url响应中提取出视频的url
def get_movie_url(mov_dic):
url = mov_dic['video_url']
video_name = mov_dic['video_name']
try:
   resp = get_response(url)
   movie_url = re.findall(r',"url":"(?P<movie_url>.*?)"', resp.text)
   if len(movie_url) > 0:
         movie_url = movie_url.split("?")
         dic ={
            'movie_url':movie_url,
            'video_name':video_name,
         }
         return dic
except IndexError:
   print('list index out of range')

def Down_movie(dic):
url = dic['movie_url']
print(url)
name =dic['video_name']
path = url.split('/')[-1].split('?')
path1 = 'video/'+path.replace(path.split('.'), name)
resp = requests.get(url=url,headers=headers)
print(name, '************正在下载**********')
with open(path1, 'wb') as f:
   f.write(resp.content)
   print(name, '下载完成')
   time.sleep(2)

def main():
if not os.path.exists("video"):
   os.makedirs("video")
start = time.time()
resp = get_response(url)
video_dics = parse_videoid(resp)
mov_dics = create_url(video_dics)
dics = []
for mov_dic in mov_dics:
   dic = get_movie_url(mov_dic)
   dics.append(dic)
pool.map(Down_movie,dics)
pool.close()
pool.join()
print('over',time.time()-start)

if __name__ == '__main__':
pool = Pool(6)
main()import requests,time,re
from bs4importBeautifulSoup
import aiohttp
import aiofiles
import asyncio
"""
1.通过抓包工具分析视频网址视频音频一般在Media中找到视频url
https://huya-w10.huya.com/2119/505632739/1300/8bafda5b5bbace86a2edcb1cdb2da201.mp4
2.分析视频url的来源通过截取视频url的后半段数据用全局搜索去搜例如：8bafda5b5bbace86a2edcb1cdb2da201
找到视频url的来源       url_1 = https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124001977014823594203_1625483619844&videoId=505632739&_=1625483619852
同过多个视频分析来源url的变化url_2 = https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_1625484034808&videoId=515515867&uid=&_=1625484034830
?后面是请求携带的参数通过分析可以知道=1625483619852是时间戳可以通过python的time模块实现
&videoId就是视频请求url的后半部分
jQuery112403179934484805591_1625484034808没搞明白暂时不管试试
"""
#1.从列表页响应中获得每个视频的播放页url
#2.videoId+时间戳构建视频来源的url
#3.从视频来源的url中提取出视频的url地址
#4.请求视频url地址持久化存储

#目标网站
url = 'https://v.huya.com/g/all?set_id=31&order=hot&page=1'

headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}
#构建请求函数
defget_response(url):
try:
   resp =requests.get(url=url,headers=headers)
   resp.encoding = resp.apparent_encoding
   if resp.status_code == 200:
         return resp
except:
   print("请求失败")

#解析每个视频的url
def parse_videoid(resp):
#创建BeautifulSoup对象
soup = BeautifulSoup(resp.text,"lxml")#不填写"lxml"会报警告
li_lis = soup.find('ul',"vhy-video-list w215 clearfix").findAll('li')
video_dics = []
for li inli_lis:
   video_url ='https://v.huya.com' + li.find('a').get('href')
   video_name = li.find('a').get('title')
   video_dic = {
         'video_url':video_url,
         'video_name':video_name
   }
   video_dics.append(video_dic)
return video_dics

#构建来源url的时间戳是13位对时间戳进行构造
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124005236691164789575_1625490993380&videoId=525648071&_=1625490993386
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery1124040335487863006936_1625490632031&videoId=508624469&_=162549063203
# https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_1625484034808&videoId=515515867&_=1625484034830
defcreate_url(video_dics):
mov_dics = []
for video_dic in video_dics:
   video_url = video_dic['video_url']
   video_name = video_dic['video_name']
   video_id = video_url.split('/')[-1].split('.')
   url_time = int(time.time() * 1000)# 把时间戳变13位取整
   mov_url = f'https://liveapi.huya.com/moment/getMomentContent?callback=jQuery112403179934484805591_{url_time}&videoId={video_id}&_={url_time}'
   mov_dic={
         'video_url':mov_url,
         'video_name': video_name
   }
   mov_dics.append(mov_dic)
return mov_dics

#从来源url响应中提取出视频的url
def get_movie_url(mov_dic):
url = mov_dic['video_url']
video_name = mov_dic['video_name']
try:
   resp = get_response(url)
   movie_url = re.findall(r',"url":"(?P<movie_url>.*?)"',resp.text)
   iflen(movie_url) > 0:
         movie_url = movie_url
         return movie_url,video_name
except IndexError:
   print('list index out of range')

async def Down_movie(url,name):
   path = url.split('/')[-1].split('?')
   path1 = path.replace(path.split('.'),name)
   async with aiohttp.ClientSession()as session:
         async with awaitsession.get(url,headers=headers)as resp:
            resp = awaitresp.read()
            print(name,'************正在下载**********')
            async with aiofiles.open(path1,'wb') as f:
                  await f.write(resp)
                  print(name,'下载完成')
                  await asyncio.sleep(2)

defmain():
resp = get_response(url)
video_dics = parse_videoid(resp)
mov_dics = create_url(video_dics)
tasks = []
for mov_dic inmov_dics:
   movie_url,video_name = get_movie_url(mov_dic)
   task = asyncio.ensure_future(Down_movie(movie_url,video_name))
   tasks.append(task)
loop.run_until_complete(asyncio.wait(tasks))
print('over')

if __name__ == '__main__':
loop = asyncio.get_event_loop()
main()

ofo 发表于 2021-7-5 23:12

这种不用API输出json格式的的网站，一般称为LJ网站，看的兴趣都没有:lol

股票亏损员 发表于 2021-7-5 23:29

没有啥东西，不敢兴趣

列明发表于 2021-7-5 23:43

視頻直接是用mp4存放傳輸的，
放到現在來說，
商用的話，
肯定是短視頻！
嗯，
短的我已經有很多了，
不感興趣了！

Spa495 发表于 2021-7-6 08:27

学习一下，没学懂

chenkeai深蓝 发表于 2021-7-6 09:18

可惜了看不懂

zhengxinjun 发表于 2021-7-6 10:47

换个网址试试{:301_997:}

key_user 发表于 2021-7-6 10:49

这种不用API输出json格式的的网站，一般称为LJ网站，看的兴趣都没有{:301_998:}

tricky6 发表于 2021-7-6 11:56

看起来又是个不用API输出json格式的网站诶不用开异步吧

gentlespider 发表于 2021-7-6 14:45

我怎么感觉调用这个函数，loop没传进去啊

页: [1] 2

吾爱破解 - 52pojie.cn's Archiver

爬小姐姐视频 为啥 异步开不起来 问题在哪里呢

爬小姐姐视频为啥异步开不起来问题在哪里呢