[Scraper] Downloading videos from an AcFun (a站) target listing
(For learning purposes only; if this infringes on anything, please message me privately.)

import os
import pprint
import re
import json
import requests
import fake_useragent
from tqdm import tqdm  # shows a progress bar
from bs4 import BeautifulSoup
ua = fake_useragent.UserAgent().random  # pick a random User-Agent string
headers = {
    'User-Agent': ua
}
# Fetch the m3u8 playlist URL and the video title from a video page
def get_m3u8_list(url):
    r = requests.get(url, headers=headers)
    # The page embeds its metadata as JSON between these two markers; [0] takes the
    # first match, and [:-1] drops the trailing semicolon
    info = re.findall('window.pageInfo = window.videoInfo = (.*?)window.videoResource =',
                      r.text, re.S)[0].strip()[:-1]
    # ksPlayJsonHevc is itself a JSON string, hence the double json.loads;
    # adaptationSet and representation are lists, [0] picks the first stream
    info_json = json.loads(json.loads(info)["currentVideoInfo"]["ksPlayJsonHevc"])[
        'adaptationSet'][0]['representation'][0]['url']
    # pprint.pprint(info_json)
    name = json.loads(info)["title"]
    name = re.sub(r'[\\/:*?"<>|]', '', name)  # strip characters that are illegal in filenames
    return info_json, name
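For reference, here is a sketch of the embedded metadata that get_m3u8_list digs through. This is an assumed, simplified shape (the field names come from the code above; everything else is illustrative and may differ from AcFun's live pages). Note that ksPlayJsonHevc is a JSON string nested inside JSON, which is why json.loads is applied twice:

# Assumed, simplified page metadata (illustrative only):
# window.pageInfo = window.videoInfo = {
#     "title": "...",
#     "currentVideoInfo": {
#         "ksPlayJsonHevc": "{\"adaptationSet\": [{\"representation\": [{\"url\": \"https://....m3u8\"}]}]}"
#     }
# };
# window.videoResource = { ... }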
# Extract the playback addresses of all video segments (.ts files) from the playlist
def get_ts_files(url):
    r = requests.get(url, headers=headers)
    # Drop the '#'-prefixed directive lines; what remains are the segment filenames
    ts_files = re.sub('#.*', '', r.text).split()
    return ts_files
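A minimal, self-contained illustration of what get_ts_files does to the playlist text, using a made-up m3u8 body (real playlists carry more directives, but the idea is the same):

sample = "#EXTM3U\n#EXTINF:4.0,\nseg_000.ts\n#EXTINF:4.0,\nseg_001.ts\n#EXT-X-ENDLIST"
print(re.sub('#.*', '', sample).split())  # -> ['seg_000.ts', 'seg_001.ts']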
# Download the segments in order and concatenate them into one file
def download_combine(ts_files, name):
    path = os.getcwd()
    # 'wb' instead of 'ab' so a partial file from an earlier run is overwritten, not
    # appended to; the with-block closes the file, so no explicit close() is needed
    with open(f'{path}/{name}.mp4', 'wb') as f:
        for ts in tqdm(ts_files):
            url = 'https://tx-safety-video.acfun.cn/mediacloud/acfun/acfun_video/' + ts
            content = requests.get(url, headers=headers).content
            f.write(content)
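Segment requests occasionally time out on the CDN side. If that becomes a problem, the requests.get call inside download_combine can be swapped for a small retry wrapper like the sketch below (fetch_with_retry is a hypothetical helper, not part of the original script):

def fetch_with_retry(url, retries=3, timeout=10):
    # Hypothetical helper: try the request a few times, re-raise on the last failure
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()  # treat HTTP error codes as failures too
            return resp.content
        except requests.RequestException:
            if attempt == retries - 1:
                raise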
# Collect the individual video links from the listing (index) page
def get_index_links(index_url):
    r = requests.get(index_url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    links = soup.find_all('div', class_="list-content-item")
    links_list = []
    for link in links:
        url = "https://www.acfun.cn" + link.a.get('href')
        links_list.append(url)
    return links_list
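To see what the selector above expects, here is a self-contained demo on made-up listing HTML (the class name comes from the code above; the rest of the markup is an assumption):

sample_html = '<div class="list-content-item"><a href="/v/ac12345">demo</a></div>'
demo = BeautifulSoup(sample_html, 'html.parser')
print("https://www.acfun.cn" + demo.find('div', class_="list-content-item").a.get('href'))
# -> https://www.acfun.cn/v/ac12345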
def main(index_url):
    links = get_index_links(index_url)
    for url in links:
        m3u8_url, name = get_m3u8_list(url)
        ts_files = get_ts_files(m3u8_url)
        download_combine(ts_files, name)
if __name__ == '__main__':
    url = "https://www.acfun.cn/v/list135/index.htm?sortField=rankScore&duration=all&date=default&page=1"
    main(url)
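To run it, install the dependencies first; walking through more listing pages is just a matter of changing page=1 in the query string (the script filename below is up to you):

# pip install requests beautifulsoup4 tqdm fake-useragent
# python acfun_downloader.py
# e.g. page 2 of the same listing:
# main("https://www.acfun.cn/v/list135/index.htm?sortField=rankScore&duration=all&date=default&page=2")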
Tried it out: it downloads fine. Nice work!