原创爬取电影X堂电视剧
本帖最后由 Shidt 于 2023-12-27 17:15 编辑不是最近三X队比较火,还出了电视剧,写了一个能够爬取电影X堂上视频的脚本,分为两部分
第一部分是 get_m3u8.py 用来拿到 m3u8 文件
第二部分是 download_video.py 用来下载所有的分片视频,处理 m3u8 和 enckey文件 最后合并ts视频分片。
第一个文件
# coding:utf8
# author:shidt
import os
import random
import json
import time
import requests
from lxml import etree
def get_user_agent():
    """Return the pool of browser User-Agent strings to pick from.

    Duplicates are removed (the original list repeated two entries, which
    skewed ``random.choice`` towards them) while the original order is kept.

    :return: list of unique User-Agent strings.
    """
    MY_USER_AGENT = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    ]
    # dict.fromkeys() de-duplicates while preserving insertion order
    return list(dict.fromkeys(MY_USER_AGENT))
def get_proxy():
    """Return the pool of free HTTP proxy URLs to pick from.

    These are public proxies collected by hand; swap in paid ones for
    reliability.

    :return: list of ``http://host:port`` proxy URLs.
    """
    return [
        'http://182.140.244.163:8118',
        'http://113.124.86.180:9999',
        'http://117.64.237.42:9999',
        'http://182.34.102.48:9999',
        'http://183.236.123.242:8060',
        'http://27.192.203.80:9000',
        'http://114.231.8.242:8888',
        'http://36.134.91.82:8888',
        'http://222.132.57.105:9000',
        'http://61.216.156.222:60808',
        'http://182.34.20.110:9999',
        'http://60.205.132.71:80',
    ]
# One User-Agent and one proxy are picked at random when the module is
# imported; every request in both scripts reuses this pair.
headers = {'user-agent': random.choice(get_user_agent())}
proxy = {'http': random.choice(get_proxy())}
# First grab the source of the series index page.
def get_index_source(url):
    """Fetch the series index page and save its HTML to index.html.

    :param url: URL of the series page to fetch.
    :return: name of the local file the HTML was written to.
    """
    sess = requests.session()
    # Bug fix: the passed-in url used to be overwritten by a hard-coded one,
    # making the parameter useless. Also add a timeout so a dead proxy
    # cannot hang the script forever.
    response = sess.get(url, headers=headers, proxies=proxy, timeout=30)
    response.encoding = 'utf-8'
    file_name = 'index.html'
    with open(file_name, mode='w', encoding='utf-8') as f:
        f.write(response.text)
    return file_name
def get_urls(source_file):
    """Parse the saved index page and return the per-episode page URLs.

    The playlist on the page lists episodes in reverse order, so the result
    is reversed to put episode 1 first (same order the original produced
    via repeated ``insert(0, ...)``).

    :param source_file: path of the HTML file saved by get_index_source().
    :return: list of absolute episode-page URLs, in episode order.
    """
    # Context manager so the file handle is closed (the original leaked it).
    with open(source_file, 'r', encoding='utf-8') as f:
        tree = etree.HTML(f.read())
    # Every episode link inside the #playlist container.
    hrefs = tree.xpath('//*[@id="playlist"]/div/div/div/div/ul/li/a/@href')
    # reversed() + comprehension: same order as insert(0, ...) but O(n).
    return ['https://www.sxcse.com' + href for href in reversed(hrefs)]
def download_m3u8_file(url_list, m3u8_path='m3u8_files', enckey_path='enckey_files'):
    """Download every episode's m3u8 playlist and AES enc.key file.

    Each episode page embeds a player config in an inline script
    ("var player_aaaa = {...};"); its 'url' field carries the playback id,
    which is used to build the playlist and key URLs on the CDN.

    :param url_list: episode-page URLs, in episode order.
    :param m3u8_path: folder that receives the 第N集.m3u8 files.
    :param enckey_path: folder that receives the 第N集.key files.
    """
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race
    os.makedirs(m3u8_path, exist_ok=True)
    os.makedirs(enckey_path, exist_ok=True)
    # one session for all requests (the original rebuilt it per episode)
    sess = requests.session()
    for episode, url in enumerate(url_list, start=1):
        print(f'正在下载第{episode}集的m3u8文件和enckey文件...')
        response = sess.get(url, headers=headers, proxies=proxy, timeout=30)
        response.encoding = 'UTF-8'
        tree = etree.HTML(response.text)
        data = tree.xpath('/html/body/div/div/div/div/div/div/script/text()')
        # NOTE(review): the forum paste dropped the [..] subscripts on the two
        # lines below; this reconstruction (take the JSON text after " = ",
        # strip backslash escapes, drop the trailing ";") matches the stated
        # intent — confirm against the live page before relying on it.
        string = str(data[0]).split(' = ')[1].replace('\\', '').split(';')[0]
        data_dict = json.loads(string)
        tmp_url = data_dict.get('url')
        # playback id: last component after the URL-encoded '/' (%2F)
        key = tmp_url.split('%2F')[-1]
        m3u8_url = 'https://1080p.jszyplay.com/play/' + key + '/index.m3u8'
        enckey_url = 'https://1080p.jszyplay.com/play/' + key + '/enc.key'
        # save the playlist
        m3u8_file_path = os.path.join(m3u8_path, f'第{episode}集.m3u8')
        m3u8_resp = sess.get(m3u8_url, headers=headers, timeout=30)
        with open(m3u8_file_path, 'wb') as f:
            f.write(m3u8_resp.content)
        print(f'第{episode}集的m3u8文件下载完成!')
        # save the AES key
        enckey_file_path = os.path.join(enckey_path, f'第{episode}集.key')
        enckey_resp = sess.get(enckey_url, headers=headers, timeout=30)
        with open(enckey_file_path, 'wb') as f:
            f.write(enckey_resp.content)
        print(f'第{episode}集的enckey文件下载完成!')
        # random pause between episodes to avoid tripping rate limits
        time.sleep(random.randint(5, 10))
if __name__ == '__main__':
    # Fetch the index page, extract every episode URL, then pull each
    # episode's m3u8 playlist and enc.key.
    index_url = 'https://www.sxcse.com/mov/sdd2023.html'
    index_file = get_index_source(index_url)
    episode_urls = get_urls(index_file)
    download_m3u8_file(episode_urls)
第二个文件
# coding:utf8
# author:shidt
# stdlib
import os
import random
import re
import time
from concurrent.futures import ThreadPoolExecutor, wait  # bug fix: the paste
# had "importThreadPoolExecutor" (missing space), a SyntaxError

# third-party
import requests

# shared request headers and proxy from the first script
from get_m3u8 import headers
from get_m3u8 import proxy
def download_one_video(url, i, path):
    """Download one ts segment and save it as <path>/<i>.ts.

    :param url: direct URL of the ts segment.
    :param i: zero-based segment index, used as the local file name.
    :param path: episode folder to save into.
    """
    print(url, i, '开始下载')
    # timeout so one stuck request cannot hang its worker thread forever;
    # raise_for_status so an HTML error page is never saved as a "segment"
    resp = requests.get(url, headers=headers, timeout=60)
    resp.raise_for_status()
    with open(os.path.join(path, f'{i}.ts'), 'wb') as f:
        f.write(resp.content)
    print(url, i, '下载完成')
def download_one_episode(m3u8_file, target_path):
    """Download all ts segments of one episode, 50 at a time.

    :param m3u8_file: path of the episode's .m3u8 playlist.
    :param target_path: folder the numbered .ts files are written into.
    """
    if not os.path.exists(target_path):
        # makedirs so a missing parent (the series folder) is created too
        os.makedirs(target_path)
    # read the playlist lines
    with open(m3u8_file, mode='r', encoding='utf-8') as f:
        lines = f.readlines()
    # Context manager so the pool's threads are joined and released
    # (the original never shut the executor down).
    with ThreadPoolExecutor(50) as pool:
        tasks = []
        index = 0
        for line in lines:
            # '#' lines are m3u8 directives, not segment URLs
            if line.startswith('#'):
                continue
            # strip() removes the trailing newline from the URL
            tasks.append(pool.submit(download_one_video, line.strip(), index, target_path))
            index += 1
        # block until every segment has finished downloading
        wait(tasks)
def download_all_episodes(m3u8_files_path, name):
    """Download the ts segments of every episode listed in *m3u8_files_path*.

    :param m3u8_files_path: folder containing one .m3u8 playlist per episode.
    :param name: series root folder; each episode goes to <name>/<episode>.
    """
    if not os.path.exists(m3u8_files_path):
        print('m3u8文件路径不存在...')
        exit()
    # sorted() gives a deterministic episode order (listdir order is arbitrary)
    for file in sorted(os.listdir(m3u8_files_path)):
        # "第1集.m3u8" -> "第1集"; the [0] subscript was lost in the forum
        # paste — without it the folder was named after the whole split list
        episode_name = file.split('.')[0]
        # os.path.join instead of manual '/' concatenation
        target_path = os.path.join(name, episode_name)
        print(f'正在下载{episode_name}')
        download_one_episode(os.path.join(m3u8_files_path, file), target_path)
        print(f'已完成下载{episode_name}')
        # be polite to the server between episodes
        time.sleep(random.randint(3, 6))
def do_m3u8_url(name, m3u8_path='m3u8_files', enckey_path='enckey_files'):
    """Rewrite each episode's m3u8 for local playback and copy its AES key.

    For every playlist in *m3u8_path*: point the EXT-X-KEY URI at the local
    key copy and replace each remote ts URL with the local segment path
    <name>/<episode>/<i>.ts, writing the result into the episode's folder.
    For every key in *enckey_path*: copy it into the episode's folder as
    key.m3u8. The episode folders must already exist (they are created by
    download_one_episode).

    :param name: series root folder (one sub-folder per episode).
    :param m3u8_path: folder holding the downloaded .m3u8 playlists.
    :param enckey_path: folder holding the downloaded .key files.
    """
    # --- playlists ---
    for file in os.listdir(m3u8_path):
        with open(os.path.join(m3u8_path, file), mode='r', encoding='utf8') as f:
            data = f.readlines()
        # "第1集.m3u8" -> "第1集"; the [0] subscript was lost in the forum
        # paste — without it os.path.join() raises on the list argument
        dirname = file.split('.')[0]
        target_m3u8_file = os.path.join(name, dirname, file)
        with open(target_m3u8_file, 'w', encoding='utf8') as fw:
            i = 0
            for line in data:
                if line.startswith('#'):
                    if line.startswith('#EXT-X-KEY'):
                        # point the decryption key at the local copy
                        line = line.replace('URI="enc.key"', 'URI="key.m3u8"')
                    fw.write(line)
                else:
                    # replace the remote ts URL with the local segment path;
                    # segments were saved as 0.ts, 1.ts, ... in playlist order
                    fw.write(f'{name}/{dirname}/{i}.ts\n')
                    i += 1
    # --- keys ---
    for file in os.listdir(enckey_path):
        dirname = file.split('.')[0]
        # enc.key holds raw AES key bytes — copy in binary mode (the original
        # opened it as utf-8 text, which can corrupt or crash on binary keys)
        with open(os.path.join(enckey_path, file), mode='rb') as f:
            data = f.read()
        target_key_file = os.path.join(name, dirname, 'key.m3u8')
        with open(target_key_file, 'wb') as fw:
            fw.write(data)
def merge(filePath):
    """Merge each episode's ts segments into an mp4 with ffmpeg.

    Runs ``ffmpeg -i <episode>.m3u8 -c copy <episode>.mp4`` for every
    episode folder under *filePath*; letting ffmpeg read the playlist keeps
    audio and video in sync.

    :param filePath: series root folder containing one sub-folder per episode.
    """
    # Bug fix: the original os.chdir()'d into each episode folder and then
    # resolved the NEXT episode's path relative to the new cwd, which broke
    # from the second iteration on. Absolute paths need no chdir at all.
    root = os.path.abspath(filePath)
    for episode in os.listdir(root):
        src = os.path.join(root, episode, f'{episode}.m3u8')
        dst = os.path.join(root, episode, f'{episode}.mp4')
        # quote the paths so spaces / non-ASCII folder names survive the shell
        os.system(f'ffmpeg -i "{src}" -c copy "{dst}"')
if __name__ == '__main__':
    name = 'F:\三大队'
    m3u8_files_path = 'm3u8_files'
    # Create the series root folder on first run.
    if not os.path.exists(name):
        os.mkdir(name)
    # 1) fetch every ts segment, 2) rewrite the playlists for local
    # playback, 3) stitch each episode into an mp4.
    download_all_episodes(m3u8_files_path, name)
    do_m3u8_url(name)
    merge(name)
这里还需要一个合并的软件ffmpeg 如果有需要可以在评论区进行评论,我会发出来 本帖最后由 Shidt 于 2023-12-27 20:32 编辑
baliao 发表于 2023-12-27 19:57
感谢分享运行了这2个文件,m3u8和key也没生成,那个proxy 需要处理吗?
你可以分段注释代码。在第一个文件中的124行取消注释,并进行打印输出,查看下是否他们的服务器返回来的html源代码解析出来是加密的,如果是加密的,表示对你的ip做了反爬,此时你需要使用proxy代理来进行访问请求
另外我上面的proxy的列表中都是我自己找的免费的,你也可以花钱找点靠谱的,或者你也去找一些免费的给他增加一些。 这个代码我昨天刚运行过,是可以执行的~ 主页访问不了,<!DOCTYPE html><html lang="en-US"><head><title>Just a moment...</title><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta http-equiv="X-UA-Compatible" content="IE=Edge"><meta name="robots" content="noindex,nofollow"><meta name="viewport" content="width=device-width,initial-scale=1"><link href="/cdn-cgi/styles/challenges.css" rel="stylesheet"><meta http-equiv="refresh" content="375"></head><body class="no-js"><div class="main-wrapper" role="main"><div class="main-content"><noscript><div id="challenge-error-title"><div class="h2"><span class="icon-wrapper"><div class="heading-icon warning-icon"></div></span><span id="challenge-error-text">Enable JavaScript and cookies to continue</span></div></div></noscript></div></div> 好用,有用。 给力 支持下 很有用的代码,好好研究下 厉害了,参考下,多谢 厉害,支持 感谢分享运行了这2个文件,m3u8和key也没生成,那个proxy 需要处理吗? 厉害啊,学习一下