谢谢大佬,新手学习了,第一次知道还可以通过解密js算法来下载,昨晚使用selenium下载成功了,下面是代码,您留的m3u8视频下载,我研究一下,看看能不能下载。
[Python] 纯文本查看 复制代码
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import os
import re
import time
import base64
def fetch_blob_pic_data(driver, url, wait_seconds=5):
    """Load *url* and return its blob-backed post images as Base64 strings.

    The driver navigates to *url*, pauses for *wait_seconds*, and then looks
    up every ``<img>`` matching ``//div[@class="post-content"]/p/img``.  For
    each image whose ``src`` starts with ``blob:``, a script is run inside the
    page that requests the blob with ``XMLHttpRequest`` and hands the bytes
    back Base64-encoded.

    Args:
        driver: WebDriver-like object providing ``get``, ``find_elements``
            and ``execute_async_script``.
        url: Address of the page to open.
        wait_seconds: Pause after navigation, in seconds, before the images
            are looked up.  Defaults to 5, the value that used to be
            hard-coded.

    Returns:
        A list of Base64 strings, one per blob image that was read
        successfully, in document order.  Images without a ``src``, with a
        non-blob ``src``, or whose request failed are reported on stdout and
        left out.
    """
    # Runs in the browser: arguments[0] is the blob URL, arguments[1] is the
    # completion callback supplied by execute_async_script.  The callback
    # receives the Base64 text on success and null on a request error.
    fetch_script = """
        var uri = arguments[0];
        var callback = arguments[1];
        var toBase64 = function(buffer){for(var r,n=new Uint8Array(buffer),t=n.length,a=new Uint8Array(4*Math.ceil(t/3)),i=new Uint8Array(64),o=0,c=0;64>c;++c)i[c]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".charCodeAt(c);
        for(c=0;t-t%3>c;c+=3,o+=4)r=n[c]<<16|n[c+1]<<8|n[c+2],a[o]=i[r>>18],a[o+1]=i[r>>12&63],a[o+2]=i[r>>6&63],a[o+3]=i[63&r];return t%3===1?(r=n[t-1],a[o]=i[r>>2],a[o+1]=i[r<<4&63],a[o+2]=61,a[o+3]=61):t%3===2&&(r=(n[t-2]<<8)+n[t-1],a[o]=i[r>>10],a[o+1]=i[r>>4&63],a[o+2]=i[r<<2&63],a[o+3]=61),new TextDecoder("ascii").decode(a)};
        var xhr = new XMLHttpRequest();
        xhr.responseType = 'arraybuffer';
        xhr.onload = function(){ callback(toBase64(xhr.response)) };
        xhr.onerror = function(){ callback(null) };
        xhr.open('GET', uri);
        xhr.send();
        """

    driver.get(url)
    # Fixed pause so the page has time to finish loading before the lookup.
    time.sleep(wait_seconds)

    images = driver.find_elements(By.XPATH, '//div[@class="post-content"]/p/img')

    blob_image_data = []
    for img in images:
        src = img.get_attribute('src')
        if src is None:
            print(f"Element does not have a 'src' attribute: {img}")
            continue
        if not src.startswith('blob:'):
            print(f"Non-blob image source: {src}")
            continue

        print(f"Found blob URL: {src}")
        # A blob: URL only resolves inside the page that created it, so the
        # bytes have to be read by script running in that page.
        image_data = driver.execute_async_script(fetch_script, src)
        if image_data is None:
            print(f"Failed to fetch image data from blob URL: {src}")
            continue

        # Only the first 10 characters are echoed to keep the log readable.
        print(f"Received image data: {image_data[:10]}...")
        blob_image_data.append(image_data)
    return blob_image_data
def get_page_title(html_content):
    """Return the text of the document's ``<title>``, or ``"Untitled"``.

    Args:
        html_content: HTML markup of the page, parsed with BeautifulSoup's
            ``html.parser`` backend.

    Returns:
        The title text with surrounding whitespace removed.  ``"Untitled"``
        is returned when there is no ``<title>`` tag or when the tag holds
        nothing but whitespace, so callers never receive an empty name.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    title_tag = soup.find("title")
    if title_tag is None:
        return "Untitled"
    # An empty or whitespace-only <title> would otherwise yield a blank name.
    title = title_tag.get_text().strip()
    return title or "Untitled"
def sanitize_filename(filename):
    """Return *filename* with characters unsafe in a path component removed.

    The characters ``< > : " / \\ | ? *`` and the ASCII control characters
    (0x00-0x1F, which includes newlines and tabs) are deleted.  Surrounding
    whitespace and trailing dots/spaces are then stripped, because Windows
    does not keep them at the end of a file or directory name.

    Args:
        filename: Raw name, for example a page title.

    Returns:
        The cleaned name.  It may be an empty string when nothing usable is
        left; the caller decides on a fallback.
    """
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]+', '', filename)
    return cleaned.strip().rstrip('. ')
def save_pic(blob_image_data, save_dir, title):
    """Decode Base64 images and write them to ``save_dir/<sanitized title>``.

    Files are named ``image_<index>`` plus an extension chosen from the
    leading bytes of the decoded data (PNG, JPEG, GIF or WebP).  Data with an
    unrecognised signature keeps the ``.png`` extension.

    Args:
        blob_image_data: Iterable of Base64-encoded image payloads.
        save_dir: Base output directory; created if missing.
        title: Page title used (after sanitising) as the sub-directory name.
            ``"Untitled"`` is used when the sanitised title is empty.

    Returns:
        None.  Each saved path is printed; entries that are not valid Base64
        are reported and skipped instead of aborting the whole batch.
    """
    # (signature prefix, extension) pairs checked against the decoded bytes.
    signatures = (
        (b'\x89PNG\r\n\x1a\n', '.png'),
        (b'\xff\xd8\xff', '.jpg'),
        (b'GIF87a', '.gif'),
        (b'GIF89a', '.gif'),
    )

    # An empty name would make the images land directly in save_dir.
    safe_title = sanitize_filename(title) or "Untitled"
    subdir = os.path.join(save_dir, safe_title)
    # makedirs creates save_dir as well when it does not exist yet.
    os.makedirs(subdir, exist_ok=True)

    for i, image_data in enumerate(blob_image_data):
        try:
            image_binary = base64.b64decode(image_data)
        except (ValueError, TypeError) as exc:
            # binascii.Error is a ValueError; None/odd types raise TypeError.
            print(f"Skipping image {i}: invalid Base64 data ({exc})")
            continue

        extension = '.png'
        if image_binary[:4] == b'RIFF' and image_binary[8:12] == b'WEBP':
            extension = '.webp'
        else:
            for magic, candidate in signatures:
                if image_binary.startswith(magic):
                    extension = candidate
                    break

        file_path = os.path.join(subdir, f"image_{i}{extension}")
        with open(file_path, 'wb') as f:
            f.write(image_binary)
        print(f"图片已保存: {file_path}")
if __name__ == '__main__':
    # Script entry point: ask for a page URL, download its blob images with
    # Chrome, and store them under a folder named after the page title.
    url = input('请输入url').strip()

    # Replace with the path of the local Chrome browser executable.
    chrome_path = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
    # Replace with the path of the local ChromeDriver executable.
    chromedriver_path = r'C:\Program Files\Google\Chrome\Application\chromedriver.exe'

    chrome_options = Options()
    chrome_options.binary_location = chrome_path
    # Ignore certificate errors.
    chrome_options.add_argument('--ignore-certificate-errors')
    # Useful when testing against a local HTTPS server.
    chrome_options.add_argument('--allow-insecure-localhost')

    service = Service(executable_path=chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    save_dir = r'F:\Temp\picture'
    try:
        blob_image_data = fetch_blob_pic_data(driver, url)
        # Read the title from the page as it is after the images were fetched.
        html_content = driver.page_source
        title = get_page_title(html_content)
        save_pic(blob_image_data, save_dir, title)
    finally:
        # Always close the browser, even when a step above raised.
        driver.quit()