Newbie asking for help! I can't fetch a site's blob image resources with Python. How should I go about getting them?
```python
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import os
from urllib.parse import urlparse
import re

def get_response(url, timeout=10):
    headers = {'User-Agent': UserAgent().random}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.encoding = 'utf-8'
        return response.text
    except requests.exceptions.Timeout:
        print(f"Request timed out: {url}")
        return None

def fetch_pic_urls(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find all img tags
    img_tags = soup.find_all('img')
    # Collect the src attribute of every img tag
    pic_urls = [img.get('src') for img in img_tags if img.get('src')]
    return pic_urls

def get_page_title(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    title_tag = soup.find("title")
    return title_tag.get_text() if title_tag else "Untitled"

def sanitize_filename(filename):
    # Strip characters that are not allowed in file names
    return re.sub(r'[<>:"/\\|?*]+', '', filename)

def save_pic(pic_urls, save_dir, title):
    # Strip special characters from the title
    safe_title = sanitize_filename(title)
    # Make sure the base directory exists
    os.makedirs(save_dir, exist_ok=True)
    # Create a subdirectory named after the page title
    subdir = os.path.join(save_dir, safe_title)
    os.makedirs(subdir, exist_ok=True)
    # Download and save the images
    for url in pic_urls:
        try:
            response = requests.get(url, stream=True)
            if response.status_code == 200:
                # Take the file name from the URL
                filename = os.path.basename(urlparse(url).path)
                # Strip special characters from the file name
                safe_filename = sanitize_filename(filename)
                # Save with a .png extension
                file_path = os.path.join(subdir, f"{safe_filename}.png")
                with open(file_path, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                print(f"Image saved: {file_path}")
            else:
                print(f"Image download failed, status code: {response.status_code}")
        except Exception as e:
            print(f"Error while downloading the image: {e}")

if __name__ == '__main__':
    url = 'https://xxx.com/29001'
    save_dir = r'F:\Temp\picture'
    html_content = get_response(url)
    if html_content:
        pic_urls = fetch_pic_urls(html_content)
        title = get_page_title(html_content)
        save_pic(pic_urls, save_dir, title)
```

What's the URL? Without seeing the site, how is anyone supposed to know what's going on?
In short: the images are encrypted, and the real resource sits behind that data-src attribute.
A quick analysis shows the relevant code lives in `usr/plugins/XPic/assets/XPic.js`.
After easily undoing the ob obfuscation and handing the code to ~~ChatGPT~~ a domestic AI to tidy up the algorithm, you get:
```js
async function loadImages() {
    var images = document.querySelectorAll("img");
    var imagePromises = Array.from(images).map(img => {
        // The real (encrypted) image lives at the data-src URL
        return fetch(img.getAttribute("data-src"))
            .then(response => response.arrayBuffer())
            // The first 276 bytes are a header; only the rest is ciphertext
            .then(buffer => decryptArrayBufferData(buffer.slice(276)))
            .then(decryptedData => {
                var blob = new Blob([decryptedData], { type: "image/webp" });
                img.src = URL.createObjectURL(blob);
            })
            .catch(error => {
                console.error("Image loading or decryption failed:", error);
            });
    });
    Promise.all(imagePromises).then(() => {});
}

function decryptArrayBufferData(buffer) {
    var wordArray = CryptoJS.lib.WordArray.create(new Uint8Array(buffer));
    var key = CryptoJS.enc.Utf8.parse("K65xztwG4B3FKcJyHOz/QAWiUE+Nh6k2");
    var iv = CryptoJS.enc.Utf8.parse("rT/+upBDYhpGn05Q");
    // AES-CTR with a hard-coded key and IV, no padding
    var decrypted = CryptoJS.AES.decrypt({ ciphertext: wordArray }, key, {
        iv: iv,
        mode: CryptoJS.mode.CTR,
        padding: CryptoJS.pad.NoPadding
    });
    return decryptedToUint8Array(decrypted);
}

function decryptedToUint8Array(decrypted) {
    // Convert a CryptoJS WordArray into a plain Uint8Array
    var words = decrypted.words;
    var sigBytes = decrypted.sigBytes;
    var uint8Array = new Uint8Array(sigBytes);
    for (var i = 0; i < sigBytes; i++) {
        var byte = (words[i >>> 2] >>> (24 - (i % 4) * 8)) & 255;
        uint8Array[i] = byte;
    }
    return uint8Array;
}

loadImages();
```
Finally, have the AI translate it into Python:
```python
import requests
from Crypto.Cipher import AES
from Crypto.Util import Counter
import io
from PIL import Image

def decrypt_array_buffer_data(buffer):
    key = b'K65xztwG4B3FKcJyHOz/QAWiUE+Nh6k2'
    iv = b'rT/+upBDYhpGn05Q'
    # AES-CTR, with the 16-byte IV used as the initial counter value
    counter = Counter.new(128, initial_value=int.from_bytes(iv, byteorder='big'))
    cipher = AES.new(key, AES.MODE_CTR, counter=counter)
    decrypted_data = cipher.decrypt(buffer)
    return decrypted_data

def download_encrypted_image(data_src, output_path):
    response = requests.get(data_src)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch the image: {response.status_code}")
    # Skip the 276-byte header, matching buffer.slice(276) in the JS above
    encrypted_data = response.content[276:]
    decrypted_data = decrypt_array_buffer_data(encrypted_data)
    image_stream = io.BytesIO(decrypted_data)
    try:
        image = Image.open(image_stream)
        image.save(output_path)
        print(f"Image saved to {output_path}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
download_encrypted_image("https://xxx.xx/xx.xx", "./a.jpg")
```
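
(A hedged usage sketch, not from the original reply: `download_encrypted_image` above handles a single URL, so something like the hypothetical helper below would pull every data-src off a page with the same requests/BeautifulSoup approach as the opening post and decrypt each image. The output directory and file naming are illustrative assumptions.)

```python
import os
import requests
from bs4 import BeautifulSoup

def download_all_encrypted_images(page_url, out_dir='decrypted'):
    # Fetch the page and collect every img tag's data-src (not src) attribute
    html = requests.get(page_url, timeout=10).text
    soup = BeautifulSoup(html, 'html.parser')
    data_srcs = [img['data-src'] for img in soup.find_all('img') if img.has_attr('data-src')]
    os.makedirs(out_dir, exist_ok=True)
    for i, data_src in enumerate(data_srcs):
        # download_encrypted_image() is the function defined in the block above
        download_encrypted_image(data_src, os.path.join(out_dir, f"image_{i}.png"))
```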
> Here's a bit of homework:
>
> Given that an m3u8 playlist describes a video split into .ts segment files,
>
> use search engines, AI, and so on to get the video downloaded and transcoded.
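
(A possible first step for the homework, not from the original post: a minimal sketch of reading the playlist, assuming the third-party `m3u8` package is installed. `read_playlist` is a hypothetical helper, the playlist URL is up to you, and the key handling only covers the common AES-128 case.)

```python
import m3u8
import requests

def read_playlist(playlist_url):
    # Load the .m3u8 playlist and list the absolute URIs of its .ts segments
    playlist = m3u8.load(playlist_url)
    segments = [{'uri': seg.absolute_uri} for seg in playlist.segments]
    # For AES-128 encrypted streams the playlist also advertises the key URI and IV
    key = iv = None
    if playlist.keys and playlist.keys[0]:
        key = requests.get(playlist.keys[0].absolute_uri).content
        iv_hex = playlist.keys[0].iv  # e.g. "0x0000...0000"; may be absent
        iv = bytes.fromhex(iv_hex[2:]) if iv_hex else None
    return segments, key, iv
```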
One last remark: when learning this kind of technique, it is still best to progress step by step.
That said, the enthusiasm is commendable.
Also, take care of your health.

A blob: URL is usually a link that JS creates after it has requested the data, so you have to analyze the JS.

(Quoting 涛之雨, posted 2024-8-7 22:46)
What? Take care of my health? I'll have to study up on that.

(Quoting 涛之雨, posted 2024-8-7 22:46)
Some images don't have a blob URL at all, just a canvas element with an id. How do you usually track down where that kind of image gets generated?

(Quoting 涛之雨, posted 2024-8-7 22:46)
Boss, I followed your steps from the beginning; all I can say is that AI really is useful.

Straight away I handed the code over to AI.

(Quoting 涛之雨, posted 2024-8-7 22:46)
Thanks, boss. As a newbie I learned a lot; this is the first time I realized you can download images by reversing the JS decryption routine. Last night I managed to download them with Selenium, and the code is below. As for the m3u8 video-download homework you left, I'll look into it and see whether I can get it working.
```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import os
import re
import time
import base64

def fetch_blob_pic_data(driver, url):
    driver.get(url)
    # Wait for the page to finish loading; this may take a moment
    # driver.implicitly_wait(10)
    time.sleep(5)  # wait 5 seconds to make sure the page has loaded
    page_source = driver.page_source
    # Locate all img tags inside the post content
    images = driver.find_elements(By.XPATH, '//div[@class="post-content"]/p/img')
    # print(images)
    # Walk through every image element
    blob_image_data = []
    for img in images:
        src = img.get_attribute('src')
        # Check whether the element has a src at all
        if src is None:
            print(f"Element does not have a 'src' attribute: {img}")
            continue
        # Check whether src is a blob URL
        if src.startswith('blob:'):
            print(f"Found blob URL: {src}")
            # For a blob URL, fetch its binary data inside the page context
            image_data = driver.execute_async_script(
                """
                var uri = arguments[0];
                var callback = arguments[arguments.length - 1];
                // Convert an ArrayBuffer into a base64 string
                var toBase64 = function (buffer) {
                    var chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
                    var n = new Uint8Array(buffer), t = n.length;
                    var a = new Uint8Array(4 * Math.ceil(t / 3));
                    var i = new Uint8Array(64);
                    for (var c = 0; c < 64; ++c) i[c] = chars.charCodeAt(c);
                    var r, o = 0;
                    for (c = 0; c < t - t % 3; c += 3, o += 4) {
                        r = (n[c] << 16) | (n[c + 1] << 8) | n[c + 2];
                        a[o] = i[r >> 18]; a[o + 1] = i[(r >> 12) & 63];
                        a[o + 2] = i[(r >> 6) & 63]; a[o + 3] = i[r & 63];
                    }
                    if (t % 3 === 1) {
                        r = n[t - 1];
                        a[o] = i[r >> 2]; a[o + 1] = i[(r << 4) & 63];
                        a[o + 2] = 61; a[o + 3] = 61;  // '=' padding
                    } else if (t % 3 === 2) {
                        r = (n[t - 2] << 8) + n[t - 1];
                        a[o] = i[r >> 10]; a[o + 1] = i[(r >> 4) & 63];
                        a[o + 2] = i[(r << 2) & 63]; a[o + 3] = 61;
                    }
                    return new TextDecoder("ascii").decode(a);
                };
                var xhr = new XMLHttpRequest();
                xhr.responseType = 'arraybuffer';
                xhr.onload = function(){ callback(toBase64(xhr.response)) };
                xhr.onerror = function(){ callback(null) };
                xhr.open('GET', uri);
                xhr.send();
                """, src)
            if image_data is not None:
                print(f"Received image data: {image_data[:10]}...")  # print the first 10 characters
                blob_image_data.append(image_data)
            else:
                print(f"Failed to fetch image data from blob URL: {src}")
        else:
            print(f"Non-blob image source: {src}")
    return blob_image_data

def get_page_title(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    title_tag = soup.find("title")
    return title_tag.get_text() if title_tag else "Untitled"

def sanitize_filename(filename):
    # Strip characters that are not allowed in file names
    return re.sub(r'[<>:"/\\|?*]+', '', filename)

def save_pic(blob_image_data, save_dir, title):
    # Strip special characters from the title
    safe_title = sanitize_filename(title)
    # Make sure the base directory exists
    os.makedirs(save_dir, exist_ok=True)
    # Create a subdirectory named after the page title
    subdir = os.path.join(save_dir, safe_title)
    os.makedirs(subdir, exist_ok=True)
    # Decode the base64 data and save each image to a file
    for i, image_data in enumerate(blob_image_data):
        # Decode the base64 string back into binary image data
        image_binary = base64.b64decode(image_data)
        # Build the file name, using a .png extension
        filename = f"image_{i}.png"
        file_path = os.path.join(subdir, filename)
        with open(file_path, 'wb') as f:
            f.write(image_binary)
        print(f"Image saved: {file_path}")

if __name__ == '__main__':
    url = input('Please enter the URL: ')
    chrome_path = r'C:\Program Files\Google\Chrome\Application\chrome.exe'  # replace with your Chrome path
    # Point to the ChromeDriver executable manually
    chromedriver_path = r'C:\Program Files\Google\Chrome\Application\chromedriver.exe'  # replace with your ChromeDriver path
    # Build the Chrome options
    chrome_options = Options()
    chrome_options.binary_location = chrome_path  # set the Chrome binary location
    chrome_options.add_argument('--ignore-certificate-errors')  # ignore certificate errors
    # Useful when testing against a local HTTPS server
    chrome_options.add_argument('--allow-insecure-localhost')
    # To disable SSL verification entirely (not recommended in production):
    # chrome_options.add_argument('--disable-ssl-verification')
    # Create the ChromeDriver service
    service = Service(executable_path=chromedriver_path)
    # Create the WebDriver instance
    driver = webdriver.Chrome(service=service, options=chrome_options)
    save_dir = r'F:\Temp\picture'
    blob_image_data = fetch_blob_pic_data(driver, url)
    # Grab the page title
    html_content = driver.page_source
    title = get_page_title(html_content)
    # Save the images
    save_pic(blob_image_data, save_dir, title)
    # Remember to quit the WebDriver to release its resources
    driver.quit()
```

The ts download code:

```python
import os
import requests
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad

def decrypt_ts_file(file_path, key, iv):
    with open(file_path, 'rb') as file:
        # Read the whole file into encrypted_data
        encrypted_data = file.read()
    # Create an AES cipher object from key and iv, in CBC (Cipher Block Chaining) mode
    cipher = AES.new(key, AES.MODE_CBC, iv=iv)
    # Decrypt with cipher.decrypt(), then strip the padding with unpad();
    # AES.block_size is the AES block size
    decrypted_data = unpad(cipher.decrypt(encrypted_data), AES.block_size)
    with open(file_path, 'wb') as file:
        file.write(decrypted_data)

def download_ts_files(segments, key, iv, download_dir):
    for segment in segments:
        uri = segment['uri']
        file_name = uri.split('/')[-1]
        file_path = os.path.join(download_dir, file_name)
        print(file_name, file_path)
        response = requests.get(uri)
        with open(file_path, 'wb') as file:
            file.write(response.content)
        if key and iv:
            decrypt_ts_file(file_path, key, iv)
```
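
(Not part of the original posts: once `download_ts_files` has fetched and decrypted all the segments, one common way to finish the homework is to concatenate the .ts files in playlist order and let ffmpeg handle the container. A minimal sketch, assuming ffmpeg is on the PATH and that `segments` and `download_dir` are the same objects used above; `merge_and_transcode` is a hypothetical helper name.)

```python
import os
import subprocess

def merge_and_transcode(segments, download_dir, output_path='output.mp4'):
    # Concatenate the decrypted .ts segments in playlist order
    merged_ts = os.path.join(download_dir, 'merged.ts')
    with open(merged_ts, 'wb') as merged:
        for segment in segments:
            file_name = segment['uri'].split('/')[-1]
            with open(os.path.join(download_dir, file_name), 'rb') as part:
                merged.write(part.read())
    # Remux the joined stream into an .mp4 container with ffmpeg
    subprocess.run(['ffmpeg', '-y', '-i', merged_ts, '-c', 'copy', output_path], check=True)
    print(f"Video saved to {output_path}")
```

Dropping `-c copy` (or naming explicit codecs) makes ffmpeg actually transcode instead of just remuxing.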