新手求助！使用python无法获取某网站blob图片资源，请问应该如何获取呢？

miracle1989 发表于 2024-8-7 20:42

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import os
from urllib.parse import urlparse
import re

def get_response(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8.

    A random User-Agent string from ``fake_useragent`` is sent with the
    request.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait before the request is abandoned.

    Returns:
        The response text, or ``None`` when the request times out or fails
        for any other ``requests`` reason (connection error, bad URL, ...).
    """
    headers = {'User-Agent': UserAgent().random}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.Timeout:
        print(f"请求超时: {url}")
        return None
    except requests.exceptions.RequestException as e:
        # Connection failures used to propagate and kill the script; the
        # caller already treats None as "nothing to do".
        print(f"请求失败: {url} ({e})")
        return None
    # Force UTF-8 instead of relying on the encoding guessed from headers.
    response.encoding = 'utf-8'
    return response.text

def fetch_pic_urls(html_content):
    """Return the image addresses found in the ``<img>`` tags of a page.

    Args:
        html_content: HTML source of the page as a string.

    Returns:
        A list with the ``src`` value of every ``<img>`` tag that has a
        non-empty ``src`` attribute, in document order.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find every <img> tag.
    img_tags = soup.find_all('img')
    # NOTE(review): the right-hand side of this assignment was missing in the
    # posted code. It is reconstructed here as the tags' ``src`` values;
    # confirm against the author's original script.
    pic_urls = [img['src'] for img in img_tags if img.get('src')]
    return pic_urls

def get_page_title(html_content):
    """Extract the text of the page's ``<title>`` element.

    Args:
        html_content: HTML source of the page as a string.

    Returns:
        The title text with surrounding whitespace removed, or
        ``"Untitled"`` when the page has no ``<title>`` or it is blank.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    title_tag = soup.find("title")
    # The title is later used as a directory name, so surrounding whitespace
    # is dropped and a blank title falls back to the default.
    title = title_tag.get_text().strip() if title_tag else ""
    return title or "Untitled"

def sanitize_filename(filename):
    """Remove the characters ``< > : " / \\ | ? *`` from *filename*.

    Args:
        filename: Candidate file or directory name.

    Returns:
        *filename* with every occurrence of those characters deleted.
    """
    # Strip special characters from the file name.
    return re.sub(r'[<>:"/\\|?*]+', '', filename)

def save_pic(pic_urls, save_dir, title, timeout=10):
    """Download every URL in *pic_urls* into ``save_dir/<title>/``.

    Each file is named after the last path component of its URL, with the
    original extension replaced by ``.png``. Failures are reported on stdout
    and do not stop the remaining downloads.

    Args:
        pic_urls: Iterable of image URLs.
        save_dir: Base directory; created if it does not exist.
        title: Page title; sanitized and used as the subdirectory name.
        timeout: Seconds to wait for each download request.
    """
    # Strip special characters from the title.
    safe_title = sanitize_filename(title)

    # Make sure the base directory exists.
    os.makedirs(save_dir, exist_ok=True)

    # Create the subdirectory named after the page title.
    subdir = os.path.join(save_dir, safe_title)
    os.makedirs(subdir, exist_ok=True)

    # Download and store each image.
    for index, url in enumerate(pic_urls):
        try:
            # The context manager releases the streamed connection.
            with requests.get(url, stream=True, timeout=timeout) as response:
                if response.status_code != 200:
                    print(f"图片下载失败，状态码：{response.status_code}")
                    continue
                # Take the file name from the URL path and sanitize it.
                filename = os.path.basename(urlparse(url).path)
                # Drop the old extension so ".png" replaces it instead of
                # being appended ("a.jpg" -> "a.png", not "a.jpg.png").
                stem = os.path.splitext(sanitize_filename(filename))[0]
                if not stem:
                    # URL path ended in "/" or sanitized to nothing.
                    stem = f"image_{index}"
                file_path = os.path.join(subdir, f"{stem}.png")
                with open(file_path, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
            print(f"图片已保存: {file_path}")
        except Exception as e:
            # Best effort: report the failure and move on to the next URL.
            print(f"下载图片时发生错误: {e}")
if __name__ == '__main__':
    # Page to scrape and the base directory that receives the images.
    url = 'https://xxx.com/29001'
    save_dir = r'F:\Temp\picture'
    html_content = get_response(url)

    # get_response returns None on failure; nothing to do in that case.
    if html_content:
        pic_urls = fetch_pic_urls(html_content)
        title = get_page_title(html_content)
        save_pic(pic_urls, save_dir, title)

star0angel 发表于 2024-8-7 21:53

网址多少啊不看网站咋知道啥情况

涛之雨 发表于 2024-8-7 22:46

本帖最后由涛之雨于 2024-8-7 22:49 编辑

简而言之：图片被加密了，真正的图片数据在 `img` 标签的 `data-src` 所指向的资源里。
简单分析后可知，解密代码在 `usr/plugins/XPic/assets/XPic.js` 里，
轻松解开 ob 混淆的代码后，丢给~~ChatGPT~~国产AI，优化一下算法，得到：

```js
/**
 * Fetch the encrypted payload referenced by each image's `data-src`,
 * decrypt it, and show the result through a blob: URL.
 *
 * A failure on one image is logged and does not affect the others.
 *
 * @returns {Promise<void>} Settles once every image has been processed.
 */
async function loadImages() {
  // Images without a data-src have nothing to fetch.
  const images = Array.from(document.querySelectorAll("img"))
    .filter(img => img.hasAttribute("data-src"));

  const imagePromises = images.map(img => {
    return fetch(img.getAttribute("data-src"))
      .then(response => response.arrayBuffer())
      // The first 276 bytes are skipped; only the remainder is decrypted.
      .then(buffer => decryptArrayBufferData(buffer.slice(276)))
      .then(decryptedData => {
        // Blob expects an array of parts, not a bare typed array.
        const blob = new Blob([decryptedData], { type: "image/webp" });
        img.src = URL.createObjectURL(blob);
      })
      .catch(error => {
        console.error("图片加载或解密失败:", error);
      });
  });

  await Promise.all(imagePromises);
}

/**
 * Decrypt an ArrayBuffer with CryptoJS AES in CTR mode and no padding,
 * using the fixed key and IV below.
 *
 * @param {ArrayBuffer} buffer Ciphertext bytes.
 * @returns {Uint8Array} Decrypted bytes.
 */
function decryptArrayBufferData(buffer) {
  const key = CryptoJS.enc.Utf8.parse("K65xztwG4B3FKcJyHOz/QAWiUE+Nh6k2");
  const iv = CryptoJS.enc.Utf8.parse("rT/+upBDYhpGn05Q");
  const ciphertext = CryptoJS.lib.WordArray.create(new Uint8Array(buffer));

  const options = {
    iv: iv,
    mode: CryptoJS.mode.CTR,
    padding: CryptoJS.pad.NoPadding
  };
  const plaintext = CryptoJS.AES.decrypt({ ciphertext: ciphertext }, key, options);

  return decryptedToUint8Array(plaintext);
}

/**
 * Convert a WordArray-like object into a Uint8Array.
 *
 * @param {{words: number[], sigBytes: number}} decrypted Object holding
 *   32-bit words and the count of significant bytes.
 * @returns {Uint8Array} The first `sigBytes` bytes, most significant byte of
 *   each word first.
 */
function decryptedToUint8Array(decrypted) {
  const words = decrypted.words;
  const sigBytes = decrypted.sigBytes;
  const uint8Array = new Uint8Array(sigBytes);

  for (let i = 0; i < sigBytes; i++) {
    // Byte i lives in word i >>> 2; within the word, byte 0 is the top 8 bits.
    uint8Array[i] = (words[i >>> 2] >>> (24 - (i % 4) * 8)) & 255;
  }

  return uint8Array;
}

// Start fetching and decrypting the page's images.
loadImages();
```

最后，让AI翻译成python即可

```python
import requests
from Crypto.Cipher import AES
from Crypto.Util import Counter
import io
from PIL import Image

def decrypt_array_buffer_data(buffer):
    """Decrypt *buffer* with AES in CTR mode using the fixed key and IV.

    Args:
        buffer: Ciphertext bytes.

    Returns:
        The decrypted bytes, the same length as *buffer*.
    """
    key = b'K65xztwG4B3FKcJyHOz/QAWiUE+Nh6k2'
    iv = b'rT/+upBDYhpGn05Q'

    # The 16-byte IV is used as the initial value of a 128-bit big-endian
    # counter.
    counter = Counter.new(128, initial_value=int.from_bytes(iv, byteorder='big'))
    cipher = AES.new(key, AES.MODE_CTR, counter=counter)
    decrypted_data = cipher.decrypt(buffer)

    return decrypted_data

def download_encrypted_image(data_src, output_path, header_size=276, timeout=10):
    """Download an encrypted image, decrypt it and save it to *output_path*.

    Args:
        data_src: URL of the encrypted image (the ``data-src`` value).
        output_path: Destination file path for the decoded image.
        header_size: Number of leading bytes to skip before decryption. The
            JavaScript this was translated from decrypts
            ``buffer.slice(276)``, so the default is 276.
        timeout: Seconds to wait for the HTTP request.

    Raises:
        Exception: If the server answers with a status other than 200.
    """
    response = requests.get(data_src, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch the image: {response.status_code}")
    # Skip the leading bytes; decrypting them too would shift the CTR
    # keystream and corrupt the whole image.
    encrypted_data = response.content[header_size:]

    decrypted_data = decrypt_array_buffer_data(encrypted_data)

    image_stream = io.BytesIO(decrypted_data)
    try:
        image = Image.open(image_stream)
        image.save(output_path)
        print(f"Image saved to {output_path}")
    except Exception as e:
        # Best effort: report decode/save problems instead of raising.
        print(f"An error occurred: {e}")

# Example usage: the URL is a placeholder for an image's data-src address;
# the decoded picture is written to ./a.jpg.
download_encrypted_image("https://xxx.xx/xx.xx", "./a.jpg")

```

> 留个课后作业，
>
> 已知m3u8是视频切片文件，
>
> 请通过搜索、使用AI等方式完成视频下载并转码

最后赘述一句，学习技术还是要循序渐进

当然，精神可嘉。

此外，注意身体

三滑稽甲苯 发表于 2024-8-8 06:55

blob 一般是 js 请求后转成的链接，所以得分析一下 js

superTian 发表于 2024-8-8 09:13

涛之雨发表于 2024-8-7 22:46
简而言之：加密了，真正的代码在那个data-src里
简单的分析，代码在`usr/plugins/XPic/assets/XPic.js ...

什么？注意身体？那得学习一下

wasm2023 发表于 2024-8-8 09:39

涛之雨发表于 2024-8-7 22:46
简而言之：加密了，真正的代码在那个data-src里
简单的分析，代码在`usr/plugins/XPic/assets/XPic.js ...

有些图片没有blob，就只有一个canvas id，请问这种一般怎么定位图片的生成位置呢

马了顶大 发表于 2024-8-8 10:35

涛之雨发表于 2024-8-7 22:46
简而言之：加密了，真正的代码在那个data-src里
简单的分析，代码在`usr/plugins/XPic/assets/XPic.js ...

大佬，按你的步骤从头试了下，只能说ai真好用{:301_997:}

7228189 发表于 2024-8-8 12:23

反手就是代码交给AI

miracle1989 发表于 2024-8-8 18:10

涛之雨发表于 2024-8-7 22:46
简而言之：加密了，真正的代码在那个data-src里
简单的分析，代码在`usr/plugins/XPic/assets/XPic.js ...

谢谢大佬，新手学习了，第一次知道还可以通过解密js算法来下载，昨晚使用selenium下载成功了，下面是代码，您留的m3u8视频下载，我研究一下，看看能不能下载。
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import os
import re
import time
import base64

def fetch_blob_pic_data(driver, url):
    """Open *url* in the browser and read the bytes behind blob: images.

    Only ``<img>`` elements matching
    ``//div[@class="post-content"]/p/img`` are inspected.

    Args:
        driver: Selenium WebDriver instance used to load the page.
        url: Address of the page to open.

    Returns:
        A list of Base64 strings, one per blob image that was read
        successfully, in page order.
    """
    # Runs inside the page: fetch the blob URL as an ArrayBuffer and hand it
    # back Base64-encoded through Selenium's async callback, which is always
    # the last entry in `arguments`.
    fetch_blob_script = """
        var uri = arguments[0];
        var callback = arguments[arguments.length - 1];
        var toBase64 = function (buffer) {
            var bytes = new Uint8Array(buffer);
            var binary = '';
            var chunk = 0x8000;
            // Convert in chunks so large images do not exceed the argument
            // limit of Function.prototype.apply.
            for (var i = 0; i < bytes.length; i += chunk) {
                binary += String.fromCharCode.apply(null, bytes.subarray(i, i + chunk));
            }
            return btoa(binary);
        };
        var xhr = new XMLHttpRequest();
        xhr.responseType = 'arraybuffer';
        xhr.onload = function(){ callback(toBase64(xhr.response)) };
        xhr.onerror = function(){ callback(null) };
        xhr.open('GET', uri);
        xhr.send();
        """

    driver.get(url)

    # Fixed 5-second wait so the page script has time to replace the image
    # sources with blob: URLs.
    time.sleep(5)

    # Locate the images inside the post content.
    images = driver.find_elements(By.XPATH, '//div[@class="post-content"]/p/img')

    blob_image_data = []
    for img in images:
        src = img.get_attribute('src')

        # Skip elements without a src attribute.
        if src is None:
            print(f"Element does not have a 'src' attribute: {img}")
            continue

        # Only blob: URLs need to be read through the browser.
        if not src.startswith('blob:'):
            print(f"Non-blob image source: {src}")
            continue

        print(f"Found blob URL: {src}")
        image_data = driver.execute_async_script(fetch_blob_script, src)
        if image_data is not None:
            # Print only the first 10 characters.
            print(f"Received image data: {image_data[:10]}...")
            blob_image_data.append(image_data)
        else:
            print(f"Failed to fetch image data from blob URL: {src}")

    return blob_image_data

def get_page_title(html_content):
    """Extract the text of the page's ``<title>`` element.

    Args:
        html_content: HTML source of the page as a string.

    Returns:
        The title text with surrounding whitespace removed, or
        ``"Untitled"`` when the page has no ``<title>`` or it is blank.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    title_tag = soup.find("title")
    # The title is later used as a directory name, so surrounding whitespace
    # is dropped and a blank title falls back to the default.
    title = title_tag.get_text().strip() if title_tag else ""
    return title or "Untitled"

def sanitize_filename(filename):
    """Remove the characters ``< > : " / \\ | ? *`` from *filename*.

    Args:
        filename: Candidate file or directory name.

    Returns:
        *filename* with every occurrence of those characters deleted.
    """
    # Strip special characters from the file name.
    return re.sub(r'[<>:"/\\|?*]+', '', filename)

def save_pic(blob_image_data, save_dir, title):
    """Decode Base64 image payloads and write them to ``save_dir/<title>/``.

    Files are named ``image_<index>.png`` in input order. A payload that is
    not valid Base64 is reported and skipped.

    Args:
        blob_image_data: Iterable of Base64-encoded image strings.
        save_dir: Base directory; created if it does not exist.
        title: Page title; sanitized and used as the subdirectory name.
    """
    # Strip special characters from the title.
    safe_title = sanitize_filename(title)

    # Make sure the base directory exists.
    os.makedirs(save_dir, exist_ok=True)

    # Create the subdirectory named after the page title.
    subdir = os.path.join(save_dir, safe_title)
    os.makedirs(subdir, exist_ok=True)

    for i, image_data in enumerate(blob_image_data):
        try:
            image_binary = base64.b64decode(image_data)
        except ValueError as e:
            # binascii.Error is a ValueError; keep going with the rest.
            print(f"图片解码失败 (image_{i}): {e}")
            continue
        # NOTE(review): the bytes are written unchanged; the ".png" name does
        # not convert the image format.
        file_path = os.path.join(subdir, f"image_{i}.png")
        with open(file_path, 'wb') as f:
            f.write(image_binary)
        print(f"图片已保存: {file_path}")

if __name__ == '__main__':
    url = input('请输入url')
    # Replace with the path to your Chrome browser.
    chrome_path = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
    # ChromeDriver path is given explicitly; replace with your own.
    chromedriver_path = r'C:\Program Files\Google\Chrome\Application\chromedriver.exe'
    save_dir = r'F:\Temp\picture'

    # Build the Chrome options.
    chrome_options = Options()
    chrome_options.binary_location = chrome_path
    # Ignore certificate errors.
    chrome_options.add_argument('--ignore-certificate-errors')
    # Useful when testing against a local HTTPS server.
    chrome_options.add_argument('--allow-insecure-localhost')

    # Create the ChromeDriver service and the WebDriver instance.
    service = Service(executable_path=chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        blob_image_data = fetch_blob_pic_data(driver, url)
        # Read the title from the page the browser currently shows.
        html_content = driver.page_source
        title = get_page_title(html_content)

        # Save the images.
        save_pic(blob_image_data, save_dir, title)
    finally:
        # Always close the browser, even when a step above raised.
        driver.quit()

miracle1989 发表于 2024-8-8 22:57

# TS download code.
def decrypt_ts_file(file_path, key, iv):
    """Decrypt an AES-CBC encrypted file in place.

    Args:
        file_path: Path of the encrypted file; it is overwritten with the
            decrypted content.
        key: AES key bytes.
        iv: Initialization vector bytes.
    """
    with open(file_path, 'rb') as file:
        # Read the entire file into memory.
        encrypted_data = file.read()
    # Create an AES cipher in CBC (Cipher Block Chaining) mode from key and iv.
    cipher = AES.new(key, AES.MODE_CBC, iv=iv)
    # Decrypt, then remove the padding; AES.block_size is the block size
    # passed to unpad.
    decrypted_data = unpad(cipher.decrypt(encrypted_data), AES.block_size)
    with open(file_path, 'wb') as file:
        file.write(decrypted_data)

def download_ts_files(segments, key, iv, download_dir, timeout=30):
    """Download every segment into *download_dir*, decrypting when possible.

    Args:
        segments: Iterable of mappings, each with a ``'uri'`` entry holding
            the segment URL.
        key: AES key bytes, or a falsy value to skip decryption.
        iv: Initialization vector bytes, or a falsy value to skip decryption.
        download_dir: Directory that receives the files; created if missing.
        timeout: Seconds to wait for each segment request.

    Raises:
        requests.exceptions.HTTPError: If a segment request returns an HTTP
            error status.
    """
    os.makedirs(download_dir, exist_ok=True)
    for segment in segments:
        uri = segment['uri']
        # The file name is the last path component of the URI.
        file_name = uri.split('/')[-1]
        file_path = os.path.join(download_dir, file_name)
        print(file_name, file_path)
        response = requests.get(uri, timeout=timeout)
        # Fail loudly instead of saving an error page as a video segment.
        response.raise_for_status()
        with open(file_path, 'wb') as file:
            file.write(response.content)
        # Decrypt only when both key and iv were supplied.
        if key and iv:
            decrypt_ts_file(file_path, key, iv)

页: [1]

吾爱破解 - 52pojie.cn's Archiver

新手求助！使用python无法获取某网站blob图片资源，请问应该如何获取呢？