新手求助！使用python无法获取某网站blob图片资源，请问应该如何获取呢？

miracle1989 · 发表于 2024-8-7 20:42

[Python] 纯文本查看 复制代码

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import os
from urllib.parse import urlparse
import re

def get_response(url, timeout=10):
    """Fetch ``url`` and return its body decoded as UTF-8 text.

    A randomly chosen User-Agent header is sent with the request.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text, or ``None`` when the request times out, fails at the
        network level, or the server answers with an HTTP error status.
    """
    headers = {'User-Agent': UserAgent().random}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.exceptions.Timeout:
        print(f"请求超时: {url}")
        return None
    except requests.exceptions.RequestException as e:
        # Connection errors, invalid URLs and HTTP errors previously escaped
        # as uncaught exceptions or were returned as page content.
        print(f"请求失败: {url} ({e})")
        return None
    response.encoding = 'utf-8'
    return response.text


def fetch_pic_urls(html_content):
    """Return the ``src`` value of every ``<img>`` tag found in the HTML.

    Tags whose ``src`` attribute is missing or empty are skipped.
    """
    document = BeautifulSoup(html_content, 'html.parser')
    pic_urls = []
    for tag in document.find_all('img'):
        source = tag.get('src')
        if source:
            pic_urls.append(source)
    return pic_urls


def get_page_title(html_content):
    """Return the text of the page's ``<title>`` tag, or "Untitled" if absent."""
    title_tag = BeautifulSoup(html_content, 'html.parser').find("title")
    if not title_tag:
        return "Untitled"
    return title_tag.get_text()


def sanitize_filename(filename):
    """Return ``filename`` with the characters ``< > : " / \\ | ? *`` removed."""
    forbidden = re.compile(r'[<>:"/\\|?*]+')
    return forbidden.sub('', filename)


def save_pic(pic_urls, save_dir, title, timeout=10):
    """Download each image URL into ``save_dir/<sanitized title>/`` as ``.png``.

    Args:
        pic_urls: Iterable of image URLs to download.
        save_dir: Root directory; created if it does not exist.
        title: Page title; after sanitizing it names the subdirectory.
        timeout: Seconds to wait for each image request before giving up.

    A failure on one URL is reported and the remaining URLs are still tried.
    """
    safe_title = sanitize_filename(title)

    # makedirs creates save_dir itself along with the per-page subdirectory.
    subdir = os.path.join(save_dir, safe_title)
    os.makedirs(subdir, exist_ok=True)

    for index, url in enumerate(pic_urls):
        # Broad catch on purpose: this is a per-item boundary, and one bad
        # URL must not abort the rest of the batch.
        try:
            # The context manager releases the streamed connection, and the
            # timeout keeps an unresponsive server from hanging the loop.
            with requests.get(url, stream=True, timeout=timeout) as response:
                if response.status_code != 200:
                    print(f"图片下载失败，状态码：{response.status_code}")
                    continue

                # Take the file name from the URL path and sanitize it.
                filename = os.path.basename(urlparse(url).path)
                safe_filename = sanitize_filename(filename)
                # Replace (not append to) the extension, and fall back to an
                # indexed name when the URL path has no usable basename.
                stem = os.path.splitext(safe_filename)[0] or f"image_{index}"
                file_path = os.path.join(subdir, f"{stem}.png")
                with open(file_path, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                print(f"图片已保存: {file_path}")
        except Exception as e:
            print(f"下载图片时发生错误: {e}")
if __name__ == '__main__':
    # Script entry point: fetch one page, then download every image it lists
    # into a folder named after the page title.
    save_dir = r'F:\Temp\picture'
    url = 'https://xxx.com/29001'

    html_content = get_response(url)
    # get_response returns None on timeout; skip all work in that case.
    if html_content:
        save_pic(
            fetch_pic_urls(html_content),
            save_dir,
            get_page_title(html_content),
        )

star0angel · 发表于 2024-8-7 21:53

网址是多少啊？不看网站怎么知道是什么情况。

涛之雨 · 发表于 2024-8-7 22:46

本帖最后由涛之雨于 2024-8-7 22:49 编辑

简而言之：加密了，真正的代码在那个data-src里
简单的分析，代码在usr/plugins/XPic/assets/XPic.js里，
轻松解密ob混淆的代码后，丢给~~ChatGPT~~国产AI，优化一下算法，得到：

/**
 * Download, decrypt and display every <img> that carries a data-src attribute.
 *
 * Each image is fetched from its data-src URL, the leading bytes are dropped,
 * the remainder is decrypted and the result is assigned to img.src as a
 * blob: URL of type image/webp.
 *
 * A failure on one image is logged and does not affect the others.
 *
 * @returns {Promise<void>} Resolves once every image has been processed.
 */
async function loadImages() {
  // Bytes at the start of each download that are discarded before decryption.
  var HEADER_LENGTH = 276;
  var images = document.querySelectorAll("img[data-src]");

  var imagePromises = Array.from(images).map(img => {
    return fetch(img.getAttribute("data-src"))
      .then(response => {
        // fetch() only rejects on network failure; surface HTTP errors too so
        // an error page is never fed to the decryptor.
        if (!response.ok) {
          throw new Error("HTTP " + response.status);
        }
        return response.arrayBuffer();
      })
      .then(buffer => decryptArrayBufferData(buffer.slice(HEADER_LENGTH)))
      .then(decryptedData => {
        var blob = new Blob([decryptedData], { type: "image/webp" });
        img.src = URL.createObjectURL(blob);
      })
      .catch(error => {
        console.error("图片加载或解密失败:", error);
      });
  });

  // Await the batch so the returned promise settles only after all images
  // are done; previously the function resolved immediately.
  await Promise.all(imagePromises);
}

/**
 * Decrypt an ArrayBuffer with AES in CTR mode (no padding) using the
 * hard-coded key and IV, and return the plaintext bytes.
 *
 * @param {ArrayBuffer} buffer Encrypted bytes.
 * @returns {Uint8Array} Decrypted bytes.
 */
function decryptArrayBufferData(buffer) {
  const ciphertext = CryptoJS.lib.WordArray.create(new Uint8Array(buffer));
  const key = CryptoJS.enc.Utf8.parse("K65xztwG4B3FKcJyHOz/QAWiUE+Nh6k2");
  const options = {
    iv: CryptoJS.enc.Utf8.parse("rT/+upBDYhpGn05Q"),
    mode: CryptoJS.mode.CTR,
    padding: CryptoJS.pad.NoPadding
  };

  // CryptoJS accepts a cipher-params-like object holding the raw ciphertext.
  const plaintextWords = CryptoJS.AES.decrypt({ ciphertext: ciphertext }, key, options);
  return decryptedToUint8Array(plaintextWords);
}

/**
 * Flatten a word array ({ words, sigBytes }) into a Uint8Array.
 *
 * Each 32-bit word contributes its bytes most-significant first; only the
 * first sigBytes bytes are emitted.
 *
 * @param {{words: number[], sigBytes: number}} decrypted Word array to convert.
 * @returns {Uint8Array} The first sigBytes bytes of the word array.
 */
function decryptedToUint8Array(decrypted) {
  const { words, sigBytes } = decrypted;

  return new Uint8Array(sigBytes).map((_, index) => {
    // Byte 0 of a word sits in bits 31..24, byte 3 in bits 7..0.
    const shift = 24 - (index % 4) * 8;
    return (words[index >>> 2] >>> shift) & 255;
  });
}

// Run the decryption pass over every img[data-src] currently in the document.
loadImages();

最后，让AI翻译成python即可

import requests
from Crypto.Cipher import AES
from Crypto.Util import Counter
import io
from PIL import Image

def decrypt_array_buffer_data(buffer):
    """Decrypt ``buffer`` with AES-CTR using the hard-coded key and IV.

    Args:
        buffer: Encrypted bytes.

    Returns:
        The decrypted bytes.
    """
    key = b'K65xztwG4B3FKcJyHOz/QAWiUE+Nh6k2'
    iv = b'rT/+upBDYhpGn05Q'

    # The IV, read as a big-endian integer, seeds a 128-bit counter.
    start_value = int.from_bytes(iv, byteorder='big')
    ctr = Counter.new(128, initial_value=start_value)
    return AES.new(key, AES.MODE_CTR, counter=ctr).decrypt(buffer)

def download_encrypted_image(data_src, output_path, timeout=10):
    """Download an encrypted image, decrypt it and save it to ``output_path``.

    Args:
        data_src: URL of the encrypted image.
        output_path: Destination file; Pillow picks the output format from
            its extension.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        Exception: If the server does not answer with HTTP 200.

    Failures while decoding or saving the image are printed, not raised.
    """
    # Without a timeout an unresponsive server would block forever.
    response = requests.get(data_src, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch the image: {response.status_code}")
    encrypted_data = response.content

    # The first 276 bytes are skipped; only the remainder is decrypted.
    decrypted_data = decrypt_array_buffer_data(encrypted_data[276:])

    image_stream = io.BytesIO(decrypted_data)
    try:
        image = Image.open(image_stream)
        # JPEG cannot store an alpha channel or a palette, so saving such an
        # image to a .jpg path would fail; convert it to plain RGB first.
        wants_jpeg = str(output_path).lower().endswith(('.jpg', '.jpeg'))
        if wants_jpeg and image.mode in ('RGBA', 'LA', 'P'):
            image = image.convert('RGB')
        image.save(output_path)
        print(f"Image saved to {output_path}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage: the URL is a placeholder. This call sits at module level, so
# it runs whenever the file is executed or imported.
download_encrypted_image("https://xxx.xx/xx.xx", "./a.jpg")

留个课后作业，

已知m3u8是视频切片文件，

请通过搜索、使用AI等方式完成视频下载并转码

最后赘述一句，学习技术还是要循序渐进

当然，精神可嘉。

此外，注意身体

三滑稽甲苯 · 发表于 2024-8-8 06:55

blob 一般是 js 请求后转成的链接，所以得分析一下 js

superTian · 发表于 2024-8-8 09:13

涛之雨发表于 2024-8-7 22:46
简而言之：加密了，真正的代码在那个data-src里
简单的分析，代码在`usr/plugins/XPic/assets/XPic.js`……

什么？注意身体？那得学习一下

wasm2023 · 发表于 2024-8-8 09:39

涛之雨发表于 2024-8-7 22:46
简而言之：加密了，真正的代码在那个data-src里
简单的分析，代码在`usr/plugins/XPic/assets/XPic.js`……

有些图片没有blob，就只有一个canvas id，请问这种一般怎么定位图片的生成位置呢

马了顶大 · 发表于 2024-8-8 10:35

涛之雨发表于 2024-8-7 22:46
简而言之：加密了，真正的代码在那个data-src里
简单的分析，代码在`usr/plugins/XPic/assets/XPic.js`……

大佬，按你的步骤从头试了下，只能说ai真好用

7228189 · 发表于 2024-8-8 12:23

反手就是代码交给AI

miracle1989 · 发表于 2024-8-8 18:10

涛之雨发表于 2024-8-7 22:46
简而言之：加密了，真正的代码在那个data-src里
简单的分析，代码在`usr/plugins/XPic/assets/XPic.js`……

谢谢大佬，新手学习了！第一次知道还可以通过解密 js 算法来下载。昨晚我用 selenium 下载成功了，下面是代码。您留的 m3u8 视频下载作业，我再研究一下，看看能不能下载下来。

[Python] 纯文本查看 复制代码

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import os
import re
import time
import base64

def fetch_blob_pic_data(driver, url):
    """Open ``url`` in the browser and read out every blob-backed image.

    Images are located with the XPath ``//div[@class="post-content"]/p/img``.
    For each one whose ``src`` starts with ``blob:``, a script run inside the
    page fetches the blob and hands its bytes back Base64-encoded.

    Args:
        driver: Selenium WebDriver instance used to load the page.
        url: Address of the page to open.

    Returns:
        A list of Base64 strings, one per blob image that was read
        successfully. Images without ``src``, with a non-blob ``src``, or
        whose fetch failed are reported and skipped.
    """
    driver.get(url)

    # Fixed 5-second pause to make sure the page has finished loading.
    time.sleep(5)

    # Locate the image elements inside the post content.
    images = driver.find_elements(By.XPATH, '//div[@class="post-content"]/p/img')

    blob_image_data = []
    for img in images:
        src = img.get_attribute('src')

        if src is None:
            print(f"Element does not have a 'src' attribute: {img}")
            continue

        if src.startswith('blob:'):
            print(f"Found blob URL: {src}")
            # A blob URL only resolves inside the page that created it, so the
            # download runs in the browser: XHR the blob as an ArrayBuffer,
            # Base64-encode it, and pass it to Selenium's async callback
            # (null on XHR error).
            image_data = driver.execute_async_script(
                """
                var uri = arguments[0];
                var callback = arguments[1];
                var toBase64 = function(buffer){for(var r,n=new Uint8Array(buffer),t=n.length,a=new Uint8Array(4*Math.ceil(t/3)),i=new Uint8Array(64),o=0,c=0;64>c;++c)i[c]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".charCodeAt(c);
                for(c=0;t-t%3>c;c+=3,o+=4)r=n[c]<<16|n[c+1]<<8|n[c+2],a[o]=i[r>>18],a[o+1]=i[r>>12&63],a[o+2]=i[r>>6&63],a[o+3]=i[63&r];return t%3===1?(r=n[t-1],a[o]=i[r>>2],a[o+1]=i[r<<4&63],a[o+2]=61,a[o+3]=61):t%3===2&&(r=(n[t-2]<<8)+n[t-1],a[o]=i[r>>10],a[o+1]=i[r>>4&63],a[o+2]=i[r<<2&63],a[o+3]=61),new TextDecoder("ascii").decode(a)};
                var xhr = new XMLHttpRequest();
                xhr.responseType = 'arraybuffer';
                xhr.onload = function(){ callback(toBase64(xhr.response)) };
                xhr.onerror = function(){ callback(null) };
                xhr.open('GET', uri);
                xhr.send();
                """, src)
            if image_data is not None:
                # Show only the first 10 characters of the payload.
                print(f"Received image data: {image_data[:10]}...")
                blob_image_data.append(image_data)
            else:
                print(f"Failed to fetch image data from blob URL: {src}")
        else:
            print(f"Non-blob image source: {src}")

    return blob_image_data


def get_page_title(html_content):
    """Return the text of the page's ``<title>`` tag, or "Untitled" if absent."""
    parsed = BeautifulSoup(html_content, 'html.parser')
    title_tag = parsed.find("title")
    if title_tag:
        return title_tag.get_text()
    return "Untitled"


def sanitize_filename(filename):
    """Return ``filename`` with the characters ``< > : " / \\ | ? *`` removed."""
    forbidden = re.compile(r'[<>:"/\\|?*]+')
    return forbidden.sub('', filename)


def save_pic(blob_image_data, save_dir, title):
    """Decode Base64 image payloads and write them under ``save_dir``.

    Files are written to ``save_dir/<sanitized title>/image_<i>.png``, where
    ``i`` is the payload's position in ``blob_image_data``.

    Args:
        blob_image_data: Sequence of Base64-encoded image payloads.
        save_dir: Root directory; created if it does not exist.
        title: Page title; after sanitizing it names the subdirectory.
    """
    safe_title = sanitize_filename(title)

    # Create the root directory first, then the per-page subdirectory.
    os.makedirs(save_dir, exist_ok=True)
    subdir = os.path.join(save_dir, safe_title)
    os.makedirs(subdir, exist_ok=True)

    for index, encoded in enumerate(blob_image_data):
        raw_bytes = base64.b64decode(encoded)
        file_path = os.path.join(subdir, f"image_{index}.png")
        with open(file_path, 'wb') as out_file:
            out_file.write(raw_bytes)
        print(f"图片已保存: {file_path}")


if __name__ == '__main__':
    # Script entry point: open the page in Chrome, pull out the blob images
    # and save them in a folder named after the page title.
    url = input('请输入url')
    # Replace with the path to your Chrome browser.
    chrome_path = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
    # ChromeDriver location is set by hand; replace with your own path.
    chromedriver_path = r'C:\Program Files\Google\Chrome\Application\chromedriver.exe'
    save_dir = r'F:\Temp\picture'

    chrome_options = Options()
    chrome_options.binary_location = chrome_path
    # Ignore certificate errors.
    chrome_options.add_argument('--ignore-certificate-errors')
    # Useful when testing against a local HTTPS server.
    chrome_options.add_argument('--allow-insecure-localhost')

    service = Service(executable_path=chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    # try/finally guarantees the browser is closed even when fetching or
    # saving raises; previously an exception left the browser running.
    try:
        blob_image_data = fetch_blob_pic_data(driver, url)
        # Read the page title from the rendered page source.
        html_content = driver.page_source
        title = get_page_title(html_content)

        save_pic(blob_image_data, save_dir, title)
    finally:
        driver.quit()

miracle1989 · 发表于 2024-8-8 22:57

ts下载代码：

[Python] 纯文本查看 复制代码

def decrypt_ts_file(file_path, key, iv):
    """Decrypt an AES-CBC encrypted file in place.

    The whole file is read, decrypted with ``key`` and ``iv``, stripped of
    its padding with ``unpad`` and written back to the same path.

    Args:
        file_path: Path of the encrypted file; it is overwritten.
        key: AES key.
        iv: Initialization vector for CBC mode.
    """
    with open(file_path, 'rb') as source:
        ciphertext = source.read()

    decryptor = AES.new(key, AES.MODE_CBC, iv=iv)
    padded_plaintext = decryptor.decrypt(ciphertext)
    # Remove the padding, using the AES block size as the pad width.
    plaintext = unpad(padded_plaintext, AES.block_size)

    with open(file_path, 'wb') as target:
        target.write(plaintext)

def download_ts_files(segments, key, iv, download_dir, timeout=10):
    """Download every segment into ``download_dir``, decrypting when keyed.

    Args:
        segments: Iterable of mappings, each with a ``'uri'`` entry holding
            the segment URL.
        key: AES key, or a falsy value when the segments are not encrypted.
        iv: Initialization vector, or a falsy value when not encrypted.
        download_dir: Directory for the downloaded files; created if missing.
        timeout: Seconds to wait for each segment request before giving up.

    Raises:
        requests.exceptions.RequestException: If a request fails, times out
            or returns an HTTP error status.
    """
    os.makedirs(download_dir, exist_ok=True)

    for segment in segments:
        uri = segment['uri']
        # The last path component of the URI is used as the local file name.
        file_name = uri.split('/')[-1]
        file_path = os.path.join(download_dir, file_name)
        print(file_name,file_path)

        response = requests.get(uri, timeout=timeout)
        # Fail loudly on 4xx/5xx so an error page is never written to disk
        # and then passed to the decryptor as if it were segment data.
        response.raise_for_status()
        with open(file_path, 'wb') as file:
            file.write(response.content)

        # Decrypt in place only when both key and IV were supplied.
        if key and iv:
            decrypt_ts_file(file_path, key, iv)

帐号		自动登录	找回密码
密码			注册[Register]

[求助] 新手求助！使用python无法获取某网站blob图片资源，请问应该如何获取呢？

免费评分

免费评分