某红书图片无水印单个/批量下载源码

邪帝发表于 2023-3-16 16:04

起因：刚刚逛论坛发现发了一个小红书无水印的下载，于是我就上GitHub上逛了逛
过程：看到一个接口，运行发现可能出现报错，于是进行了以下代码优化（原地址：GitHub）
原先为数组下载，更改为单个获取以及读取文本批量下载 PS：需要COOKIE(小红书网页版获取ck)
软件就不打包了，直接上代码。虚拟机 Python 环境测试正常，测试结果如下：
1.拷贝代码
2.安装支持库
3.运行结果

单个：
批量：

代码如下：

import requests
from bs4 import BeautifulSoup
import os
import re
import json

def mkdir(path):
    """Create the folder ``path`` (with any missing parents) if it is absent.

    A status message is printed in both cases.

    Args:
        path: Folder path to create.
    """
    if os.path.exists(path):
        print("--- ⚠️ 文件夹已存在!---")
        return

    print("---创建新的文件夹😀---")
    # exist_ok keeps this from failing if the folder appears between the
    # existence check above and this call.
    os.makedirs(path, exist_ok=True)
    print("---OK 🚩 ---")

def fetchUrl(url):
    """Request ``url`` and return the page source as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as decoded text (``Response.text``).

    Any exception raised by ``requests.get`` propagates to the caller.
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9 ',
        # Replace with your own cookie.
        'cookie': '',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36',
    }

    # requests has no default timeout; without one a stalled connection
    # blocks the program forever.
    r = requests.get(url, headers=headers, timeout=30)
    return r.text

def parsing_link(html):
    """Parse the page HTML and yield the watermark-free image URLs of a note.

    The note data is read from the inline script that assigns
    ``window.__INITIAL_STATE__``. When both an image list and a usable title
    are found, a folder named after the title is created under ``image/``
    next to this script before the first item is yielded. If the page data
    cannot be located or decoded, a message is printed and nothing is
    yielded.

    Args:
        html: HTML source of the note page.

    Yields:
        ``(picUrl, traceId, title)`` tuples, one per entry of the note's
        ``imageList``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    script = soup.find('script', string=re.compile(r'window\.__INITIAL_STATE__'))
    if script is None or not script.string:
        print('未找到页面数据，请检查链接或 cookie 是否有效！')
        return

    # Keep everything after the FIRST '=' only: the JSON payload itself may
    # contain '=' characters (e.g. inside URLs).
    payload = script.string.partition('=')[2].strip().rstrip(';')
    # `undefined` is a JavaScript value that JSON cannot represent.
    payload = payload.replace('undefined', 'null')

    try:
        result = json.loads(payload, strict=False)
    except json.JSONDecodeError as e:
        print(e)
        print('页面数据解析失败！')
        return

    # `or {}` also covers keys that are present but null.
    note = (result.get('note') or {}).get('note') or {}
    imageList = note.get('imageList')
    # Fall back to the description when the note has no title.
    title = note.get('title') or note.get('desc')
    title = sanitize_folder_name(title or '')
    if not (imageList and title):
        return

    print('标题：', title)
    print('开始下载啦！🚀')

    # Folder named after the title; change the base path here to save
    # elsewhere. abspath keeps the result correct when __file__ is relative.
    file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'image', title)
    mkdir(file)

    for i in imageList:
        picUrl = f"https://sns-img-qc.xhscdn.com/{i['traceId']}"
        yield picUrl, i['traceId'], title

def download(url, filename, folder):
    """Download one image into ``image/<folder>/`` next to this script.

    The file extension is taken from the response ``Content-Type``; any
    subtype other than jpeg/png/gif/bmp falls back to ``jpeg``. Errors are
    printed and not re-raised.

    Args:
        url: Address of the image.
        filename: File name without extension.
        folder: Name of the sub-folder under ``image/``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36',
    }
    try:
        r = requests.get(url, headers=headers, timeout=30)
        # Do not store an HTTP error page as if it were an image.
        r.raise_for_status()

        # Content-Type may be missing or carry parameters ("image/png; ...").
        content_type = r.headers.get('Content-Type') or ''
        image_format = content_type.split(';')[0].strip().split('/')[-1]
        if image_format not in ['jpeg', 'png', 'gif', 'bmp']:
            image_format = 'jpeg'

        # Same base folder that parsing_link creates (next to this script),
        # so saving works regardless of the current working directory.
        target_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'image', folder
        )
        os.makedirs(target_dir, exist_ok=True)
        with open(os.path.join(target_dir, f'{filename}.{image_format}'), 'wb') as v:
            v.write(r.content)
    except Exception as e:
        print(e)
        print('图片下载错误！')

# Normalise a title so it can be used as a folder name.
def sanitize_folder_name(name: str) -> str:
    """Return ``name`` made safe for use as a folder name.

    Surrounding whitespace is removed, the characters ``<>:"/\\|?*`` and
    ASCII control characters are replaced with ``_``, and the result is cut
    to at most 255 characters.

    Args:
        name: Raw title text; ``None`` or an empty string yields ``''``.

    Returns:
        The sanitised name, possibly empty.
    """
    if not name:
        return ''
    # Strip first so leading/trailing newlines vanish instead of becoming '_'.
    name = name.strip()
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name)
    if len(name) > 255:
        # Cutting can expose a space at the end; drop it again.
        name = name[:255].rstrip()
    return name

def roopLink(url):
    """Fetch the note page at ``url`` and download every image it yields.

    Images are saved under their running index (0, 1, 2, ...) in the folder
    named after the note title.

    Args:
        url: Address of the note page.
    """
    html = fetchUrl(url)
    # A separate loop name avoids overwriting the `url` parameter.
    for index, (pic_url, _, title) in enumerate(parsing_link(html)):
        print(f"download image {pic_url}")
        download(pic_url, index, title)

if __name__ == '__main__':
    # Option 1 reads links interactively until 'end' is entered;
    # option 2 reads one link per line from a text file.
    choice = input("请输入选项 (1: 循环获取链接; 2: 获取文本地址并下载): ")
    if choice == '1':
        while True:
            links = input("请输入小红书的链接 (输入 'end' 结束程序): ").strip()
            if links == 'end':
                break
            if not links:
                # An empty line is not a link; ask again.
                continue
            roopLink(links)
    elif choice == '2':
        file_path = input("请输入文本文件地址(回车默认是1.txt): ")
        if file_path == '':
            file_path = '1.txt'
        # utf-8-sig drops a byte-order mark that would otherwise be glued to
        # the first link; blank lines are skipped.
        with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as f:
            links = [line.strip() for line in f if line.strip()]
        for link in links:
            roopLink(link)

    print("下载完成啦!🎉")

RKCN 发表于 2023-10-2 00:33

- 记录 : 2023-10-01 测试这段代码，无法正常获取数据，简单排查并修改了一下[@邪帝](/home.php?mod=space&uid=418158) 大人的代码

```python
import requests
from bs4 import BeautifulSoup
import os
import re
import json
# 从 Web 版直接复制下来的 cookie 值
COOKIE = ''

def mkdir(path):
    """Create the folder ``path`` (with any missing parents) if it is absent.

    A status message is printed in both cases.

    Args:
        path: Folder path to create.
    """
    if os.path.exists(path):
        print("--- ⚠️ 文件夹已存在!---")
        return

    print("---创建新的文件夹😀---")
    # exist_ok keeps this from failing if the folder appears between the
    # existence check above and this call.
    os.makedirs(path, exist_ok=True)
    print("---OK 🚩 ---")

def fetchUrl(url):
    """Request ``url`` and return the page source as text.

    The module-level ``COOKIE`` value is sent as the ``cookie`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as decoded text (``Response.text``).

    Any exception raised by ``requests.get`` propagates to the caller.
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9 ',
        'cookie': COOKIE,  # replace with your own cookie
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36',
    }

    # requests has no default timeout; without one a stalled connection
    # blocks the program forever.
    r = requests.get(url, headers=headers, timeout=30)
    return r.text

def parsing_link(html):
    """Parse the page HTML and yield the watermark-free image URLs of a note.

    The note data is read from the inline script that assigns
    ``window.__INITIAL_STATE__``; the note is looked up in ``noteDetailMap``
    under the id stored in ``firstNoteId``. When both an image list and a
    usable title are found, the folder ``./images/<title>`` is created before
    the first item is yielded. If the page data cannot be located or decoded,
    a message is printed and nothing is yielded.

    Args:
        html: HTML source of the note page.

    Yields:
        ``(picUrl, traceId, title)`` tuples, one per image that has an
        ``infoList`` entry whose ``imageScene`` is ``'CRD_PRV_WEBP'``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    script = soup.find('script', string=re.compile(r'window\.__INITIAL_STATE__'))
    if script is None or not script.string:
        print('未找到页面数据，请检查链接或 cookie 是否有效！')
        return

    # Keep everything after the FIRST '=' only: the JSON payload itself may
    # contain '=' characters (e.g. inside URLs).
    payload = script.string.partition('=')[2].strip().rstrip(';')
    # `undefined` is a JavaScript value that JSON cannot represent.
    payload = payload.replace('undefined', 'null')

    try:
        result = json.loads(payload, strict=False)
    except json.JSONDecodeError as e:
        print(e)
        print('页面数据解析失败！')
        return

    # `or {}` also covers keys that are present but null.
    note_section = result.get('note') or {}
    note_id = note_section.get('firstNoteId', False)
    note_entry = (note_section.get('noteDetailMap') or {}).get(note_id) or {}
    note = note_entry.get('note') or {}

    imageList = note.get('imageList')
    title = note.get('title')
    if not title:
        # Fall back to the page <title> when the note has none.
        title_tag = soup.select_one('title')
        title = title_tag.text if title_tag is not None else ''
    title = sanitize_folder_name(title or '')
    if not (imageList and title):
        return

    print('标题：', title)
    print('开始下载啦！🚀')

    # Folder named after the title; change this path to save elsewhere.
    file = './images/'+title
    mkdir(file)

    for i in imageList:
        # Reset per image so a missing entry can never reuse the previous
        # image's URL.
        picUrl = None
        for x in i.get('infoList') or []:
            if x.get('imageScene') == 'CRD_PRV_WEBP':
                picUrl = x.get('url')
        if not picUrl:
            print('未找到可下载的图片地址，已跳过！')
            continue
        yield picUrl, i.get('traceId'), title

def download(url, filename, folder, auto_convert=False):
    """Download one image into ``images/<folder>/``.

    Without conversion the raw response bytes are written and the extension
    is taken from the response ``Content-Type`` (jpeg/png/gif/bmp/webp,
    otherwise ``jpeg``). With ``auto_convert='jpeg'`` the image is decoded
    with Pillow and re-encoded as JPEG; transparent images are flattened
    onto a white background first.

    Args:
        url: Address of the image.
        filename: File name without extension.
        folder: Name of the sub-folder under ``images/``.
        auto_convert: Target format name (only ``'jpeg'`` is supported), or
            ``False`` to keep the bytes as served.

    Returns:
        ``True`` when the file was written, ``False`` when any error
        occurred (the error is printed, not raised).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36',
    }
    # Accepted auto_convert values mapped to Pillow format names.
    target_format = {'jpeg': 'JPEG'}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        # Do not store an HTTP error page as if it were an image.
        r.raise_for_status()

        # Content-Type may be missing or carry parameters ("image/png; ...").
        content_type = r.headers.get('Content-Type') or ''
        image_format = content_type.split(';')[0].strip().split('/')[-1]
        if image_format not in ['jpeg', 'png', 'gif', 'bmp', 'webp']:
            image_format = 'jpeg'

        os.makedirs(f'images/{folder}', exist_ok=True)
        target_filepath = f'images/{folder}/{filename}'

        if auto_convert in target_format:
            from io import BytesIO
            from PIL import Image
            im = Image.open(BytesIO(r.content))
            if im.mode == "RGBA":
                im.load()  # required before split()
                background = Image.new('RGB', im.size, (255, 255, 255))
                # Band 3 of an RGBA image is the alpha channel.
                background.paste(im, mask=im.split()[3])
                # Save the flattened copy; JPEG cannot store alpha.
                im = background
            elif im.mode not in ('RGB', 'L', 'CMYK'):
                # Other modes (e.g. palette images) cannot be written as
                # JPEG directly.
                im = im.convert('RGB')
            target_filepath = target_filepath + '.' + auto_convert
            im.save(target_filepath, target_format[auto_convert])
            return True

        target_filepath = target_filepath + '.' + image_format
        with open(target_filepath, 'wb') as v:
            v.write(r.content)
        return True
    except Exception as e:
        print(e)
        print('图片下载错误！')
        return False

# Normalise a title so it can be used as a folder name.
def sanitize_folder_name(name: str) -> str:
    """Return ``name`` made safe for use as a folder name.

    Surrounding whitespace is removed, the characters ``<>:"/\\|?*`` and
    ASCII control characters are replaced with ``_``, and the result is cut
    to at most 255 characters.

    Args:
        name: Raw title text; ``None`` or an empty string yields ``''``.

    Returns:
        The sanitised name, possibly empty.
    """
    if not name:
        return ''
    # Strip first so leading/trailing newlines vanish instead of becoming '_'.
    name = name.strip()
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name)
    if len(name) > 255:
        # Cutting can expose a space at the end; drop it again.
        name = name[:255].rstrip()
    return name

def roopLink(url):
    """Fetch the note page at ``url`` and download every image it yields.

    Images are saved under their running index (0, 1, 2, ...) in the folder
    named after the note title and are re-encoded as JPEG.

    Args:
        url: Address of the note page.
    """
    html = fetchUrl(url)
    # A separate loop name avoids overwriting the `url` parameter.
    for index, (pic_url, _, title) in enumerate(parsing_link(html)):
        print(f"download image {pic_url}")
        # Drop auto_convert to keep the image in the format it was served in.
        download(pic_url, index, title, auto_convert='jpeg')

if __name__ == '__main__':
    # Option 1 reads links interactively until 'end' is entered;
    # option 2 reads one link per line from a text file.
    choice = input("请输入选项 (1: 循环获取链接; 2: 获取文本地址并下载): ")
    if choice == '1':
        while True:
            links = input("请输入小红书的链接 (输入 'end' 结束程序): ").strip()
            print('link', links)
            if links == 'end':
                break
            if not links:
                # An empty line is not a link; ask again.
                continue
            roopLink(links)
    elif choice == '2':
        file_path = input("请输入文本文件地址(回车默认是1.txt): ")
        if file_path == '':
            file_path = '1.txt'
        # utf-8-sig drops a byte-order mark that would otherwise be glued to
        # the first link; blank lines are skipped.
        with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as f:
            links = [line.strip() for line in f if line.strip()]
        for link in links:
            roopLink(link)

    print("下载完成啦!🎉")
```

现在能愉快的下载了

lbxx520 发表于 2023-3-16 16:09

厉害，学习了！

llh0101 发表于 2023-3-16 16:11

厉害👍

Zl1994668 发表于 2023-3-16 16:26

谢谢，学到了

17781110010 发表于 2023-3-16 16:26

谢谢楼主分享

qaq223333 发表于 2023-3-16 16:28

多谢
楼主分享学习学习

seventeen17 发表于 2023-3-16 16:30

感谢，学习一下

davidlikecookie 发表于 2023-3-16 16:32

刚好需要去水印，感谢！

myhzqa 发表于 2023-3-16 16:33

感谢大佬分享，学习了{:1_893:}

血情发表于 2023-3-16 16:50

有exe文件吗

页: [1] 2 3 4 5

吾爱破解 - 52pojie.cn's Archiver

某红书图片无水印单个/批量下载源码