吾爱破解 - 52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 16482|回复: 116
收起左侧

[Python 转载] 小红书高清“无码”图片抓取

     关闭 [复制链接]
shanhu5235 发表于 2021-12-28 20:48
本帖最后由 苏紫方璇 于 2021-12-29 11:26 编辑

小红书抓无水印的图写好有段时间了,前几天小红书的数据又小改动了下,逛论坛的时候看到有童鞋发了
我也分享下我的(半路出家的大叔自学的,代码不规范勿喷哈)
主要思路是:先把短链接转换成正常链接,再请求数据获取 cookies。cookies 有一周左右的时效性,所以用 selenium 打开网页(需要先下载 ChromeDriver)提取新的 cookie,其他地方正常获取即可。不过我发现小红书有 webp 格式的图片,用 jpg 格式保存下来文件打不开,目前有办法能转换,但是很繁琐,等我弄好再更新。下面贴代码:
[Python] 纯文本查看 复制代码
#from tkinter import Tk
import tkinter as tk
import time,requests,re,json,os
import urllib.request
from selenium import webdriver
from PIL import Image
# One shared HTTP session, reused by every request made in this script.
session =requests.session()
# Start-up timestamp; not referenced anywhere else in the visible code.
tm = time.time()
# Create the main window
win = tk.Tk()
# Set the window title
win.title("小红书抓图     作者:小发哥")

# Intended window size
width = 1100
height = 800
# Query the screen resolution
screen_width = win.winfo_screenwidth()
screen_height = win.winfo_screenheight()
# Geometry string that would centre a width x height window on the screen;
# it is only used by the commented-out call below.
position = f"{width}x{height}+{(screen_width-width)/2:.0f}+{(screen_height-height)/2:.0f}"
#win.geometry(position)
# The window is actually given this fixed size.
win.geometry('785x500')
# Widgets are created from here on, before the event loop is entered

# Common page headers (nothing is defined under this heading)


# Label shown in front of the link entry
lb = tk.Label(win, text='小红书链接:')

# Single-line entry for the link
entry = tk.Entry(win,width=80)# width sets the entry width in characters (default 20)
# Multi-line text box; the download functions insert progress messages into it
t = tk.Text(win, width=105,height=30)


def _nth_match(pattern, text, what, index=0):
    """Return capture number ``index`` of ``pattern`` found in ``text``.

    Raises IndexError (the type a bare ``findall(...)[i]`` lookup raises)
    with a message that names the missing field.
    """
    found = re.findall(pattern, text)
    try:
        return found[index]
    except IndexError:
        raise IndexError(f'{what} not found in response') from None


def _safe_filename(name):
    """Remove characters that Windows does not allow in file or directory names."""
    return re.sub(r'[\\/:*?"<>|\r\n\t]', '', name)


def getid(url, cookies, save_root=r'E:\小红书'):
    """Resolve a short share link and download the note's video or images.

    The short link is requested without following redirects; the item id and
    ``appuid`` are scraped from that response and used to request the note
    page. Media URLs, the author's nickname and the description are then
    scraped from the note page with regular expressions. Progress messages
    are inserted into the module-level text widget ``t``.

    Args:
        url: Short share link (requested with ``Host: xhslink.com``).
        cookies: Value sent as the ``cookie`` header of the note-page request.
        save_root: Directory under which a ``"<nickname> <item id>"`` folder
            is created. Defaults to the path that used to be hard-coded.

    Returns:
        Tuple ``(item_id, appuid)`` scraped from the short-link response.

    Raises:
        IndexError: An expected field is missing from one of the responses.
    """
    short_headers = {
        'Host': 'xhslink.com',
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) '
                      'Mobile/15E148 MicroMessenger/8.0.2(0x18000234) NetType/WIFI Language/zh_CN',
        'Accept-Language': 'zh-cn',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }
    # Redirects are not followed: the body of the redirect response carries
    # the full link that the id and appuid are scraped from.
    # The timeout keeps a stalled connection from blocking the GUI forever.
    resp = session.get(url, headers=short_headers, allow_redirects=False, timeout=15)
    text = resp.content.decode('utf-8')
    note_id = _nth_match(
        '<a href="https://www.xiaohongshu.com/discovery/item/(.*?)share',
        text, 'item id').replace('?', '')
    print(note_id)
    appuid = _nth_match("appuid=(.*?)&", text, 'appuid')  # needed by the next GET
    print(appuid)
    url1 = 'https://www.xiaohongshu.com/discovery/item/' + note_id
    print(url1)
    headers1 = {
        'cookie': cookies,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) '
                      'Mobile/15E148 MicroMessenger/8.0.2(0x18000236) NetType/WIFI Language/zh_CN',
        'accept-language': 'zh-cn',
        'accept-encoding': 'gzip, deflate'
    }
    data1 = {
        'share_from_user_hidden': 'true',
        'xhsshare': 'CopyLink',
        'appuid': appuid,
        'apptime': '1640424647'
    }
    resp1 = session.get(url1, headers=headers1, params=data1, timeout=15)
    text1 = resp1.content.decode('utf-8')
    print(text1)

    # The last "url" field of the page is treated as the video candidate.
    videourl = _nth_match('"url":"(.*?)"', text1, 'media url', index=-1) \
        .encode('utf8').decode('unicode_escape')
    # The second "nickname" field is used as the author name.
    name = _safe_filename(_nth_match('"nickname":"(.*?)"', text1, 'nickname', index=1))
    traceId = re.findall('traceId":"(.*?)"}', text1)
    txt = _nth_match('description":"(.*?)",', text1, 'description')

    target_dir = os.path.join(save_root, name + ' ' + note_id)
    os.makedirs(target_dir, exist_ok=True)

    if 'v.xiaohongshu' in videourl:
        # Announce the download before it starts, not after it has finished.
        t.insert("insert", "开始下载视频...\n")
        urllib.request.urlretrieve(videourl, filename=os.path.join(target_dir, note_id + ".mp4"))
        kind = '视频'
    else:
        t.insert("insert", "开始下载图片...\n")
        # The page-wide marker decides the extension for every image, so it is
        # evaluated once instead of once per image.
        is_webp = text1.find('WebPage') > 0
        extension = ".webp" if is_webp else ".jpg"
        for trace in traceId:
            pic = 'http://sns-img-hw.xhscdn.com/' + trace
            urllib.request.urlretrieve(pic, filename=os.path.join(target_dir, str(trace) + extension))
            t.insert("insert", f"{pic}\n")
            if is_webp:
                time.sleep(2)
            else:
                print(pic)
        kind = '图片'

    # The description is appended to "<nickname>.txt" inside the note folder.
    with open(os.path.join(target_dir, name + '.txt'), 'a', encoding='utf-8') as f:
        f.write(txt)
    done = f'小主,{kind}已下载!在这个目录下:{target_dir}\n'
    print(done)
    t.insert("insert", done)

    return note_id, appuid

def getcook():
    """Collect fresh cookies with headless Chrome and cache them in cookies.txt.

    A fixed note page is opened in headless Chrome, the browser's cookies are
    read after a five-second wait, and selected values are assembled into a
    ``cookie`` header string. The string is written to ``cookies.txt`` in the
    current working directory and returned.

    Returns:
        The assembled cookie header string.

    Raises:
        IndexError: The browser returned fewer than five cookies.
    """
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    # ``options=`` is the supported keyword; ``chrome_options=`` is deprecated
    # and no longer accepted by newer Selenium releases.
    driver = webdriver.Chrome(options=option)
    try:
        driver.get(
            "https://www.xiaohongshu.com/discovery/item/60688b91000000002103cabc?xhsshare=CopyLink&appuid=5d655b6e000000000100656a&apptime=1619168976")
        time.sleep(5)
        cookies = driver.get_cookies()
    finally:
        # Always shut the browser down so no headless Chrome process is left behind.
        driver.quit()

    if len(cookies) < 5:
        raise IndexError(f'expected at least 5 cookies, got {len(cookies)}')

    # NOTE(review): the values are picked by position, which relies on the
    # order in which get_cookies() returns them -- confirm this is stable.
    sig_value = cookies[0]['value']
    sig_name = cookies[0]['name']
    timestamp2 = cookies[1]['value']
    xhsuid = cookies[2]['value']
    tracker_id = cookies[4]['value']
    cookies_1 = (
        f'timestamp2={timestamp2};{sig_name}={sig_value};xhsuid={xhsuid};'
        'extra_exp_ids=gif_clt1,ques_clt1; xhsTracker=url=noteDetail&xhsshare=CopyLink;'
        f'xhsTrackerId={tracker_id}'
    )
    with open('cookies.txt', 'w', encoding='utf-8') as f:
        f.write(cookies_1)
    return cookies_1

    # print(text)


# 分析网页图片




# 事件函数
def down_img():
    """Button callback: download the note whose link is in the entry box.

    The cached cookie string from ``cookies.txt`` is tried first. If the file
    cannot be read or the download fails for any reason, fresh cookies are
    obtained through ``getcook()`` and the download is attempted once more;
    a failure of that second attempt propagates to the caller.
    """
    url = entry.get().strip()
    if not url:
        # Nothing to download; avoid launching the browser for an empty link.
        t.insert("insert", "请先输入小红书链接\n")
        return
    try:
        # Read the cached cookies and close the file before the download starts.
        with open('cookies.txt', 'r', encoding='utf-8') as f:
            cookies = f.readline().strip()
        getid(url, cookies)
    except Exception:
        # Deliberately broad: a missing cache file and a failed download both
        # lead to the same recovery. Unlike a bare ``except``, this lets
        # KeyboardInterrupt and SystemExit through.
        cookies = getcook()
        getid(url, cookies)

# Create the download button; clicking it runs down_img
btn = tk.Button(win,text = '点击下载', command = down_img)


# Lay the widgets out on a grid: label, entry and button share row 1,
# the text box sits in row 3 and spans ten columns.
lb.grid(row=1,column=0,padx=10,pady=20)
entry.grid(row=1,column=1,pady=20)
btn.grid(row=1,column=2,padx=10,pady=20)
t.grid(row=3,column=0,padx=20,columnspan=10)

# Enter the Tk event loop
win.mainloop()


免费评分

参与人数 27吾爱币 +24 热心值 +23 收起 理由
playbbbb1111 + 1 + 1 热心回复!
why3316 + 1 + 1 谢谢@Thanks!
Sonroi00 + 1 我很赞同!
feng61328 + 1 + 1 用心讨论,共获提升!
Zzq9708 + 1 用心讨论,共获提升!
karveos + 1 用心讨论,共获提升!
lyslxx + 1 + 1 我很赞同!
fmqq1994 + 1 + 1 谢谢@Thanks!
诸葛亮吃凉皮儿 + 1 + 1 热心回复!
war54288 + 1 + 1 谢谢@Thanks!
火令2020 + 1 用心讨论,共获提升!
莫奇 + 1 + 1 谢谢 @Thanks!
腿毛利小五郎 + 1 + 1 感谢发布原创作品,吾爱破解论坛因你更精彩!
qwe134133987 + 1 + 1 热心回复!
hetingting + 1 我很赞同!
诗和远方代言人 + 1 + 1 用心讨论,共获提升!
alam-132 + 1 + 1 我很赞同!
kaixianxian + 1 + 1 谢谢@Thanks!
kk1212 + 1 + 1 热心回复!
yyb414 + 1 + 1 热心回复!
zhouwei336 + 1 + 1 谢谢@Thanks!
1wang + 1 + 1 我很赞同!
lgc81034 + 1 谢谢@Thanks!
nekoneko2021 + 1 + 1 我很赞同!
kinalon + 1 谢谢@Thanks!
grrr_zhao + 1 + 1 谢谢@Thanks!
lp0528 + 1 + 1 谢谢@Thanks!

查看全部评分

本帖被以下淘专辑推荐:

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

wangdani 发表于 2021-12-28 21:28
一看就是大神,我一句也没看懂

免费评分

参与人数 2吾爱币 +2 热心值 +2 收起 理由
2004330 + 1 + 1 我很赞同!
grrr_zhao + 1 + 1 谢谢@Thanks!

查看全部评分

halfone 发表于 2021-12-28 22:48
本帖最后由 苏紫方璇 于 2021-12-29 11:26 编辑
sharecat2022 发表于 2021-12-28 21:27
支持一下,要是能把代码放到代码块里面就更好了

[Python] 纯文本查看 复制代码
#from tkinter import Tk
import tkinter as tk
import time,requests,re,json,os
import urllib.request
from selenium import webdriver
from PIL import Image
# One shared HTTP session, reused by every request made in this script.
session =requests.session()
# Start-up timestamp; not referenced anywhere else in the visible code.
tm = time.time()
# Create the main window
win = tk.Tk()
# Set the window title
win.title("小红书抓图     作者:小发哥  ")

# Intended window size
width = 1100
height = 800
# Query the screen resolution
screen_width = win.winfo_screenwidth()
screen_height = win.winfo_screenheight()
# Geometry string that would centre a width x height window on the screen;
# it is only used by the commented-out call below.
position = f"{width}x{height}+{(screen_width-width)/2:.0f}+{(screen_height-height)/2:.0f}"
#win.geometry(position)
# The window is actually given this fixed size.
win.geometry('785x500')
# Widgets are created from here on, before the event loop is entered

# Common page headers (nothing is defined under this heading)


# Label shown in front of the link entry
lb = tk.Label(win, text='小红书链接:')

# Single-line entry for the link
entry = tk.Entry(win,width=80)# width sets the entry width in characters (default 20)
# Multi-line text box; the download functions insert progress messages into it
t = tk.Text(win, width=105,height=30)


def _nth_match(pattern, text, what, index=0):
    """Return capture number ``index`` of ``pattern`` found in ``text``.

    Raises IndexError (the type a bare ``findall(...)[i]`` lookup raises)
    with a message that names the missing field.
    """
    found = re.findall(pattern, text)
    try:
        return found[index]
    except IndexError:
        raise IndexError(f'{what} not found in response') from None


def _safe_filename(name):
    """Remove characters that Windows does not allow in file or directory names."""
    return re.sub(r'[\\/:*?"<>|\r\n\t]', '', name)


def getid(url, cookies, save_root=r'E:\小红书'):
    """Resolve a short share link and download the note's video or images.

    The short link is requested without following redirects; the item id and
    ``appuid`` are scraped from that response and used to request the note
    page. Media URLs, the author's nickname and the description are then
    scraped from the note page with regular expressions. Progress messages
    are inserted into the module-level text widget ``t``.

    Args:
        url: Short share link (requested with ``Host: xhslink.com``).
        cookies: Value sent as the ``cookie`` header of the note-page request.
        save_root: Directory under which a ``"<nickname> <item id>"`` folder
            is created. Defaults to the path that used to be hard-coded.

    Returns:
        Tuple ``(item_id, appuid)`` scraped from the short-link response.

    Raises:
        IndexError: An expected field is missing from one of the responses.
    """
    short_headers = {
        'Host': 'xhslink.com',
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) '
                      'Mobile/15E148 MicroMessenger/8.0.2(0x18000234) NetType/WIFI Language/zh_CN',
        'Accept-Language': 'zh-cn',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }
    # Redirects are not followed: the body of the redirect response carries
    # the full link that the id and appuid are scraped from.
    # The timeout keeps a stalled connection from blocking the GUI forever.
    resp = session.get(url, headers=short_headers, allow_redirects=False, timeout=15)
    text = resp.content.decode('utf-8')
    note_id = _nth_match(
        '<a href="https://www.xiaohongshu.com/discovery/item/(.*?)share',
        text, 'item id').replace('?', '')
    print(note_id)
    appuid = _nth_match("appuid=(.*?)&", text, 'appuid')  # needed by the next GET
    print(appuid)
    url1 = 'https://www.xiaohongshu.com/discovery/item/' + note_id
    print(url1)
    headers1 = {
        'cookie': cookies,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) '
                      'Mobile/15E148 MicroMessenger/8.0.2(0x18000236) NetType/WIFI Language/zh_CN',
        'accept-language': 'zh-cn',
        'accept-encoding': 'gzip, deflate'
    }
    data1 = {
        'share_from_user_hidden': 'true',
        'xhsshare': 'CopyLink',
        'appuid': appuid,
        'apptime': '1640424647'
    }
    resp1 = session.get(url1, headers=headers1, params=data1, timeout=15)
    text1 = resp1.content.decode('utf-8')
    print(text1)

    # The last "url" field of the page is treated as the video candidate.
    videourl = _nth_match('"url":"(.*?)"', text1, 'media url', index=-1) \
        .encode('utf8').decode('unicode_escape')
    # The second "nickname" field is used as the author name.
    name = _safe_filename(_nth_match('"nickname":"(.*?)"', text1, 'nickname', index=1))
    traceId = re.findall('traceId":"(.*?)"}', text1)
    txt = _nth_match('description":"(.*?)",', text1, 'description')

    target_dir = os.path.join(save_root, name + ' ' + note_id)
    os.makedirs(target_dir, exist_ok=True)

    if 'v.xiaohongshu' in videourl:
        # Announce the download before it starts, not after it has finished.
        t.insert("insert", "开始下载视频...\n")
        urllib.request.urlretrieve(videourl, filename=os.path.join(target_dir, note_id + ".mp4"))
        kind = '视频'
    else:
        t.insert("insert", "开始下载图片...\n")
        # The page-wide marker decides the extension for every image, so it is
        # evaluated once instead of once per image.
        is_webp = text1.find('WebPage') > 0
        extension = ".webp" if is_webp else ".jpg"
        for trace in traceId:
            pic = 'http://sns-img-hw.xhscdn.com/' + trace
            urllib.request.urlretrieve(pic, filename=os.path.join(target_dir, str(trace) + extension))
            t.insert("insert", f"{pic}\n")
            if is_webp:
                time.sleep(2)
            else:
                print(pic)
        kind = '图片'

    # The description is appended to "<nickname>.txt" inside the note folder.
    with open(os.path.join(target_dir, name + '.txt'), 'a', encoding='utf-8') as f:
        f.write(txt)
    done = f'小主,{kind}已下载!在这个目录下:{target_dir}\n'
    print(done)
    t.insert("insert", done)

    return note_id, appuid

def getcook():
    """Collect fresh cookies with headless Chrome and cache them in cookies.txt.

    A fixed note page is opened in headless Chrome, the browser's cookies are
    read after a five-second wait, and selected values are assembled into a
    ``cookie`` header string. The string is written to ``cookies.txt`` in the
    current working directory and returned.

    Returns:
        The assembled cookie header string.

    Raises:
        IndexError: The browser returned fewer than five cookies.
    """
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    # ``options=`` is the supported keyword; ``chrome_options=`` is deprecated
    # and no longer accepted by newer Selenium releases.
    driver = webdriver.Chrome(options=option)
    try:
        driver.get(
            "https://www.xiaohongshu.com/discovery/item/60688b91000000002103cabc?xhsshare=CopyLink&appuid=5d655b6e000000000100656a&apptime=1619168976")
        time.sleep(5)
        cookies = driver.get_cookies()
    finally:
        # Always shut the browser down so no headless Chrome process is left behind.
        driver.quit()

    if len(cookies) < 5:
        raise IndexError(f'expected at least 5 cookies, got {len(cookies)}')

    # NOTE(review): the values are picked by position, which relies on the
    # order in which get_cookies() returns them -- confirm this is stable.
    sig_value = cookies[0]['value']
    sig_name = cookies[0]['name']
    timestamp2 = cookies[1]['value']
    xhsuid = cookies[2]['value']
    tracker_id = cookies[4]['value']
    cookies_1 = (
        f'timestamp2={timestamp2};{sig_name}={sig_value};xhsuid={xhsuid};'
        'extra_exp_ids=gif_clt1,ques_clt1; xhsTracker=url=noteDetail&xhsshare=CopyLink;'
        f'xhsTrackerId={tracker_id}'
    )
    with open('cookies.txt', 'w', encoding='utf-8') as f:
        f.write(cookies_1)
    return cookies_1

    # print(text)


# 分析网页图片




# 事件函数
def down_img():
    """Button callback: download the note whose link is in the entry box.

    The cached cookie string from ``cookies.txt`` is tried first. If the file
    cannot be read or the download fails for any reason, fresh cookies are
    obtained through ``getcook()`` and the download is attempted once more;
    a failure of that second attempt propagates to the caller.
    """
    url = entry.get().strip()
    if not url:
        # Nothing to download; avoid launching the browser for an empty link.
        t.insert("insert", "请先输入小红书链接\n")
        return
    try:
        # Read the cached cookies and close the file before the download starts.
        with open('cookies.txt', 'r', encoding='utf-8') as f:
            cookies = f.readline().strip()
        getid(url, cookies)
    except Exception:
        # Deliberately broad: a missing cache file and a failed download both
        # lead to the same recovery. Unlike a bare ``except``, this lets
        # KeyboardInterrupt and SystemExit through.
        cookies = getcook()
        getid(url, cookies)

# Create the download button; clicking it runs down_img
btn = tk.Button(win,text = '点击下载', command = down_img)


# Lay the widgets out on a grid: label, entry and button share row 1,
# the text box sits in row 3 and spans ten columns.
lb.grid(row=1,column=0,padx=10,pady=20)
entry.grid(row=1,column=1,pady=20)
btn.grid(row=1,column=2,padx=10,pady=20)
t.grid(row=3,column=0,padx=20,columnspan=10)

# Enter the Tk event loop
win.mainloop()

免费评分

参与人数 1吾爱币 +1 热心值 +1 收起 理由
chinguy + 1 + 1 我很赞同!

查看全部评分

头像被屏蔽
sharecat2022 发表于 2021-12-28 21:27
RWing 发表于 2021-12-28 21:55
大神厉害呀!
13729181580 发表于 2021-12-28 21:56
大神牛逼
peanut98 发表于 2021-12-28 22:02
好东西,非常感谢楼主分享
令狐冲了个澡爽 发表于 2021-12-28 22:19
大神威武
grrr_zhao 发表于 2021-12-28 22:42
我也一句没看懂
rumushiyi 发表于 2021-12-28 22:42
大佬牛b!!!
cherrypi 发表于 2021-12-28 22:44
非常不错,学习一下。
您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则

返回列表

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - LCG - LSG ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2025-1-11 15:10

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表