a5582323 发表于 2020-4-9 09:58

【原创开源】快手爬虫,根据eid批量爬取用户的所有图集和视频【已失效】

本帖最后由 a5582323 于 2020-5-14 15:26 编辑

看到之前有类似的帖子,这个人写代码很漂亮,而且发到GitHub上了,借鉴了里面一些代码
链接如下:https://www.52pojie.cn/thread-1124013-1-1.html
但是!!!但是!!!但是!!!我发这个帖子的原因如下,
1.上文的方法要登录、要cookies,麻烦
2.每个视频要重新获取无水印地址,效率低

本文采用递归查询加多线程,追求的是简单粗暴、效率
只需eid,只需eid,只需eid
更新下eid获取方法,快手APP上打开想下载的用户主页,点击右上角箭头,复制链接
链接如下:https://f.kuaishou.com/lT6Ox,电脑上打开链接,
自动跳转到新链接:https://live.kuaishou.com/profile/3xnvh7hzw7ib9ec,3xnvh7hzw7ib9ec就是eid
然后填入代码eidList里即可


2020-4-14代码已更新
用户反馈取不到数据,发现接口还是需要cookies,代码里自动获取cookies,无需用户手动

废话不多说,上代码

# -*-coding:utf-8 -*-
import requests
import time
import os
import json
import threading
import re

# Shared session cookie string ("did=<value>"), populated by getCookies()
# and attached to every feed request in getVideo().
cookies = ""

def downVideo(video, d_url, v_name):
    """Download one video to *video* unless the file already exists.

    Args:
        video:  destination file path.
        d_url:  direct (watermark-free) download URL.
        v_name: display name used in progress messages.
    """
    if not os.path.exists(video):
        try:
            r = requests.get(d_url)
            r.raise_for_status()
        except requests.RequestException as e:
            # A failed download used to raise an uncaught HTTPError (e.g.
            # a 564 from the CDN) and kill the worker thread; report and skip.
            print("    视频 " + v_name + " 下载失败 ×: " + str(e))
            return
        with open(video, "wb") as f:
            f.write(r.content)
        print("    视频 " + v_name + " 下载成功 √")

def downPic(j, pic, d_url, p_name):
    """Download one atlas image to *pic* unless the file already exists.

    Args:
        j:      zero-based index of the image within its atlas (for logging).
        pic:    destination file path.
        d_url:  direct download URL.
        p_name: display name used in progress messages.
    """
    if not os.path.exists(pic):
        try:
            r = requests.get(d_url)
            r.raise_for_status()
        except requests.RequestException as e:
            # Mirror downVideo: never let a bad URL crash the worker thread.
            print("    " + str(j + 1) + "/ 图片 " + p_name + " 下载失败 ×: " + str(e))
            return
        with open(pic, "wb") as f:
            f.write(r.content)
        print("    " + str(j + 1) + "/ 图片 " + p_name + " 下载成功 √")

def getCookies():
    """Obtain an anonymous `did` cookie and store it in the module global.

    The feed API rejects requests without a `did` cookie; fetching any
    live.kuaishou.com profile page sets one via Set-Cookie, so no login
    or manual cookie copying is needed. Stores "did=<value>" in the
    module-level `cookies` variable.
    """
    url = 'https://live.kuaishou.com/u/3xnvh7hzw7ib9ec/3xqbgg5rrpui69c'
    headers_web = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    'Connection': 'keep-alive',
    'Host': 'live.kuaishou.com',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    }
    # Don't follow the redirect: the Set-Cookie on the first response is
    # all we need.
    rs = requests.get(url=url, headers=headers_web, allow_redirects=False)
    global cookies
    # Use the public cookie-jar API instead of the private `_cookies`
    # attribute, which is an implementation detail of requests.
    cookies = 'did=' + rs.cookies.get('did', domain='.kuaishou.com')

def getVideo(data):
    """Fetch a user's feed page by page and download every video/atlas.

    Spawns one thread per download. Follows `pcursor` pagination with a
    loop (the original recursed, which could hit the recursion limit on
    long profiles). Restores the index brackets (`feeds[i]`, `imgList[j]`,
    `feeds[0]`) that the forum paste stripped — without them the list
    lookups raise TypeError.

    Args:
        data: POST payload, e.g. {"eid": eid, "count": 30, "pcursor": "0"}.
    """
    url = 'https://v.kuaishou.com/rest/kd/feed/profile'
    headers_web = {
    'accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    'Connection': 'keep-alive',
    'Content-Type': 'application/json',
    'Host': 'v.kuaishou.com',
    'Origin': 'https://v.kuaishou.com',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    }
    # Cookie is fetched automatically by getCookies(); no manual setup.
    headers_web["Cookie"] = cookies
    while True:
        rs = requests.post(url=url, headers=headers_web, json=data)
        v_json = json.loads(rs.content.decode(encoding='utf-8'))
        if str(v_json.get("result")) == "2":
            print("服务器返回操作太快,可能触发反爬机制")
            return
        feeds = v_json.get("feeds")
        if not feeds:
            # Guards the KeyError: 'feeds' users reported when the server
            # returns an error payload instead of a feed page.
            print("未取到数据,result=" + str(v_json.get("result")))
            return
        for feed in feeds:
            # Strip characters that break file names or console output.
            caption = str(feed["caption"]).replace("\n", "").replace("\u200b", "").replace("\"", "").replace("\\", "")
            f_time = time.strftime('%Y-%m-%d %H%M%S', time.localtime(feed['timestamp'] / 1000))
            name = re.sub(r'[\\/:*?"<>|\r\n]+', "", feed['userName'])
            # `out_dir` instead of `dir`, which shadows the builtin.
            out_dir = "data/" + name + "(" + feed['userEid'] + ")/"
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            if str(feed['singlePicture']) == "False":
                # NOTE(review): if mainMvUrls is a list of CDN entries, the
                # forum paste dropped an index here too and this should be
                # feed['mainMvUrls'][0]['url'] — confirm against the API.
                d_url = feed['mainMvUrls']['url']
                v_name = f_time + "_" + caption + ".mp4"
                video = out_dir + v_name
                threading.Thread(target=downVideo, args=(video, d_url, v_name)).start()
            else:
                try:
                    imgList = feed['ext_params']['atlas']['list']
                    cdn = feed['ext_params']['atlas']['cdn']
                except (KeyError, TypeError):
                    # Atlas metadata missing: fall back to the cover image.
                    # NOTE(review): coverUrls may likewise need an index
                    # (feed['coverUrls'][0]['url']) — confirm against the API.
                    imgList = [str(feed['coverUrls']['url']).replace("https://", "")]
                    cdn = ""
                for j in range(len(imgList)):
                    p_name = f_time + "_" + caption + "_" + str(j + 1) + ".jpg"
                    pic = out_dir + p_name
                    # Ask for jpg instead of webp for wider viewer support.
                    d_url = "https://" + cdn + imgList[j].replace("webp", "jpg")
                    threading.Thread(target=downPic, args=(j, pic, d_url, p_name)).start()
        pcursor = v_json["pcursor"]
        if str(pcursor) == "no_more":
            return
        # The eid is the same for every item on the page; reuse it for
        # the next page request.
        data = {"eid": feeds[0]['userEid'], "count": 30, "pcursor": pcursor}



# Script entry: create the RELATIVE download root to match the
# "data/<user>/" paths used in getVideo — the original created "/data"
# at the filesystem root, which was never used (and fails without
# admin rights on most systems).
if not os.path.exists("data"):
    os.makedirs("data")
getCookies()
# Target users: fill in the eid(s) copied from a profile share link
# (https://live.kuaishou.com/profile/<eid>).
eidList = ["3xnvh7hzw7ib9ec", "3xi4m53fqfftq94"]
for eid in eidList:
    data = {"eid": eid, "count": 30, "pcursor": "0"}
    getVideo(data)
print("收工")

tiantangyiyun 发表于 2020-4-14 12:10


大佬好像是成功了谢谢 就是提示看不懂 这应该是成功意思吧目录里面也有文件   :lol
Exception in thread Thread-29:

Traceback (most recent call last):
File "F:\Python36\lib\threading.py", line 917, in _bootstrap_inner
    self.run()
File "F:\Python36\lib\threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
File "E:/kuaishoupil/kuaishoupiliang.py", line 15, in downVideo
    r.raise_for_status()
File "F:\Python36\lib\site-packages\requests\models.py", line 941, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 564 Server Error:for url: https://txmov2.a.yximgs.com/upic/2020/01/15/17/BMjAyMDAxMTUxNzIxMThfMTM4MjA5Mjg3XzIxNzcyMTczNDEwXzFfMw==_b_Bf4251feedf58a587234df0a5846642cb.mp4

Exception in thread Thread-26:
Traceback (most recent call last):
File "F:\Python36\lib\threading.py", line 917, in _bootstrap_inner
    self.run()
File "F:\Python36\lib\threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
File "E:/kuaishoupil/kuaishoupiliang.py", line 15, in downVideo
    r.raise_for_status()
File "F:\Python36\lib\site-packages\requests\models.py", line 941, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 564 Server Error:for url: https://txmov2.a.yximgs.com/upic/2020/01/24/18/BMjAyMDAxMjQxODE3NTJfMTM4MjA5Mjg3XzIyMzQ3NDE5NjM3XzFfMw==_b_Bcaff15ab0a5489297bc59eb1bff6f8bf.mp4

Exception in thread Thread-55:
Traceback (most recent call last):
File "F:\Python36\lib\threading.py", line 917, in _bootstrap_inner
    self.run()
File "F:\Python36\lib\threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
File "E:/kuaishoupil/kuaishoupiliang.py", line 15, in downVideo
    r.raise_for_status()
File "F:\Python36\lib\site-packages\requests\models.py", line 941, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 564 Server Error:for url: https://txmov2.a.yximgs.com/upic/2019/10/03/18/BMjAxOTEwMDMxODMwNDJfMTM4MjA5Mjg3XzE4MTQ2NjMzMzMxXzFfMw==_b_B14106d176cba7c9b228a12defc49ea54.mp4

    视频 2020-01-13 210454_我的新歌《爱到冬至》1月18号就上线了,你们一定要记住歌名喔到时候去上线了,大家去帮我打榜喔爱你们#全世界最好的逗奶粉 #陪伴陈逗逗 #冬季听歌在快手#.mp4 下载成功 √
    视频 2019-12-13 181645_居然在武汉偶遇了柯子颜老师,好幸运哇,赶紧和她拍个视频,她好可爱哇#陪伴陈逗逗 #全世界最好的陈逗逗 #快手颜值大赛.mp4 下载成功 √
    视频 2020-03-06 183751_#全世界最好的陈逗逗 #陈逗逗 #往后余生全是陈逗逗 今晚9:30,邀请了一个千万主播,你们猜猜是谁,我会和他一起打PK.mp4 下载成功 √
    视频 2019-07-22 185115_#全世界最好的陈逗逗 #陪伴陈逗逗 你们有没有发现什么?.mp4 下载成功 √

HEcong 发表于 2020-4-12 20:26

Traceback (most recent call last):
File "/data/user/0/ru.iiec.pydroid3/files/accomp_files/iiec_run/iiec_run.py", line 31, in <module>
    start(fakepyfile,mainpyfile)
File "/data/user/0/ru.iiec.pydroid3/files/accomp_files/iiec_run/iiec_run.py", line 30, in start
    exec(open(mainpyfile).read(),__main__.__dict__)
File "<string>", line 85, in <module>
File "<string>", line 44, in getVideo
KeyError: 'feeds'

shysunny 发表于 2020-4-9 10:16

牛人真多啊

冰棍好烫啊 发表于 2020-4-9 10:28

枫叶荻花 发表于 2020-4-9 10:37

一会打个包试试

空白的悲伤 发表于 2020-4-9 10:37

楼主,爬取过程中出现“远程主机强迫关闭了一个现有的连接”该怎么解决?

yjn866y 发表于 2020-4-9 10:57

认真学习。。谢谢分享

a5582323 发表于 2020-4-9 10:59

空白的悲伤 发表于 2020-4-9 10:37
楼主,爬取过程中出现“远程主机强迫关闭了一个现有的连接”该怎么解决?

我下了很多也没出现这个问题,不知道是你网络问题还是触发了快手的反爬机制

空白的悲伤 发表于 2020-4-9 11:18

a5582323 发表于 2020-4-9 10:59
我下了很多也没出现这个问题,不知道是你网络问题还是触发了快手的反爬机制

楼主你那个用户的eid代码是在哪里看的?

众益科技 发表于 2020-4-9 12:24

空白的悲伤 发表于 2020-4-9 11:18
楼主你那个用户的eid代码是在哪里看的?

同问+1{:1_918:}

caleb12 发表于 2020-4-9 20:41

eid我知道 但我不会代码,没看懂操作,无从下手{:1_907:}
页: [1] 2 3
查看完整版本: 【原创开源】快手爬虫,根据eid批量爬取用户的所有图集和视频【已失效】