爬取微博评论突破50页限制

木头MT · 发表于 2022-4-17 23:27

本帖最后由木头MT 于 2022-4-18 13:39 编辑

其实突破五十页限制的办法，就是改用移动版微博（m.weibo.cn）的评论接口。
今天看到团中央的一些事，突然想做个舆情分析，就去爬了一些内容。
看了一下，论坛内已有的爬虫好像都不能用了，于是自己改了一版，把结果存到 CSV；
后期的舆情分析可以用 @灵海之森 这位兄弟的库：
https://github.com/stay-leave/weibo-crawer
大佬勿喷
直接上代码：

[Python] 纯文本查看 复制代码

# Fetch the comments of a Weibo post and save them to a CSV file
import requests
import re
import csv
import time
import random
# Matches HTML tags (opening, closing, with attributes) so they can be
# stripped from the comment text returned by the API.
_HTML_TAG = re.compile(r'</?\w+[^>]*>', re.S)

# Stop after this many failed page requests in a row instead of retrying forever
# (e.g. when the cookie has expired and the server no longer returns JSON).
_MAX_CONSECUTIVE_FAILURES = 5

# Seconds to wait for the server before a request is treated as failed.
_REQUEST_TIMEOUT = 10


def _comment_to_fields(comment):
    """Turn one comment dict into the CSV fields that follow the row number.

    Returns ``[create_time, userid, screen_name, floor_number, text]``.
    ``create_time`` is reformatted as ``YYYY-MM-DD HH:MM:SS`` and ``text`` has
    its HTML tags removed.  Raises ``KeyError``/``ValueError`` when a required
    field is missing or the timestamp cannot be parsed.
    """
    # After dropping the '+0800' offset the value is parsed with
    # time.strptime's default format ("%a %b %d %H:%M:%S %Y").
    parsed = time.strptime(comment['created_at'].replace('+0800', ''))
    create_time = time.strftime('%Y-%m-%d %H:%M:%S', parsed)
    user = comment['user']
    text = _HTML_TAG.sub('', comment['text'])
    # floor_number is optional here so that one missing field does not
    # discard an otherwise usable comment.
    floor_number = comment.get('floor_number', '')
    return [create_time, user['id'], user['screen_name'], floor_number, text]


def bcpl(weibo_id, url, headers, number):
    """Fetch comments of one Weibo post page by page and append them to the CSV.

    Args:
        weibo_id: Id of the post, as a string; it is used for both the ``id``
            and ``mid`` query parameters.
        url: Endpoint prefix ending in ``id=``; the post id and paging
            parameters are appended to it.
        headers: HTTP headers sent with every request (User-Agent, Cookie).
        number: Target number of comments.  Pages are processed whole, so the
            final count may exceed this value.

    Each comment is written through ``csv_opreator`` as
    ``[count, create_time, userid, screen_name, floor_number, text]``.

    The loop stops when ``number`` comments were saved, when a page contains
    no comments, when the paging cursor comes back as 0, or after
    ``_MAX_CONSECUTIVE_FAILURES`` failed requests in a row.
    """
    count = 0            # comments saved so far
    max_id = 0           # paging cursor returned by the previous page
    max_id_type = 0      # cursor type returned by the previous page
    first_page = True    # the first request must not carry a max_id
    failures = 0         # consecutive failed page requests

    while count < number:
        if first_page:
            page_url = url + weibo_id + '&mid=' + weibo_id + '&max_id_type=0'
            delay = random.randint(0, 5)
        else:
            page_url = (url + weibo_id + '&mid=' + weibo_id
                        + '&max_id=' + str(max_id)
                        + '&max_id_type=' + str(max_id_type))
            delay = random.randint(0, 6)

        try:
            # Random pause between requests to avoid hitting the server too often.
            time.sleep(delay)
            response = requests.get(page_url, headers=headers,
                                    timeout=_REQUEST_TIMEOUT)
            print(response, page_url)
            page = response.json()['data']
            next_max_id = page['max_id']
            next_max_id_type = page['max_id_type']
            comments = page['data']
        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
            print("出错了", e)
            failures += 1
            if failures >= _MAX_CONSECUTIVE_FAILURES:
                print("连续失败{}次，停止爬取".format(failures))
                break
            continue

        # The page was fetched and decoded: advance the cursor.
        failures = 0
        first_page = False
        max_id, max_id_type = next_max_id, next_max_id_type

        for comment in comments:
            try:
                row = [count + 1] + _comment_to_fields(comment)
                csv_opreator(row)
            except (KeyError, TypeError, ValueError, OSError) as e:
                # Best effort: skip the broken comment, keep the rest of the page.
                print("出错了", e)
                continue
            count += 1
            print(row)
            print("第{}条数据获取成功".format(count))

        # An empty page means there is nothing left to read.  A cursor of 0
        # would make the next request start over from the beginning and
        # write duplicates.
        # NOTE(review): max_id == 0 is understood to mark the last page of
        # this endpoint - confirm against a live response.
        if not comments or str(max_id) == '0':
            break
def csv_opreator(a):
    """Append one row to ``weibocoments.csv`` in the current directory.

    Args:
        a: Sequence of field values making up the row.
    """
    # newline="" lets the csv module control line endings (otherwise rows are
    # separated by blank lines on Windows).  utf-8-sig keeps Chinese text and
    # emoji intact regardless of the system locale; the BOM is only emitted
    # when the file is empty, which lets Excel detect the encoding.
    with open("weibocoments.csv", "a", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(a)
if __name__ == "__main__":
    import os  # only needed by this script entry point

    # Must be the same path that csv_opreator appends to.
    output_path = "weibocoments.csv"
    fileHeader = ["id", "评论时间", "用户ID", "昵称", "评论楼层", "评论内容"]
    # The CSV is opened in append mode, so write the header only when the file
    # is new or empty; otherwise every re-run would insert another header row
    # in the middle of the data.
    if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
        csv_opreator(fileHeader)

    # Pool of User-Agent strings; one is picked at random for this run.
    user_agent = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
        "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
        "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
        "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
        "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
        "UCWEB7.0.2.37/28/999",
        "NOKIA5700/ UCWEB7.0.2.37/28/999",
        "Openwave/ UCWEB7.0.2.37/28/999",
        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
        # iPhone 6:
        "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
    ]
    cookies = [
        ''  # put your own Weibo cookie string(s) here
    ]
    # A random User-Agent and cookie are chosen once and reused for every request.
    # Optional extra headers that can be added if needed:
    #   'Referer': 'https://m.weibo.cn/detail/4497103885505673'
    #   'Sec-Fetch-Mode': 'navigate'
    headers = {
        'User-Agent': random.choice(user_agent),
        'Cookie': random.choice(cookies),
    }
    url = 'https://m.weibo.cn/comments/hotflow?id='
    # Id of the post to crawl: open https://m.weibo.cn, find the post and take
    # the id from its address, e.g. https://m.weibo.cn/detail/4478512314460101
    weibo_id = '4753831142888139'
    # Target number of comments.  Comments arrive in pages and the page that
    # reaches the target is saved whole, so the final count may exceed `number`.
    number = 10000
    bcpl(weibo_id, url, headers, number)

贴一下运行图
注意不要访问得太频繁，每次请求之间要睡（sleep）一会儿。

cookie 需要自己抓包后填到代码里；每次请求之间一定要 sleep 一下，间隔大概 10 秒左右。

木头MT · 发表于 2022-4-18 13:41

qq632280928 发表于 2022-4-18 09:45
试了一下，采集不全：共 1400 条评论，只能采集到 200 多条，cookie 已经更新过了。

cookie 要自己抓一下；另外这个脚本只爬主评论，没有爬子评论（楼中楼回复），所以爬到的条数达不到页面显示的总评论数。

71q3M5cT9a · 发表于 2022-4-17 23:39

条条大路通罗马

CCQc · 发表于 2022-4-18 08:17

学习思路，感谢分享

wanlinwo · 发表于 2022-4-18 08:43

提示: 作者被禁止或删除内容自动屏蔽

Asra · 发表于 2022-4-18 09:02

之前只爬过B站的，微博的没爬过，学习下

lca18214474709 · 发表于 2022-4-18 09:07

新手求教

qq632280928 · 发表于 2022-4-18 09:45

试了一下，采集不全：共 1400 条评论，只能采集到 200 多条，cookie 已经更新过了。

jinzhu160 · 发表于 2022-4-18 09:55

提示: 作者被禁止或删除内容自动屏蔽

atxz · 发表于 2022-4-18 10:22

进来学习一下，感谢大佬指点

dork · 发表于 2022-4-18 11:03

cookie还能用不？

帐号		自动登录	找回密码
密码			注册[Register]

wanlinwo wanlinwo 当前离线好友阅读权限 0 听众最后登录 1970-1-1 头像被屏蔽	4^# wanlinwo 发表于 2022-4-18 08:43 提示: 作者被禁止或删除内容自动屏蔽
	如何快速判断一个文件是否为病毒！
	回复支持举报

[Python 转载] 爬取微博评论突破50页限制

免费评分

浏览过的版块

jinzhu160 jinzhu160 当前离线好友阅读权限 0 听众最后登录 1970-1-1 头像被屏蔽	8^# jinzhu160 发表于 2022-4-18 09:55 提示: 作者被禁止或删除内容自动屏蔽

	回复支持举报