# Weibo comment scraper. (Forum paste banner "[Python] 纯文本查看 复制代码" turned into a comment so the file parses.)
# 定义保存评论的函数
import requests
import re
import csv
import time
import random
# Seconds to wait for a response before treating the request as failed.
_REQUEST_TIMEOUT = 10
# Give up after this many failed page fetches in a row instead of retrying forever.
_MAX_CONSECUTIVE_FAILURES = 5


def _comment_row(comment, tag_pattern):
    """Flatten one comment dict from the response into its CSV fields.

    Returns ``[create_time, userid, screen_name, floor_number, text]`` where
    ``text`` has markup tags removed with ``tag_pattern``.

    Raises KeyError/TypeError/ValueError when the comment lacks an expected
    field or its timestamp cannot be parsed.
    """
    # The '+0800' offset is stripped so the remainder can be parsed with
    # time.strptime's default format.
    created = time.strptime(comment['created_at'].replace('+0800', ''))
    create_time = time.strftime('%Y-%m-%d %H:%M:%S', created)
    user = comment['user']
    text = tag_pattern.sub('', comment["text"])
    return [create_time, user['id'], user['screen_name'],
            comment['floor_number'], text]


def bcpl(weibo_id, url, headers, number):
    """Fetch comments of one Weibo post page by page and append them to the CSV.

    Args:
        weibo_id: Id of the post, as a string; it is concatenated into the URL.
        url: Endpoint prefix ending in ``id=``; the post id and paging
            parameters are appended to it.
        headers: HTTP headers passed to ``requests.get``.
        number: Target number of comments. Pages are saved whole, so the
            final count may exceed this value.

    Returns:
        The number of comments written.

    The loop stops when ``number`` is reached, when a page carries no
    comments or no cursor for a following page, or after
    ``_MAX_CONSECUTIVE_FAILURES`` failed fetches in a row.
    """
    # Compiled once instead of once per comment.
    tag_pattern = re.compile(r'</?\w+[^>]*>', re.S)
    count = 0
    failures = 0
    max_id = 0
    max_id_type = 0
    first_page = True

    while count < number:
        base = url + weibo_id + '&mid=' + weibo_id
        if first_page:
            # The first page is requested without a max_id cursor.
            page_url = base + '&max_id_type=0'
            delay_cap = 5
        else:
            page_url = (base + '&max_id=' + str(max_id)
                        + '&max_id_type=' + str(max_id_type))
            delay_cap = 6
        # Random pause between requests.
        time.sleep(random.randint(0, delay_cap))

        try:
            web_data = requests.get(page_url, headers=headers,
                                    timeout=_REQUEST_TIMEOUT)
            print(web_data, page_url)
            data = web_data.json()['data']
            next_max_id = data['max_id']
            next_max_id_type = data['max_id_type']
            comments = data['data']
        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
            failures += 1
            print("出错了", e)
            if failures >= _MAX_CONSECUTIVE_FAILURES:
                print("连续失败{}次,停止爬取".format(failures))
                break
            continue

        failures = 0
        first_page = False
        max_id, max_id_type = next_max_id, next_max_id_type

        for comment in comments:
            try:
                fields = _comment_row(comment, tag_pattern)
            except (KeyError, TypeError, ValueError) as e:
                # Skip a malformed comment rather than dropping the rest of the page.
                print("出错了", e)
                continue
            count += 1
            row = [count] + fields
            csv_opreator(row)
            print(row)
            print("第{}条数据获取成功".format(count))

        # An empty page or a falsy cursor leaves nothing to continue from;
        # requesting again would presumably re-fetch comments already saved.
        if not comments or not max_id:
            print("没有更多评论了,停止爬取")
            break

    return count
def csv_opreator(a):
    """Append one row to ``weibocoments.csv`` in the current directory.

    Args:
        a: Sequence of field values forming a single CSV row.

    The file is opened with ``newline=""`` as the csv module requires, so no
    blank lines appear between rows on Windows. The explicit ``utf-8-sig``
    encoding lets Chinese text and emoji be written regardless of the
    platform's default codec; the BOM is emitted only when the file starts
    out empty.
    """
    with open("weibocoments.csv", "a", newline="", encoding="utf-8-sig") as f:
        csv.writer(f).writerow(a)
if __name__ == "__main__":
    import os  # local import: only the script entry point needs it

    # Must stay in sync with the path csv_opreator appends to.
    output_path = "weibocoments.csv"
    fileHeader = ["id", "评论时间", "用户ID", "昵称", "评论楼层", "评论内容"]
    # The CSV is opened in append mode, so write the header only when the
    # file is new or empty; otherwise every run would add another header row
    # in the middle of the data.
    if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
        csv_opreator(fileHeader)

    # Pool of User-Agent strings; one is picked at random for this run.
    user_agent = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
        "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
        "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
        "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
        "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
        "UCWEB7.0.2.37/28/999",
        "NOKIA5700/ UCWEB7.0.2.37/28/999",
        "Openwave/ UCWEB7.0.2.37/28/999",
        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
        # iPhone 6:
        "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
    ]
    cookies = [
        ''  # your Weibo cookie string goes here
    ]
    # A randomly chosen User-Agent (and cookie) disguises the requests to get
    # past Weibo's anti-scraping limits. Optional extra headers that may be
    # added if needed:
    #   'Referer': 'https://m.weibo.cn/detail/4497103885505673'
    #   'Sec-Fetch-Mode': 'navigate'
    headers = {
        'User-Agent': random.choice(user_agent),
        'Cookie': random.choice(cookies),
    }
    url = 'https://m.weibo.cn/comments/hotflow?id='
    # Id of the post to scrape. Open the mobile site https://m.weibo.cn and
    # take the id from the post URL, e.g. https://m.weibo.cn/detail/4478512314460101
    weibo_id = '4753831142888139'
    # Target number of comments. Comments are saved a whole page at a time,
    # so the final total may exceed this value.
    number = 10000
    bcpl(weibo_id, url, headers, number)