import io
import json
import time

import docx
import requests
from docx.shared import Inches
# Shared HTTP session reused for every request this script makes.
s = requests.Session()
# Page counter; getList increments it once per fetched page.
page = 1
# Request headers sent with the timeline API call. The User-Agent presents a
# mobile Safari browser; the value must be the bare UA string, without any
# quote characters of its own (they would be transmitted as part of the header).
headers = {
    "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
}
# Lookup from English three-letter weekday abbreviations to the Chinese
# weekday names used in document headings.
weekdict = dict(zip(
    ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'),
    ('星期一', '星期二', '星期三', '星期四', '星期五', '星期六', '星期日'),
))
# Lookup from English three-letter month abbreviations to zero-padded,
# two-digit month numbers ('Jan' -> '01' ... 'Dec' -> '12').
mondict = {
    abbr: '{:02d}'.format(number)
    for number, abbr in enumerate(
        ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
        start=1,
    )
}
# In-memory Word document object; posts are appended to it by writeFile and
# getList saves it to "wb.docx".
file = docx.Document()
def getList(since_id=None):
    """Crawl the timeline page by page and append every post to the document.

    Each iteration fetches one page of cards from the m.weibo.cn container
    API, writes every post on it into the module-level ``file`` document
    (long posts go through ``getLongText``, short ones straight to
    ``writeFile``), saves the document to ``wb.docx`` and then follows the
    ``since_id`` cursor from the response. Paging stops when a response has
    no cards or no further cursor.

    Args:
        since_id: Paging cursor taken from a previous response. ``None``
            (the default) requests the first page.
    """
    global page
    base_url = ('https://m.weibo.cn/api/container/getIndex'
                '?type=uid&value=6367430139&containerid=1076036367430139')
    while True:
        # Only append the cursor once there is one; formatting None into the
        # URL would send the literal text "since_id=None".
        if since_id is None:
            url = base_url
        else:
            url = '{}&since_id={}'.format(base_url, since_id)
        res = s.get(url, headers=headers)
        data = json.loads(res.text).get('data') or {}
        cards = data.get('cards') or []
        for item in cards:
            mblog = item.get('mblog')
            if not mblog:
                # A card without an 'mblog' payload has no post to export.
                continue
            if mblog.get('pic_num', 0) > 0:
                img_url_list = mblog.get('pics') or []
            else:
                img_url_list = []
            if mblog.get('isLongText'):
                getLongText(file, mblog['id'], mblog['created_at'], img_url_list)
            else:
                txt = mblog['text'].replace('<br />', '\n')
                writeFile(file, mblog['created_at'], txt, img_url_list)
        page = page + 1
        # Save after every page so an interrupted crawl keeps what it has.
        file.save("wb.docx")
        # Read the cursor only after the page's cards were handled, so the
        # final page (which carries no cursor) is still exported.
        since_id = (data.get('cardlistInfo') or {}).get('since_id')
        if not cards or not since_id:
            break
        time.sleep(2)
# Fetch the full text of a long post
def getLongText(file,id,date,imglist):
    """Fetch the complete text of a long post and append it to the document.

    The text is requested from the ``statuses/extend`` endpoint, its
    ``<br />`` tags are turned into newlines, and the result is handed to
    ``writeFile``. Any ``Exception`` along the way (request, JSON shape,
    document write) is reported on stdout and the post is skipped.

    Args:
        file: Document object passed through to ``writeFile``.
        id: Post identifier substituted into the request URL.
        date: Raw creation-date string of the post, passed to ``writeFile``.
        imglist: Picture entries of the post, passed to ``writeFile``.
    """
    url = 'https://m.weibo.cn/statuses/extend?id={}'.format(id)
    try:
        # Same headers as the list request, and inside the try so a network
        # failure skips this post instead of aborting the whole crawl.
        res = s.get(url, headers=headers)
        r = json.loads(res.text)
        txt = r['data']['longTextContent'].replace('<br />','\n')
        writeFile(file,date,txt,imglist)
        print(r['data']['longTextContent'])
        print('写入成功,{}'.format(url))
    except Exception as err:
        # Exception rather than a bare except: Ctrl-C and SystemExit must
        # still be able to stop the script.
        print('写入文件出错,跳过···{}'.format(url))
        print(repr(err))
    # Pause between requests.
    time.sleep(2)
def fotmatDate(s):
    """Reformat a creation-date string for use as a document heading.

    Input is expected as whitespace-separated fields
    ``<weekday> <month> <day> <time> ... <year>`` with English three-letter
    weekday and month abbreviations, e.g.
    ``'Sat Mar 14 12:00:00 +0800 2020'`` -> ``'2020-03-14 12:00:00 星期六'``.

    Args:
        s: Raw date string.

    Returns:
        ``'<year>-<MM>-<day> <time> <Chinese weekday>'`` when the input has
        the expected fields; otherwise ``s`` unchanged, so an empty or
        unrecognised date never raises.
    """
    fields = s.split()
    # The length test comes first so short inputs never reach the lookups.
    if len(fields) < 4 or fields[0] not in weekdict or fields[1] not in mondict:
        return s
    weekday, month, day, clock = fields[:4]
    year = fields[-1]
    return '{}-{}-{} {} {}'.format(year, mondict[month], day, clock, weekdict[weekday])
def writeFile(file,date,data,imglist):
    """Append one post to the document: dated heading, text, then pictures.

    Args:
        file: Document object; ``add_heading`` and ``add_paragraph`` are
            called on it.
        date: Raw creation-date string, formatted with ``fotmatDate`` for the
            level-1 heading.
        data: Post text used as the paragraph body.
        imglist: Iterable of picture entries; the image address is read from
            ``item['large']['url']`` of each entry.
    """
    file.add_heading(fotmatDate(date),level=1)
    para = file.add_paragraph(data)
    run = para.add_run('')
    run.add_break()
    for item in imglist:
        try:
            img = s.get(item['large']['url']).content
            # Insert straight from memory; no scratch img.jpg is written to
            # (and left behind in) the working directory.
            run.add_picture(io.BytesIO(img), width=Inches(2))
        except Exception as err:
            # One picture that cannot be fetched or decoded should not
            # discard the rest of the post.
            print('图片写入出错,跳过···{}'.format(repr(err)))
# Script entry point: when run directly, start crawling with the default
# since_id (None).
if __name__ == '__main__':
    getList()