携程web首页获取景点评论练习
使用了httpx爬取、pandas储存和jieba分词的方法
import httpx
import pandas as pd
import random
import jieba
from datetime import datetime
# Path of the Excel workbook that is loaded into ``df``; fill in before running.
file_path = ''
# The worksheet named "Sheet1" is loaded once at import time and shared by the
# functions below as a module-level DataFrame.
df = pd.read_excel(file_path, sheet_name="Sheet1")
# Not referenced by any code visible in this file.
timestamp = datetime.now().timestamp()
# jieba.enable_paddle()

# Review-list endpoint. The literal was split across two physical lines in the
# source (an unterminated string); it is rejoined here without changing a
# single character of the URL.
url = (
    'https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList'
    '?_fxpcqlniredt=09031179417058680787'
    '&x-traceID=09031179417058680787-1731892943346-6898368'
)

header = {
    "cookie": "",  # put your own cookie here
}

# JSON body posted to ``url``; "pageIndex" is overwritten for each page
# requested and "poiId" selects the attraction.
data = {
    "arg": {
        "channelType": 2,
        "collapseType": 0,
        "commentTagId": 0,
        "pageIndex": 1,
        "pageSize": 10,
        "poiId": 82720,
        "sourceType": 1,
        "sortType": 1,
        "starType": 0,
    },
    "head": {
        "cid": "09031179417058680787",
        "ctok": "",
        "cver": "1.0",
        "lang": "01",
        "sid": "8888",
        "syscode": "09",
        "auth": "",
        "xsid": "",
        "extension": [],
    },
}
# poiId is the attraction code. Values listed by the original author:
#   76342     Giant Panda Base
#   82720     Sanxingdui
#   89895     Jiuzhaigou
#   82723     Jinsha
#   87301     Mount Qingcheng
#   91796     Mount Emei
#   76447     Dujiangyan
#   90391     Sichuan Museum
#   30752363  Chengdu Museum
def paqu():
    """Fetch review pages from the endpoint and append them to ``df``.

    Posts the module-level ``data`` body to ``url`` for pages 1 through 19,
    appends one row per returned review to the module-level ``df``, and then
    writes ``df`` to '携程三星堆博物馆评论.xlsx'.

    If a response does not have the expected ``result``/``items`` structure
    (or a field that is sliced or indexed is ``None``), the decoded payload is
    printed, paging stops, and the rows collected so far are still saved.

    NOTE(review): the source lost its indentation and every ``df.loc[...]``
    subscript. Only the column name '评论' is confirmed by other code in this
    file; the remaining column names below are reconstructed and must be
    checked against the header row of the workbook. Whether the non-user
    fields were originally inside the ``userInfo`` guard cannot be recovered;
    here they are written for every review.
    """
    # Worksheet column -> key inside the review's "userInfo" object.
    user_columns = {
        '用户ID': 'userId',
        '用户昵称': 'userNick',
        '会员等级': 'userMember',
    }

    with httpx.Client() as client:
        for page in range(1, 20):  # number of review pages to fetch
            print(page)
            data["arg"]["pageIndex"] = page
            response = client.post(url, headers=header, json=data)
            payload = response.json()  # decode once, reuse for diagnostics
            try:
                for item in payload["result"]["items"]:
                    # Next free row label; assumes ``df`` has the default
                    # 0..n-1 index as produced by ``read_excel``.
                    row = len(df)
                    user = item["userInfo"]
                    if user:
                        for column, key in user_columns.items():
                            df.loc[row, column] = user[key]
                    df.loc[row, '评论'] = item["content"]
                    # Only the first 10 characters of the tag are kept.
                    df.loc[row, '发布时间'] = item["publishTypeTag"][:10]
                    df.loc[row, 'IP属地'] = item["ipLocatedName"]
                    df.loc[row, '评分'] = item["score"]
                    df.loc[row, '出行类型'] = item["touristTypeDisplay"]
            except TypeError:
                # Unexpected payload shape: show it and stop paging, but keep
                # what has been collected instead of discarding it.
                print(payload)
                break

    df.to_excel('携程三星堆博物馆评论.xlsx', index=False)
def fenci():  # review word segmentation
    """Segment every review with jieba and save the workbook.

    Reads the '评论' column of the module-level ``df``, cuts each review with
    ``jieba.cut(..., cut_all=False)``, stores the tokens joined by '/' in the
    '语义分词' column, and writes ``df`` to
    '携程四川旅游评论统计分析V1.0.xlsx'.

    Missing review cells are treated as empty text so that one blank cell
    does not abort the whole run.
    """
    segmented = []
    for text in df['评论']:
        text = '' if pd.isna(text) else str(text)
        segmented.append('/'.join(jieba.cut(text, cut_all=False)))
    # Assign the whole column at once rather than cell by cell.
    df['语义分词'] = segmented
    df.to_excel('携程四川旅游评论统计分析V1.0.xlsx', index=False)
def yuqing(scores=None, spot='三星堆博物馆', score_column='评分',
           spot_column='景点', result_column='词频统计'):  # classify by score
    """Count token frequencies for the reviews of one attraction.

    Walks the '语义分词' column of the module-level ``df``, keeps the rows
    whose attraction equals ``spot`` (and, when ``scores`` is given, whose
    rating is one of ``scores``), counts the '/'-separated tokens, stores the
    counts as the string form of a dict ordered by descending count in the
    first row of ``result_column``, and writes ``df`` to
    '携程四川旅游评论统计分析V1.0.xlsx'.

    NOTE(review): the source lost the rating list after ``in``, both column
    names in the filter, and the target cell of the result. The defaults for
    ``score_column``, ``spot_column`` and ``result_column`` and the choice of
    row 0 are reconstructions - confirm them against the workbook. Because
    the original rating list is unknown, no rating filter is applied unless
    ``scores`` is passed.

    Args:
        scores: Iterable of ratings to include, or None for no rating filter.
        spot: Attraction name a row must match to be counted.
        score_column: Column holding the rating.
        spot_column: Column holding the attraction name.
        result_column: Column that receives the frequency summary.
    """
    from collections import Counter

    wanted_scores = None if scores is None else set(scores)
    counts = Counter()
    for row, tokens in df['语义分词'].items():
        if not isinstance(tokens, str):
            continue  # blank cell: nothing to count
        if wanted_scores is not None and df.loc[row, score_column] not in wanted_scores:
            continue
        if df.loc[row, spot_column] != spot:
            continue
        counts.update(tokens.split('/'))

    # most_common() orders by count, highest first; ties keep insertion order.
    df.loc[0, result_column] = str(dict(counts.most_common()))
    df.to_excel('携程四川旅游评论统计分析V1.0.xlsx', index=False)
sounds good
页:
[1]