携程web首页获取景点评论练习

tttomoki 发表于 2024-11-18 09:28

使用了httpx爬取、pandas储存和jieba分词的方法

import httpx
import pandas as pd
import random
import jieba
from datetime import datetime
# Path of the Excel workbook holding the existing review table; fill in before running.
file_path = ''
df = pd.read_excel(file_path, sheet_name="Sheet1")
timestamp = datetime.now().timestamp()
# jieba.enable_paddle()

# Ctrip review-list endpoint. The literal was split across two lines in the
# posted script (which is a syntax error); it is rejoined here unchanged.
url = (
    'https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList'
    '?_fxpcqlniredt=09031179417058680787'
    '&x-traceID=09031179417058680787-1731892943346-6898368'
)

header = {
    "cookie": "",  # put your own cookie here
}

# Request payload. "pageIndex" is overwritten per page by paqu(); "poiId" is
# the attraction code. Codes listed by the author:
#   76342    Giant Panda Base
#   82720    Sanxingdui
#   89895    Jiuzhaigou
#   82723    Jinsha
#   87301    Mount Qingcheng
#   91796    Mount Emei
#   76447    Dujiangyan
#   90391    Sichuan Museum
#   30752363 Chengdu Museum
data = {
    "arg": {
        "channelType": 2,
        "collapseType": 0,
        "commentTagId": 0,
        "pageIndex": 1,
        "pageSize": 10,
        "poiId": 82720,
        "sourceType": 1,
        "sortType": 1,
        "starType": 0,
    },
    "head": {
        "cid": "09031179417058680787",
        "ctok": "",
        "cver": "1.0",
        "lang": "01",
        "sid": "8888",
        "syscode": "09",
        "auth": "",
        "xsid": "",
        "extension": [],
    },
}
def paqu():
    """Fetch review pages from the Ctrip endpoint and append them to ``df``.

    Posts the module-level ``data`` payload to ``url`` for page indexes 1-19,
    adds one row per returned review to the module-level ``df`` and writes the
    frame to '携程三星堆博物馆评论.xlsx'.

    If a page raises ``TypeError`` (e.g. the payload has no iterable item
    list), the last response body is printed for inspection and paging stops.
    Rows gathered before the failure are still written out rather than
    discarded.

    NOTE(review): the ``df.loc[...]`` subscripts were stripped from the posted
    script, so the column labels below are reconstructed. Only '评论' is
    confirmed by other code in this file; verify the rest against the
    workbook's header row before running.
    """
    # API field -> assumed spreadsheet column (TODO confirm labels).
    user_columns = {
        "userId": "用户ID",
        "userNick": "用户昵称",
        "userMember": "用户等级",
    }
    review_columns = {
        "content": "评论",
        "ipLocatedName": "IP属地",
        "score": "评分",
        "touristTypeDisplay": "出游类型",
    }
    date_column = "发布时间"

    response = None
    try:
        with httpx.Client() as client:
            for page in range(1, 20):  # number of review pages to fetch
                print(page)
                data["arg"]["pageIndex"] = page
                response = client.post(url, headers=header, json=data)
                for item in response.json()["result"]["items"]:
                    # Next free row label; presumes df has a default 0..n-1 index.
                    row = df.shape[0]
                    user = item["userInfo"]
                    if user:
                        for field, column in user_columns.items():
                            df.loc[row, column] = user[field]
                    df.loc[row, review_columns["content"]] = item["content"]
                    # Only the first 10 characters of the publish tag are stored.
                    df.loc[row, date_column] = item["publishTypeTag"][:10]
                    df.loc[row, review_columns["ipLocatedName"]] = item["ipLocatedName"]
                    df.loc[row, review_columns["score"]] = item["score"]
                    df.loc[row, review_columns["touristTypeDisplay"]] = item["touristTypeDisplay"]
    except TypeError:
        # Dump the offending payload; guard against failing before any request.
        if response is not None:
            print(response.json())
    df.to_excel('携程三星堆博物馆评论.xlsx', index=False)
def fenci():
    """Segment every review in ``df['评论']`` with jieba and save the result.

    Each review is cut in jieba's accurate mode (``cut_all=False``) and its
    tokens are joined with '/'. The joined string is stored in the '语义分词'
    column and the frame is written to '携程四川旅游评论统计分析V1.0.xlsx'.

    Cells that are not strings (e.g. empty spreadsheet cells) get an empty
    segmentation instead of being passed to jieba.

    NOTE(review): the target column subscript was stripped from the posted
    script; '语义分词' is taken from yuqing(), which reads that column and
    splits it on '/'.
    """
    df['语义分词'] = [
        '/'.join(jieba.cut(text, cut_all=False)) if isinstance(text, str) else ''
        for text in df['评论']
    ]
    df.to_excel('携程四川旅游评论统计分析V1.0.xlsx', index=False)
def yuqing(*, scores=(1, 2, 3, 4, 5), score_col='评分', spot_col='景点',
           spot='三星堆博物馆', result_col='词频统计'):
    """Count token frequencies for one attraction's reviews, filtered by score.

    Walks the '/'-joined segmentations in ``df['语义分词']``, keeps rows whose
    score is in ``scores`` and whose attraction equals ``spot``, and tallies
    every token. The tally, sorted by descending count, is stored as the
    string form of a dict in the first row of ``result_col``; the frame is
    then written to '携程四川旅游评论统计分析V1.0.xlsx'.

    NOTE(review): the posted script lost every subscript in this function, so
    the score set, both column labels and the result cell are reconstructed
    guesses exposed as keyword arguments. Only the attraction name and the
    '语义分词' column come from the original text. The default ``scores``
    accepts every rating from 1 to 5; narrow it to analyse one score bucket.

    Args:
        scores: Ratings to include.
        score_col: Column holding the rating (assumed label).
        spot_col: Column holding the attraction name (assumed label).
        spot: Attraction whose reviews are counted.
        result_col: Column that receives the frequency table (assumed label).
    """
    from collections import Counter

    counts = Counter()
    rows = zip(df['语义分词'], df[score_col], df[spot_col])
    for segmented, score, place in rows:
        if score not in scores or place != spot:
            continue
        if not isinstance(segmented, str):
            continue  # missing segmentation, nothing to count
        # Skip empty tokens, which only arise from an empty segmentation.
        counts.update(token for token in segmented.split('/') if token)

    # most_common() orders by descending count and keeps first-seen order on ties.
    # Label 0 presumes a default 0..n-1 index.
    df.loc[0, result_col] = str(dict(counts.most_common()))
    df.to_excel('携程四川旅游评论统计分析V1.0.xlsx', index=False)

djseagle 发表于 2024-11-18 13:37

sounds good

页: [1]

吾爱破解 - 52pojie.cn's Archiver

携程web首页获取景点评论练习