使用Requests爬取某租车软件应用商店上的评论，并进行情感分析，同时生成词云

Fujj 发表于 2020-6-8 08:46

本帖最后由 Fujj 于 2020-6-8 12:43 编辑

大佬勿喷！！！

主要使用的是requests，SnowNLP，WordCloud这三个模块：

requests：主要用来发送接收请求
SnowNLP：用于情感分析，也可以自己训练之后，使用自己的情感库进行分析
WordCloud：用来生成词云

首先利用fiddler抓取app评论链接的api，分析其组成参数（这里就不详细说明怎么抓包了），然后利用requests模拟请求该url，对返回的数据进行遍历，写入excel，写入的同时进行情感分析

import requests
import json
import xlwt
from snownlp import SnowNLP
from snownlp import sentiment
import time
import xlrd
import jieba
import pymysql
import matplotlib.pylab as plt
from wordcloud import WordCloud
from collections import Counter
import numpy as np
from PIL import Image

# Workbook shared by the scraper functions below: each one adds its own sheet
# to it, and the __main__ block saves it to disk once scraping is finished.
excel = xlwt.Workbook()
print("正在创建excel……")

# 拉取华为评论
def get_huawei():
    """Download all reviews from the Huawei AppGallery API into a new sheet.

    Adds a sheet named "华为应用市场" to the module-level ``excel`` workbook,
    writes a header row, and then writes one row per review: account name,
    phone model, comment text, star rating, time, and a sentiment label
    derived from the SnowNLP score (< 0.4 negative, > 0.6 positive,
    anything in between neutral).

    Raises:
        requests.RequestException: on a network failure, a timeout, or an
            HTTP error status.
        KeyError: if a response lacks the "totalPages" / "list" fields or a
            review lacks one of the expected keys.
    """
    print("准备拉取【华为应用市场】的评论")
    sheet = excel.add_sheet("华为应用市场")
    column_titles = ['用户名', '手机版本', '评论内容', '评分（星）', '评论时间', '正/负面评价']
    # Write one title per column (the original wrote the whole list into
    # every header cell, which xlwt cannot serialise).
    for col, title in enumerate(column_titles):
        sheet.write(0, col, title)

    url_template = (
        "https://wap1.hispace.hicloud.com/uowap/index"
        "?method=internal.user.commenList3&serviceType=20"
        "&reqPageNum={page}&maxResults=25&appid=C10523283"
        "&version=10.0.0&zone=&locale=zh_CN"
    )

    def fetch_page(page):
        # Fetch and decode a single page of reviews; fail loudly on HTTP errors
        # instead of trying to parse an error body as review data.
        response = requests.get(url_template.format(page=page), timeout=10)
        response.raise_for_status()
        return json.loads(response.text)

    # The first page already reports the page count, so it is reused below
    # rather than issuing a separate probe request.
    first_page = fetch_page(1)
    total_pages = first_page["totalPages"]

    next_row = 1  # row 0 holds the header
    for page in range(1, total_pages + 1):
        payload = first_page if page == 1 else fetch_page(page)
        for comment in payload["list"]:
            text = comment["commentInfo"]
            sheet.write(next_row, 0, comment["accountName"])
            sheet.write(next_row, 1, comment["phone"])
            sheet.write(next_row, 2, text)
            sheet.write(next_row, 3, comment["stars"])
            sheet.write(next_row, 4, comment["operTime"])

            # Score the comment once and map the score to a label.
            score = SnowNLP(text).sentiments
            if score < 0.4:
                label = "负面评价"
            elif score > 0.6:
                label = "正面评价"
            else:
                label = "中性评价"
            sheet.write(next_row, 5, label)

            next_row += 1
            # Short pause per review, kept from the original to pace the crawl.
            time.sleep(0.1)

# 拉取appstore评论
def get_app_store():
    """Download App Store reviews for app 1112929490 into a new sheet.

    Adds a sheet named "苹果app store" to the module-level ``excel``
    workbook, writes a header row, and then pages through the amp-api
    reviews endpoint ten reviews at a time until a response no longer
    contains a "next" key. Each review becomes one row: user name, title,
    review text, rating, date, and a sentiment label derived from the
    SnowNLP score (< 0.4 negative, > 0.6 positive, otherwise neutral).

    Raises:
        requests.RequestException: on a network failure, a timeout, or an
            HTTP error status (e.g. a rejected bearer token).
        KeyError: if a response lacks "data" or a review lacks one of the
            expected attributes.
    """
    print("准备拉取【app store】的评论")
    sheet = excel.add_sheet("苹果app store")
    column_titles = ['用户名', '标题', '评论内容', '评分（星）', '评论时间', '正/负面评价']
    # Write one title per column (the original wrote the whole list into
    # every header cell, which xlwt cannot serialise).
    for col, title in enumerate(column_titles):
        sheet.write(0, col, title)

    # Headers copied from a captured browser request.
    # NOTE(review): "authority"/"method"/"path"/"scheme" look like HTTP/2
    # pseudo-headers from the capture and are sent here as ordinary headers;
    # probably unnecessary - confirm before removing.
    # NOTE(review): the bearer token is hard-coded and its JWT payload carries
    # an "exp" claim (1606596641), so it appears to be time-limited; it should
    # be refreshed and supplied via configuration rather than source code.
    headers = {
        "authority": "amp-api.apps.apple.com",
        "method": "OPTIONS",
        "path": "/v1/catalog/cn/apps/1112929490/reviews?l=zh-Hans-CN&offset=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac",
        "scheme": "https",
        "accept": "*/*",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9",
        "authorization": "Bearer eyJhbGciOiJFUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6IldlYlBsYXlLaWQifQ.eyJpc3MiOiJBTVBXZWJQbGF5IiwiaWF0IjoxNTkxMDQ0NjQxLCJleHAiOjE2MDY1OTY2NDF9.qJ4vlF4w9iHGLoJLsT9gjY2RyEv510XpwDVonTY6GhxbsHD8__dpC9O7p8naB_bRFqqHYfBLzatMbwNc85iB5Q",
        "cache-control": "no-cache",
        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        "origin": "https://apps.apple.com",
        "pragma": "no-cache",
        "referer": "https://apps.apple.com/cn/app/%E5%8F%AE%E5%97%92%E5%87%BA%E8%A1%8C-%E5%85%A8%E5%9B%BD%E8%BD%BB%E6%9D%BE%E7%95%85%E9%AA%91/id1112929490",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    }

    # NOTE(review): paging starts at offset 10, as in the original, so the
    # reviews at offsets 0-9 are never requested - confirm this is intended.
    offset = 10
    next_row = 1  # row 0 holds the header
    while True:
        url = (
            "https://amp-api.apps.apple.com/v1/catalog/cn/apps/1112929490/reviews"
            "?l=zh-Hans-CN&offset=" + str(offset)
            + "&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac"
        )
        response = requests.get(url=url, headers=headers, timeout=10)
        # Fail loudly on HTTP errors instead of parsing an error body.
        response.raise_for_status()
        payload = json.loads(response.text)

        for review in payload["data"]:
            attributes = review["attributes"]
            text = attributes["review"]
            sheet.write(next_row, 0, attributes["userName"])
            sheet.write(next_row, 1, attributes["title"])
            sheet.write(next_row, 2, text)
            sheet.write(next_row, 3, attributes["rating"])
            sheet.write(next_row, 4, attributes["date"])

            # Score the review once and map the score to a label.
            score = SnowNLP(text).sentiments
            if score < 0.4:
                label = "负面评价"
            elif score > 0.6:
                label = "正面评价"
            else:
                label = "中性评价"
            sheet.write(next_row, 5, label)
            next_row += 1

        # The API signals further pages with a "next" key; stop when absent.
        if "next" not in payload:
            # next_row is the next free row, so the review count is one less.
            print("共拉取【app store】评论" + str(next_row - 1) + "条")
            break
        offset += 10
        time.sleep(0.1)  # brief pause between page requests

if __name__ == '__main__':
    # Scrape both stores into the shared workbook, then write it to disk once.
    get_huawei()
    get_app_store()
    excel.save('各终端应用市场评论.xls')

接下来提取excel里面的评论，生成词云

# 提取excel评论里面的词频
def getExcelData(excel, txt):
    """Build a word-frequency table from the comment column of a workbook.

    Reads the third column of the second and first sheets of the workbook,
    segments the combined text with jieba, and counts every token that is
    longer than one character.

    Args:
        excel: path of the .xls workbook to read.
        txt: text placed in front of the extracted comments before
            segmentation (the caller in this file passes '').

    Returns:
        dict mapping each counted word to its number of occurrences, ordered
        from most to least frequent.

    Raises:
        IndexError: if the workbook has fewer than two sheets.
    """
    readbook = xlrd.open_workbook(excel)

    # Collect the comment cells of both sheets (same sheet order as before).
    # NOTE(review): row 0 is read as well, so a header cell in the third
    # column is counted like any comment - confirm whether it should be skipped.
    pieces = [txt]
    for sheet_index in (1, 0):
        sheet = readbook.sheet_by_index(sheet_index)
        for row in range(sheet.nrows):
            pieces.append(str(sheet.cell(row, 2).value))  # third column

    # Join with newlines so the end of one comment and the start of the next
    # cannot be fused into a single token; the newline tokens themselves are
    # dropped by the length filter below.
    combined = "\n".join(pieces)

    word_counts = Counter(
        word
        for word in jieba.cut(combined)
        if len(word) > 1 and word != '\r\n'
    )
    # Plain dict in most-common order: the source data for the word cloud.
    return dict(word_counts.most_common())

# 根据词频生成词云
def makeWordCloud(txt):
    """Render a word cloud from word frequencies and save it as a PNG.

    Args:
        txt: mapping of word -> frequency (despite the name, not raw text);
            it is passed straight to ``WordCloud.generate_from_frequencies``.

    Side effects:
        Reads 'dingda.jpg' from the working directory as the shape mask and
        writes the rendered image to '评论词频提取.png'.
    """
    # Load the mask image and release the file handle as soon as the pixel
    # data has been copied into an array.
    with Image.open('dingda.jpg') as image:
        graph = np.array(image)

    wc = WordCloud(background_color="white",
                   max_words=500,
                   mask=graph,
                   repeat=True,
                   # NOTE(review): per the wordcloud documentation, width and
                   # height are ignored when a mask is given; kept unchanged.
                   width=1000,
                   height=1000,
                   scale=10,  # larger values give a higher-resolution image
                   # Raw string: the backslashes are path separators, not
                   # escape sequences. The font file is Windows-specific.
                   font_path=r"C:\Windows\Fonts\STXINGKA.TTF")
    wc.generate_from_frequencies(txt)
    wc.to_file('评论词频提取.png')

if __name__ == '__main__':
    # Count the words in the saved workbook and render them as a word cloud.
    makeWordCloud(getExcelData("各终端应用市场评论.xls", ''))
    print("评论词频提取.png 已经保存在文件夹")

a186che 发表于 2020-8-18 15:58

https://wap1.hispace.hicloud.com/uowap/index?method=internal.user.commenList3这个是在华为手机市场里抓包到的吗？我怎么没看到

FANT456 发表于 2020-6-8 08:55

帮顶，前两天想用matlab生成词云，结果失败了{:301_972:}

凌晨四点半 发表于 2020-6-8 08:57

可以，很有用

ysdy 发表于 2020-6-8 09:12

不明觉厉{:1_907:}{:1_907:}

云岛鹤川 发表于 2020-6-8 09:14

情感分析...是分析自己喜欢什么样的司机吗

wangqing1116 发表于 2020-6-8 09:20

不错，感谢分享

MinxArrix 发表于 2020-6-8 10:22

这些导入看的头大 {:1_909:}

zucker 发表于 2020-6-8 10:26

发一张词云图看看效果吧

处女-大龙猫 发表于 2020-6-8 10:50

词云效果图..........

王星星 发表于 2020-6-8 10:57

页: [1] 2

吾爱破解 - 52pojie.cn's Archiver

使用Requests爬取某租车软件应用商店上的评论，并进行情感分析，同时生成词云