使用Requests爬取某租车软件应用商店上的评论,并进行情感分析,同时生成词云
本帖最后由 Fujj 于 2020-6-8 12:43 编辑。大佬勿喷!!!
主要使用的是requests,SnowNLP,WordCloud这三个模块:
requests:主要用来发送接收请求
SnowNLP:用于情感分析,也可以自己训练之后,使用自己的情感库进行分析
WordCloud:用来生成词云
首先利用fiddler抓取app评论链接api,分析组成参数(这里就不详细说明怎么抓包了),然后利用requests模拟请求该url,对返回的数据进行遍历,写入excel,写入的同时进行情感分析
import requests
import json
import xlwt
from snownlp import SnowNLP
from snownlp import sentiment
import time
import xlrd
import jieba
import pymysql
import matplotlib.pylab as plt
from wordcloud import WordCloud
from collections import Counter
import numpy as np
from PIL import Image
# Module-level workbook shared by get_huawei() and get_app_store(); each adds
# its own sheet, and the script entry point saves it to disk afterwards.
excel = xlwt.Workbook()
print("正在创建excel……")
# Fetch reviews from the Huawei app market
def get_huawei():
    """Download all review pages from the Huawei store API into a new sheet.

    Adds a sheet named "华为应用市场" to the module-level ``excel`` workbook,
    writes one header row, then one row per review containing the
    ``accountName``, ``phone``, ``commentInfo``, ``stars`` and ``operTime``
    fields of the response plus a sentiment label derived from SnowNLP.

    Raises:
        KeyError: if a response lacks the ``totalPages`` / ``list`` keys or a
            review lacks one of the fields listed above.
    """
    print("准备拉取【华为应用市场】的评论")
    sheet = excel.add_sheet("华为应用市场")
    rowName = ['用户名', '手机版本', '评论内容', '评分(星)', '评论时间','正/负面评价']
    # One title per column. The previous code passed the whole list to every
    # header cell, which is not a scalar value a cell can hold.
    for col, title in enumerate(rowName):
        sheet.write(0, col, title)

    url_template = (
        "https://wap1.hispace.hicloud.com/uowap/index"
        "?method=internal.user.commenList3&serviceType=20&reqPageNum={page}"
        "&maxResults=25&appid=C10523283&version=10.0.0&zone=&locale=zh_CN"
    )

    # Probe request only used to read the page count (page 15, as before).
    probe = requests.get(url=url_template.format(page=15))
    total_pages = json.loads(probe.text)["totalPages"]

    n = 1  # next sheet row to write; row 0 holds the header
    for page in range(1, total_pages + 1):
        response = requests.get(url_template.format(page=page))
        reviews = json.loads(response.text)["list"]
        for review in reviews:
            comment = review["commentInfo"]
            sheet.write(n, 0, review["accountName"])
            sheet.write(n, 1, review["phone"])
            sheet.write(n, 2, comment)
            sheet.write(n, 3, review["stars"])
            sheet.write(n, 4, review["operTime"])

            # Score the comment once and reuse it for both thresholds.
            score = SnowNLP(comment).sentiments
            if score < 0.4:
                label = "负面评价"
            elif score > 0.6:
                label = "正面评价"
            else:
                label = "中性评价"
            sheet.write(n, 5, label)
            n += 1
        # Short pause between pages to avoid hammering the endpoint.
        time.sleep(0.1)
# Fetch reviews from the Apple App Store
def get_app_store():
    """Download App Store reviews page by page into a new sheet.

    Adds a sheet named "苹果app store" to the module-level ``excel`` workbook,
    writes one header row, then one row per review containing the
    ``userName``, ``title``, ``review``, ``rating`` and ``date`` attributes
    plus a sentiment label derived from SnowNLP. Paging continues, ten
    offsets at a time, until a response has no ``next`` key.

    Raises:
        KeyError: if a response lacks ``data`` or a review lacks one of the
            attributes listed above.
    """
    print("准备拉取【app store】的评论")
    sheet = excel.add_sheet("苹果app store")
    rowName = ['用户名', '标题', '评论内容', '评分(星)', '评论时间','正/负面评价']
    # One title per column. The previous code passed the whole list to every
    # header cell, which is not a scalar value a cell can hold.
    for col, title in enumerate(rowName):
        sheet.write(0, col, title)

    # NOTE(review): "authority"/"method"/"path"/"scheme" look like HTTP/2
    # pseudo-headers copied from a capture - TODO confirm they are needed.
    # NOTE(review): the bearer token is hard-coded; it presumably has to be
    # refreshed from a new capture and should not live in source control.
    header = {
        "authority": "amp-api.apps.apple.com",
        "method": "OPTIONS",
        "path": "/v1/catalog/cn/apps/1112929490/reviews?l=zh-Hans-CN&offset=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac",
        "scheme": "https",
        "accept": "*/*",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9",
        "authorization": "Bearer eyJhbGciOiJFUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6IldlYlBsYXlLaWQifQ.eyJpc3MiOiJBTVBXZWJQbGF5IiwiaWF0IjoxNTkxMDQ0NjQxLCJleHAiOjE2MDY1OTY2NDF9.qJ4vlF4w9iHGLoJLsT9gjY2RyEv510XpwDVonTY6GhxbsHD8__dpC9O7p8naB_bRFqqHYfBLzatMbwNc85iB5Q",
        # "access-control-request-headers":"authorization",
        # "access-control-request-method":"GET",
        "cache-control": "no-cache",
        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        "origin": "https://apps.apple.com",
        "pragma": "no-cache",
        "referer": "https://apps.apple.com/cn/app/%E5%8F%AE%E5%97%92%E5%87%BA%E8%A1%8C-%E5%85%A8%E5%9B%BD%E8%BD%BB%E6%9D%BE%E7%95%85%E9%AA%91/id1112929490",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }

    # NOTE(review): paging starts at offset 10, as in the original - confirm
    # whether the reviews at offset 0 should be fetched as well.
    offset = 10
    n = 1  # next sheet row to write; row 0 holds the header
    while True:
        url = (
            "https://amp-api.apps.apple.com/v1/catalog/cn/apps/1112929490/reviews?l=zh-Hans-CN&offset="
            + str(offset)
            + "&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac"
        )
        payload = json.loads(requests.get(url=url, headers=header).text)
        for item in payload["data"]:
            attributes = item["attributes"]
            review_text = attributes["review"]
            sheet.write(n, 0, attributes["userName"])
            sheet.write(n, 1, attributes["title"])
            sheet.write(n, 2, review_text)
            sheet.write(n, 3, attributes["rating"])
            sheet.write(n, 4, attributes["date"])

            # Score the review once and reuse it for both thresholds.
            score = SnowNLP(review_text).sentiments
            if score < 0.4:
                label = "负面评价"
            elif score > 0.6:
                label = "正面评价"
            else:
                label = "中性评价"
            sheet.write(n, 5, label)
            n += 1

        if "next" not in payload:
            # n is the next free row, so the number of reviews written is n - 1.
            print("共拉取【app store】评论"+str(n - 1)+"条")
            break
        offset += 10
        # Short pause between pages to avoid hammering the endpoint.
        time.sleep(0.1)
# Script entry point: pull reviews from both stores, then persist the workbook.
if __name__ == '__main__':
    for fetch_reviews in (get_huawei, get_app_store):
        fetch_reviews()
    excel.save('各终端应用市场评论.xls')
接下来提取excel里面的评论,生成词云
# Extract word frequencies from the review text stored in the Excel file
def getExcelData(excel, txt):
    """Return word frequencies for the review column of a workbook.

    The third column of the second sheet and then of the first sheet is
    appended to ``txt``; the combined text is segmented with jieba and the
    segments are counted.

    Args:
        excel: Path of the .xls workbook to read.
        txt: Text that the extracted cell values are appended to before
            segmentation (the script passes an empty string).

    Returns:
        dict mapping each segment longer than one character (and not equal
        to ``'\\r\\n'``) to its count, ordered from most to least frequent.
    """
    readbook = xlrd.open_workbook(excel)

    # Second sheet first, then the first sheet, as in the original order.
    for sheet_index in (1, 0):
        sheet = readbook.sheet_by_index(sheet_index)
        # NOTE(review): row 0 is read too; in the workbook written by this
        # script that row is the header, so its title gets counted - confirm
        # whether it should be skipped.
        txt += "".join(sheet.cell(row, 2).value for row in range(sheet.nrows))

    # Count per word. The previous `c += 1` / `result = v` lost the keys and
    # could not build a word -> frequency mapping.
    counts = Counter(
        word for word in jieba.cut(txt) if len(word) > 1 and word != '\r\n'
    )
    # Source data for the word cloud, most frequent words first.
    return dict(counts.most_common())
# Build the word cloud from word frequencies
def makeWordCloud(txt):
    """Render a word cloud from word frequencies and save it as a PNG.

    Args:
        txt: Word -> frequency mapping handed to
            ``WordCloud.generate_from_frequencies``.

    Reads 'dingda.jpg' as the mask image and writes '评论词频提取.png'; both
    paths are relative to the current working directory.
    """
    # Image whose shape is used as the mask; the file is closed as soon as
    # its pixels have been copied into the array.
    with Image.open('dingda.jpg') as image:
        graph = np.array(image)

    wc = WordCloud(background_color="white",
                   max_words=500,
                   mask=graph,
                   repeat=True,
                   width=1000,
                   height=1000,
                   scale=10,  # larger values give a higher-resolution, crisper image
                   # Raw string so the backslashes of the Windows path are
                   # not parsed as (invalid) escape sequences.
                   font_path=r"C:\Windows\Fonts\STXINGKA.TTF")
    wc.generate_from_frequencies(txt)
    wc.to_file('评论词频提取.png')
# Script entry point: turn the saved review workbook into a word-cloud image.
if __name__ == '__main__':
    txt = ''
    frequencies = getExcelData("各终端应用市场评论.xls", txt)
    makeWordCloud(frequencies)
    print("评论词频提取.png 已经保存在文件夹")
https://wap1.hispace.hicloud.com/uowap/index?method=internal.user.commenList3这个是在华为手机市场里抓包到的吗?我怎么没看到 帮顶,前两天想用matlab生成词云,结果失败了{:301_972:} 可以,很有用 不明觉厉{:1_907:}{:1_907:} 情感分析...是分析自己喜欢什么样的司机吗 不错,感谢分享 这些导入看的头大 {:1_909:} 发一张词云图看看效果吧 词云效果图..........
页:
[1]
2