爬取携程北京热评酒店排行并简单的做词云
本帖最后由 牵手丶若相惜 于 2020-1-7 14:28 编辑
仅限学习!仅限学习!!仅限学习!!!
————————————————————
爬取的地址:https://hotels.ctrip.com/top/beijing1/zuijinreping-p1
翻两页看看,发现只是 URL 最后一个数字在变;不是异步加载,数据一般都直接写在网页源代码里
找到代码里的数据 用正则匹配
匹配出来后 写入文件
可视化只做了一个柱状图 和 词云 可能没什么卵用 只是练手
柱状图 只做了前十个 因为那么多个不可能全部画出来
词云是把每个酒店的热评 分词得到的
词云图显示的很慢很慢很慢 词云的高度和宽度越大 显示的越慢 你如果都设置为100也挺快的 就是看不清楚
运行时如果像我一样出现红色输出,那不是报错,下面放图
自己看吧
import csv
import re

import jieba
import matplotlib.pyplot as plt
import pandas
import requests
from wordcloud import WordCloud
# HTTP request headers passed to requests.get() for every page fetch in main().
# NOTE(review): the cookie value looks like it was copied from a personal
# browser session -- presumably it expires and should not live in source
# control; confirm before reuse.
headers = {
"cookie": 'magicid=ODDiKU0smOx4UPgURZ1vA0U4K8vbcR/FEDtpcCCfq83BaLSQv4yIN4/TI76Mhhde; _RSG=Jh4KiKYyfoAUW2GZ3iWNE8; _RGUID=aca9f225-2c50-4ad3-8397-82c0e340281e; _RDG=28c3e947f0d89520f613eb6277faa7e639; MKT_OrderClick=ASID=4897799752CNLL1ODh6-YCFQ2ZvAod3lQA5g7868292877463892359&AID=4897&CSID=799752&OUID=tongyong19&CT=1578203389740&CURL=https%3A%2F%2Fhotels.ctrip.com%2F%3Fallianceid%3D4897%26sid%3D799752%26ouid%3Dtongyong19%26bd_vid%3D7868292877463892359%26gclid%3DCNLL1ODh6-YCFQ2ZvAod3lQA5g%26gclsrc%3Dds&VAL={"pc_vid":"1578203387303.2a17p2"}; MKT_CKID=1578203389776.qaj18.ohh1; _gcl_dc=GCL.1578203390.CNLL1ODh6-YCFQ2ZvAod3lQA5g; _ga=GA1.2.1528899649.1578203390; MKT_Pagesource=PC; __utma=1.1528899649.1578203390.1578203407.1578203407.1; __utmz=1.1578203407.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Session=smartlinkcode=U130026&smartlinklanguage=zh&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=; Union=AllianceID=4897&SID=130026&OUID=&createtime=1578204335&Expires=1578809134634; _abtest_userid=f773ddda-83a3-47f8-8805-853acbdbd2a4; hoteluuid=AS0BC9OgMT1M2Tp1; _HGUID=%01%03%01Y%06RRUMR%03UPMT%01%04SMXSYWMXR%03P%05STPRXQ%05; fcerror=856437406; _zQdjfing=3a923ad5c086275ad0186ad95fa4cc3165bb186ad94ea084275ad0; HotelDomesticVisitedHotels1=5028983=0,0,5,54,/20030g00000086cpe5207.jpg,&37053067=0,0,5,57,/200g15000000xdeyc6E9B.jpg,&345025=0,0,4.7,6077,/200k0n000000ei2w6B6A1.jpg,; _gid=GA1.2.1560812336.1578364461; MKT_CKID_LMT=1578364460851; appFloatCnt=6; _bfs=1.1; _bfa=1.1578203387303.2a17p2.1.1578364458160.1578370337244.3.37; _RF1=117.30.47.199; _jzqco=%7C%7C%7C%7C1578364462902%7C1.1534693112.1578203389772.1578364597203.1578370340341.1578364597203.1578370340341.undefined.0.0.29.29; __zpspc=9.4.1578370340.1578370340.1%232%7Cwww.baidu.com%7C%7C%7C%7C%23; _bfi=p1%3D102085995%26p2%3D102085995%26v1%3D37%26v2%3D36',
"referer": "https://hotels.ctrip.com/top/beijing1/zuijinreping",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
# URL template for the ranking pages; main() fills "{}" with the page number.
url = "https://hotels.ctrip.com/top/beijing1/zuijinreping-p{}"
def data_filtering(data):
    """Clean one page of regex matches and append them to ``北京酒店top.csv``.

    Args:
        data: Sequence of 5-tuples in the order of the capture groups of the
            pattern used in ``main``: hotel name, address, rating/review-count
            HTML fragment, featured comment, price digits.

    Each tuple becomes one CSV row with the fields name, address, ratings,
    Number_of_Commentaries, comments, price. Nothing is written (and the file
    is not created) when ``data`` is empty.
    """
    if not data:
        return

    # newline="" hands line-ending control to the csv module; "\n" matches the
    # terminator of rows appended by earlier runs.
    with open("北京酒店top.csv", "a", encoding="utf-8", newline="") as file:
        # csv.writer quotes any field containing a comma, so commas in a name,
        # address or comment can no longer shift the columns.
        writer = csv.writer(file, lineterminator="\n")
        for name, address, rating_html, comment, price in data:
            # The fragment holds the score before "</span>" and the review
            # count after it.
            parts = rating_html.split("</span>")
            ratings = parts[0].replace("</b>", "")
            # Tolerate a fragment without "</span>" instead of raising.
            number_of_commentaries = (
                parts[1].replace("来自", "") if len(parts) > 1 else ""
            )
            # Collapse the comment onto one line so a row stays one record.
            comments = (
                comment.replace("\n", "").replace("\r", "").replace("\t", "")
            )
            writer.writerow(
                [
                    name,
                    address,
                    ratings,
                    number_of_commentaries,
                    comments,
                    price + "起",
                ]
            )
def Histogram(df, top_n=10):
    """Draw price and review count of the first ``top_n`` hotels as paired bars.

    Args:
        df: DataFrame with the columns ``name``, ``price`` (text ending in
            "起") and ``Number_of_Commentaries`` (text ending in "位住客点评").
        top_n: Number of leading rows (after de-duplication) to chart.
            Defaults to 10.

    Returns:
        A de-duplicated copy of ``df``. As before, the ``price`` cells of the
        charted rows hold plain integers; all other cells are left as read.
        The frame passed in by the caller is not modified.
    """
    # Use the SimHei font so the Chinese names and legend can be rendered.
    plt.rcParams["font.sans-serif"] = ["simHei"]

    # Independent copy: the write-back below must not touch the caller's frame
    # and must not rely on chained assignment (df["price"][:10] = ...), which
    # warns on older pandas and has no effect under copy-on-write.
    df = df.drop_duplicates().copy()
    head = df.head(top_n)

    # str() makes the conversion idempotent if a cell is already an int.
    prices = [int(str(p).replace("起", "")) for p in head["price"]]
    counts = [
        int(str(c).replace("位住客点评", ""))
        for c in head["Number_of_Commentaries"]
    ]

    if prices:
        # object dtype lets the column hold ints next to the untouched strings.
        df["price"] = df["price"].astype(object)
        df.iloc[: len(prices), df.columns.get_loc("price")] = prices

    # Numeric x positions keep one bar pair per row even if two hotels share
    # a name; the names are attached as tick labels afterwards.
    positions = list(range(len(prices)))
    plt.bar(positions, prices, width=0.3)
    # Shift the review-count bars right by one bar width so the pair sits
    # side by side.
    plt.bar([p + 0.3 for p in positions], counts, width=0.3)

    plt.legend(["价格", "评价人数"])
    plt.grid()
    # Tilt the hotel names (rotation=-17) so long labels do not overlap.
    plt.xticks(positions, head["name"].tolist(), rotation=-17)
    plt.show()
    return df
def word_cloud(
    df,
    font_path=r"C:\Windows\Fonts\simHei.ttf",
    max_words=2000,
    width=10000,
    height=10000,
):
    """Segment all hotel comments with jieba and show them as a word cloud.

    Args:
        df: DataFrame with a ``comments`` column.
        font_path: Font file handed to ``WordCloud``. The default is a
            Windows-only path; pass another font file on other systems.
        max_words: Upper bound on the number of words drawn.
        width: Canvas width passed to ``WordCloud``.
        height: Canvas height passed to ``WordCloud``. The defaults keep the
            original 10000x10000 canvas, which the author notes is very slow
            to display; smaller values render faster.
    """
    # Drop missing comments so NaN is not turned into the literal word "nan".
    comments = df["comments"].dropna().astype(str)
    # Join with newlines so the tail of one comment and the head of the next
    # are never segmented as a single word.
    text = "\n".join(comments)
    # WordCloud splits on whitespace, so feed it space-separated jieba tokens.
    words = " ".join(jieba.cut(text))

    wc = WordCloud(
        font_path=font_path,
        max_words=max_words,
        width=width,
        height=height,
    ).generate(words)

    # Hide both axes; only the image itself is of interest.
    plt.axis('off')
    plt.imshow(wc)
    plt.show()
def main():
    """Scrape the ranking pages into ``北京酒店top.csv`` and draw both charts.

    Pages are fetched one after another until a page yields no matches. A
    failed request is reported and ends the run without plotting, as before.
    """
    # Compiled once, outside the loop. Raw strings keep "\d" a regex escape
    # rather than an invalid string escape. Capture groups, in order: name,
    # address, rating/review-count fragment, comment, price digits.
    pattern = re.compile(
        r'target="_blank" title="(.*?)" href="/hotel.*?style="color:Black">(.*?)</a></p>'
        r'.*?target="_blank" ><span><b>(.*?)</a></p>'
        r'.*?<i class="qot_l"></i>(.*?)<i class="qot_r">'
        r'.*?<dfn>¥</dfn> (\d+)<span>',
        re.S,
    )

    # The site's first ranking page is "...-p1", so paging starts at 1.
    for page in range(1, 5000):
        try:
            # A timeout keeps a stalled connection from hanging the run.
            response = requests.get(
                url.format(page), headers=headers, timeout=10
            ).text
        except requests.RequestException as exc:
            # Only network/HTTP failures are caught; Ctrl-C and programming
            # errors now propagate instead of being silently swallowed.
            print("请求失败:", exc)
            return

        data = pattern.findall(response)
        if not data:
            # No hotel entries matched: treated as the end of the list.
            print("完成")
            break
        print("第", page, "页")
        data_filtering(data)

    names = ["name", "address", "ratings", "Number_of_Commentaries", "comments", "price"]
    # The CSV has no header row, so the column names are supplied here.
    df = pandas.read_csv("北京酒店top.csv", sep=",", names=names)
    Histogram(df)
    word_cloud(df)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
大神大神大神大神大神 学习一下哈~~~~~~~ 大神观察细微 大神,大神,我也想研究一下了~ 为啥generate不准,误差很大,frequencies还准点,为什么呢
页:
[1]