起点中文网小说排行榜爬取并数据可视化
大数据分析的课程设计,和大家分享一下。
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import MySQLdb
import pymysql
import traceback
import pandas as pd
from sqlalchemy import create_engine
import os
import numpy as np
import csv
# Monthly-ticket ranking on qidian.com; the script reads pages 1 to 5.
RANK_URL_TEMPLATE = 'https://www.qidian.com/rank/yuepiao?page={page}'
RANK_PAGES = range(1, 6)

# One <li> per book inside the ranking list.
BOOK_ITEM_SELECTOR = '#rank-view-list > div > ul > li'

# CSV column name -> CSS selector, relative to a single book <li>.
# NOTE(review): the extraction expressions were lost from the posted script
# (each assignment ended in a bare "+"). These selectors are rebuilt from the
# selector notes that survived as comments, with the "li:nth-child(n)" part
# dropped so that every book on the page is matched. Confirm against the live
# page markup before relying on the output.
# The notes also listed selectors for the ticket count
# ("div.book-right-info > div > p") and the synopsis ("p.intro"); the original
# never wrote those to the CSV, so they are not scraped here.
FIELD_SELECTORS = {
    '书名': 'div.book-mid-info > h4 > a',
    '作者': 'div.book-mid-info > p.author > a.name',
    '类型': 'div.book-mid-info > p.author > a:nth-child(4)',
    '是否连载': 'div.book-mid-info > p.author > span',
    '最近更新': 'div.book-mid-info > p.update > a',
    '最后更新时间': 'div.book-mid-info > p.update > span',
}


def fetch_rank_page(page, timeout=10):
    """Download one ranking page and return its HTML as text.

    Args:
        page: 1-based page number substituted into the ranking URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(RANK_URL_TEMPLATE.format(page=page), timeout=timeout)
    # The original evaluated `rq.status_code` and discarded it; fail loudly
    # instead of parsing an error page.
    response.raise_for_status()
    return response.content.decode('utf-8')


def parse_rank_page(html):
    """Extract one record per book from the HTML of a ranking page.

    Args:
        html: Page markup as returned by ``fetch_rank_page``.

    Returns:
        A list of dicts keyed by the column names in ``FIELD_SELECTORS``.
        A field whose element is absent for a book is stored as ``''`` so
        that all columns keep the same length.
    """
    dom = BeautifulSoup(html, 'lxml')
    books = []
    for item in dom.select(BOOK_ITEM_SELECTOR):
        record = {}
        for column, selector in FIELD_SELECTORS.items():
            node = item.select_one(selector)
            record[column] = node.get_text(strip=True) if node is not None else ''
        books.append(record)
    return books


def scrape_rankings(pages=RANK_PAGES):
    """Scrape the given ranking pages into a single DataFrame.

    Args:
        pages: Iterable of page numbers to download, in order.

    Returns:
        A DataFrame with one row per book and the columns of
        ``FIELD_SELECTORS``; empty (but with those columns) when ``pages``
        is empty.
    """
    records = []
    for page in pages:
        records.extend(parse_rank_page(fetch_rank_page(page)))
    return pd.DataFrame(records, columns=list(FIELD_SELECTORS))


if __name__ == '__main__':
    short = scrape_rankings()
    # The frame can alternatively be appended to a MySQL table with
    # DataFrame.to_sql over a sqlalchemy.create_engine connection; the
    # connection URL must set charset=utf8, otherwise some characters cannot
    # be stored. That export was disabled in the original script and stays
    # disabled; keep credentials out of the source if it is re-enabled.
    # Store the DataFrame as CSV; index=False leaves out the row labels.
    short.to_csv("test.csv", index=False, sep=',', encoding='utf_8')
def csv_rows_to_text(csv_path='zuozhe.csv', txt_path='zuozhe.txt'):
    """Append every row of a CSV file to a text file, one row per line.

    The cells of a row are written as plain text joined by commas, and each
    line ends with a comma followed by a newline. The original wrote
    ``str(row)`` for a NumPy row, which put the array repr (square brackets
    and quotes) into the text file instead of the cell text.

    Args:
        csv_path: UTF-8 encoded CSV file to read (the first line is the header).
        txt_path: Text file to append to; created when missing. Running the
            function again appends the rows a second time, as the original
            'a+' mode did.
    """
    data = pd.read_csv(csv_path, encoding='utf-8')
    lines = [','.join(str(cell) for cell in row) + ',\n' for row in data.values]
    with open(txt_path, 'a+', encoding='utf-8') as f:
        f.writelines(lines)


if __name__ == '__main__':
    csv_rows_to_text()
def render_word_cloud(text_path='zuozhe.txt', image_path='aaaa.jpg',
                      font_path=r'C:\Windows\Fonts\simfang.ttf'):
    """Render a word cloud from a text file and save it as an image.

    Args:
        text_path: UTF-8 text file whose content feeds the word cloud.
        image_path: Destination of the rendered figure.
        font_path: Font file handed to WordCloud. The default is a Windows
            system font path, so pass another font on other systems.
    """
    # The script used WordCloud and plt without importing them anywhere;
    # import them here so the function is self-contained.
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    # Read through a context manager so the file handle is closed.
    with open(text_path, encoding='utf-8') as f:
        text = f.read()

    wc = WordCloud(
        width=1300,
        repeat=True,
        font_path=font_path,
        height=1300,
    ).generate(text)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.savefig(image_path)


if __name__ == '__main__':
    render_word_cloud()
可视化部分用 ECharts 实现。
一梦如虹 发表于 2020-8-1 16:47
英文跟数学实在太差了
这不是理由~
因为,我也差,虽然自知不会有什么大成就,可是写一些小工具还是可行的
所以……干就完了~ 厉害了 我觉得你们编程的好厉害 好东西~ 试用下 好用不~ 下载试试~~~~~~~~~~~~~ 好东西,感谢楼主热心分享。 路过瞅瞅,谢谢分享 有用,非常感谢分享! 我也试试 学习一下 感谢楼主