大数据分析的课程设计和大家分享一下
[Python] 纯文本查看 复制代码 [mw_shl_code=python,true]import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import MySQLdb
import pymysql
import traceback
import pandas as pd
from sqlalchemy import create_engine
import os
import numpy as np
import csv
# Scrape pages 1-5 of the Qidian monthly-ticket ranking and collect one value
# per book for each field, then assemble the result into the DataFrame `short`.
title = []
author = []
# Text of every <a> under p.author (renamed from `type`, which shadowed the
# builtin). The category column is derived from it after the loop.
author_links = []
serialize = []
update = []
last_updata_data = []
number = []
introduction = []

# Every selector below targets one <li> of the ranking list.
_BOOK_ITEM = '#rank-view-list > div > ul > li'


def _texts(dom, selector):
    """Return the text of every element in *dom* matching CSS *selector*."""
    return [node.getText() for node in dom.select(selector)]


for page in range(1, 6):
    url = f'https://www.qidian.com/rank/yuepiao?page={page}'
    # A timeout keeps a stalled connection from hanging the script forever.
    rq = requests.get(url, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page as if it
    # were a ranking page (the original only evaluated rq.status_code).
    rq.raise_for_status()
    html = rq.content.decode('utf-8')
    dom = BeautifulSoup(html, 'lxml')

    title.extend(_texts(dom, f'{_BOOK_ITEM} > div.book-mid-info > h4 > a'))
    author.extend(
        _texts(dom, f'{_BOOK_ITEM} > div.book-mid-info > p.author > a.name'))
    author_links.extend(
        _texts(dom, f'{_BOOK_ITEM} > div.book-mid-info > p.author > a'))
    serialize.extend(
        _texts(dom, f'{_BOOK_ITEM} > div.book-mid-info > p.author > span'))
    update.extend(
        _texts(dom, f'{_BOOK_ITEM} > div.book-mid-info > p.update > a'))
    last_updata_data.extend(
        _texts(dom, f'{_BOOK_ITEM} > div.book-mid-info > p.update > span'))
    # `number` and `introduction` are collected but not part of `short` below.
    number.extend(
        _texts(dom, f'{_BOOK_ITEM} > div.book-right-info > div > p'))
    introduction.extend(
        _texts(dom, f'{_BOOK_ITEM} > div.book-mid-info > p.intro'))

# Keep every second link, starting with the second one.
# NOTE(review): assumes each book has exactly two <a> under p.author (author
# name, then category) -- confirm against the live page markup.
type1 = author_links[1::2]

# pandas raises ValueError here if the columns ended up with unequal lengths.
short = pd.DataFrame({
    '书名': title,
    '作者': author,
    '类型': type1,
    '是否连载': serialize,
    '最近更新': update,
    '最后更新时间': last_updata_data,
})
# Alternative sink (disabled): write the frame to MySQL. This needs a
# connection created with sqlalchemy.create_engine, and the charset must be
# utf8 or some Latin characters cannot be stored.
# NOTE(review): do not hard-code real credentials in the URL below.
# yconnect = create_engine('mysql+mysqldb://root:<password>@localhost:3306/data?charset=utf8')
# pd.io.sql.to_sql(short, 'qid_data', yconnect, schema='data', if_exists='append')

# Store the DataFrame as CSV; index=False leaves out the row labels
# (pandas writes them by default).
short.to_csv(
    "test.csv",
    sep=',',
    index=False,
    encoding='utf_8',
)
# Dump the first column of zuozhe.csv into zuozhe.txt, one value per line,
# each followed by a comma.
# NOTE(review): this script writes test.csv but reads zuozhe.csv here, a file
# it never creates -- confirm where zuozhe.csv is supposed to come from.
data = pd.read_csv('zuozhe.csv', encoding='utf-8')
# Open with 'w' rather than 'a+': appending duplicated the whole list on every
# run, and nothing is read back through this handle.
with open('zuozhe.txt', 'w', encoding='utf-8') as f:
    # row[0] is the first CSV column of the row.
    f.writelines(str(row[0]) + ',\n' for row in data.values)
# Render a word cloud from the text in zuozhe.txt and save it as aaaa.jpg.
# WordCloud and plt are used below but are not imported anywhere in the
# visible file, so they are brought into scope next to their only use.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Read through a context manager so the file handle is closed promptly.
with open('zuozhe.txt', encoding='utf-8') as f:
    text = f.read()

wc = WordCloud(
    width=1300,
    height=1300,
    repeat=True,
    # Hard-coded Windows font path; presumably chosen so Chinese glyphs
    # render -- adjust on other platforms.
    font_path=r'C:\Windows\Fonts\simfang.ttf',
).generate(text)

plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.savefig('aaaa.jpg')
[/mw_shl_code]
可视化部分用 ECharts 实现。
|