Python初学者,大佬请轻喷
学校排名
分布地区
数据爬取代码
[Python] 纯文本查看 复制代码
from lxml import etree
import requests
import pandas as pd
import time
# Scrape the QS undergraduate ranking page for every year from 2015 to 2021
# and write one CSV file per year into the current working directory.
URL_TEMPLATE = "https://ranking.promisingedu.com/%d-qs-all-undergraduate"
HEADERS = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4039.400'
}
# Keep at most this many text nodes per column.
MAX_ROWS = 200
# Output column name -> 1-based <td> position inside the table with id "rk".
# The insertion order here is the column order of the resulting CSV.
COLUMN_INDEX = {"Rank": 1, "University": 2, "Country": 3, "Overall_Score": 10}

# One session for all years so the HTTP connection can be reused; the
# context manager closes it when the loop is done.
with requests.Session() as session:
    for i in range(2015, 2022):
        # A timeout keeps a stalled server from hanging the script forever.
        response = session.get(URL_TEMPLATE % i, headers=HEADERS, timeout=30)
        # Fail with a clear HTTP error instead of parsing an error page.
        response.raise_for_status()
        html = etree.HTML(response.text)

        Content = {
            name: html.xpath('//table[@id="rk"]//td[%d]//text()' % idx)[:MAX_ROWS]
            for name, idx in COLUMN_INDEX.items()
        }
        # Size the Year column from the scraped data rather than assuming
        # exactly 200 rows; a shorter table would otherwise make
        # pd.DataFrame raise on mismatched column lengths.
        Content["Year"] = [str(i)] * len(Content["Rank"])

        New = pd.DataFrame(Content)
        New.to_csv("世界大学排名_%d.csv" % i)
        print(New)

        # Pause between requests to avoid hammering the server.
        time.sleep(1)
数据展示代码
[Python] 纯文本查看 复制代码
import pandas as pd
from pyecharts.charts import Funnel,Pie,Timeline
from pyecharts import options as opts
import os
# Build two pyecharts timelines from the per-year CSV files in ./data/:
#   * university.html - funnel chart of the ten best-ranked universities
#   * region.html     - rose pie chart of the number of universities per country
# and merge all yearly tables into Gather.csv.
DATA_DIR = r"./data/"

# os.listdir returns names in arbitrary order; sort them so the timeline
# frames follow the filename (and therefore year) order, and ignore anything
# in the directory that is not a CSV file.
listfile = sorted(
    name for name in os.listdir(DATA_DIR) if name.lower().endswith(".csv")
)

# Each file is read once and appended once, so Gather.csv holds every row a
# single time. (Appending in two separate loops would duplicate every row.)
frames = []
t1 = Timeline()
t2 = Timeline()
for i in listfile:
    df = pd.read_csv(DATA_DIR + i, encoding='utf-8', index_col=0)
    frames.append(df)
    # The four characters before the ".csv" extension are used as the label;
    # this assumes file names end in "<YYYY>.csv".
    year = i[-8:-4]

    # Funnel: the ten rows with the smallest Rank value.
    Rank = df[["University", "Rank"]].sort_values("Rank", ascending=True).head(10)
    # NOTE(review): pos_bottom=True is kept from the original; pyecharts
    # documents pos_bottom as a position string/number - confirm the intent.
    funnel = (
        Funnel()
        .add(
            "",
            list(zip(Rank["University"].tolist(), Rank["Rank"].tolist())),
            sort_='descending',
            label_opts=opts.LabelOpts(position="inside"),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="大学排名(日期:{})".format(year), pos_bottom=True
            )
        )
    )
    t1.add(funnel, "{}".format(year))

    # Pie: number of rows per country, largest group first.
    group_Country = df.groupby("Country").size().sort_values(ascending=False)
    pie = (
        Pie()
        .add(
            '',
            list(zip(group_Country.index.tolist(), group_Country.tolist())),
            radius=["30%", "75%"],
            rosetype="radius",
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="地区分布(日期:{})".format(year), pos_bottom=True
            )
        )
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))
    )
    t2.add(pie, "{}".format(year))

t1.render("university.html")
t2.render("region.html")

# Concatenate all yearly tables into one file with a fresh 0..n-1 index.
Gather = pd.concat(frames, ignore_index=True)
Gather.to_csv("Gather.csv")
|