先不说爬虫,整体 Python 代码的小错误较多。我按照你的逻辑先把程序跑通了(之前的程序跑不起来)。另外说实话,用 XPath 来筛选数据更方便一些。还有,这个代码只能爬第一页——其余数据在 payload.js 里面,下面的代码只是按照你的逻辑爬一页数据而已,楼主可以对照着看下。
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import matplotlib
allUniv=[]
# 这是给定获取url参数页面的HTML代码的通用函数
def getHTMLText(url):
    """Download *url* and return the decoded page text.

    Returns an empty string on any request failure so callers can still
    pass the result to BeautifulSoup without a TypeError (the original
    implicitly returned None on error, which crashed the caller).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        # Use the encoding sniffed from the body, not the header's guess,
        # so Chinese text decodes correctly.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:  # narrow: network/HTTP errors only
        print("产生异常")
        return ""
# Extract one row per university from the parsed page.
def fillUnivList(soup):
    """Walk every <tr> of *soup* and append each data row to allUniv.

    Each university becomes a list of stripped cell texts.  When a cell
    contains a '.name-cn' element, only that Chinese name is kept (the
    same cell also holds the English name, which would otherwise be
    concatenated into the text).
    """
    for tr in soup.find_all('tr'):
        ltd = tr.find_all('td')
        if not ltd:  # header rows use <th>, so they have no <td> cells
            continue
        singleUniv = []
        for td in ltd:
            name_cn = td.select('.name-cn')  # hoisted: select once per cell
            if name_cn:
                singleUniv.append(name_cn[0].text.strip())
            else:
                singleUniv.append(td.text.strip())
        allUniv.append(singleUniv)
# Print the ranking table.
def printUnivList(num):
    """Print the first *num* universities as an aligned table.

    chr(12288) is the full-width (CJK) space, used as the fill character
    so columns containing Chinese text line up.
    """
    fill = chr(12288)
    print("{0:10}\t{1:{5}<10}\t{2:{5}<10}\t{3:{5}<10}\t{4:{5}<10}".format(
        "排名", "学校名称", "省市", "类型", "总分", fill))
    # Slicing replaces the try/except-IndexError loop-control idiom and
    # naturally handles num > len(allUniv).
    for u in allUniv[:num]:
        if len(u) < 5:  # skip malformed rows instead of raising IndexError
            continue
        # Normalize in place, as before, so data_display sees stripped text.
        for j in range(5):
            u[j] = u[j].strip()
        print("{0:<10}\t{1:{5}<10}\t{2:{5}<10}\t{3:{5}<10}\t{4:{5}<10}".format(
            u[0], u[1], u[2], u[3], u[4], fill))
# Count Hunan/Jiangsu universities among the top rows and plot a bar chart.
def data_display(num):
    """Count universities from 湖南/江苏 within the first *num* rows,
    print the tallies, and show them as a bar chart."""
    schoolNum = {"湖南": 0, "江苏": 0}
    # Slicing replaces the bare `except: break` loop control, which could
    # also silently swallow unrelated errors.
    for u in allUniv[:num]:
        if len(u) > 2 and u[2] in schoolNum:
            schoolNum[u[2]] += 1
    print(schoolNum)
    name_list = ['湖南', '江苏']
    num_list = [schoolNum[name] for name in name_list]
    # 'SimHei' (canonical capitalization) is a font that can render the
    # Chinese tick labels; matplotlib's font matching is case-sensitive
    # on some platforms, so 'simHei' may fall back to a tofu font.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    plt.bar(range(len(num_list)), num_list,
            color=['r', 'g', 'b'], tick_label=name_list)
    plt.show()
# Program entry point.
def main(num):
    """Fetch the 2020 BCUR ranking page, parse it, then print and plot
    the first *num* universities."""
    url = "https://www.shanghairanking.cn/rankings/bcur/2020"
    html = getHTMLText(url)
    if not html:  # download failed; BeautifulSoup(None/"") would be useless
        return
    soup = BeautifulSoup(html, "html.parser")
    fillUnivList(soup)
    printUnivList(num)
    data_display(num)

if __name__ == "__main__":  # guard so importing this module has no side effects
    main(500)