yoin · Posted on 2019-11-11 14:51
Basic crawler: scraping college information
[Python]
#!C:\Program Files\Python36 python
# -*- coding: UTF-8 -*-
"""
@author: 东方不败
"""
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import pymysql


class College:
    # Initialization: random User-Agent, target URL and database connection
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.url = "http://xkkm.sdzk.cn/zy-manager-web/html/xx.html"
        self.conn = pymysql.connect(host='localhost', user='root', password='')
        self.conn.select_db('college_bak')

    # Fetch the page and return a BeautifulSoup object
    def get_page(self):
        try:
            response = requests.get(self.url, headers=self.headers).content
            page = BeautifulSoup(response, 'html.parser')
            return page
        except Exception as Err:
            print('Error1:', str(Err))

    # Parse the page, collect the college records into college_infos and return it
    def get_college(self, page):
        college_infos = []
        # Locate the main table first, then take its rows
        colleges = page.find('tbody', {'class': 'scrollTbody'}).find_all('tr')
        # Walk the rows and pull out each college's fields
        for college in colleges:
            data = college.find_all('td')
            area = data[1].text
            college_id = data[2].text
            college_name = data[3].text
            college_site = data[5].find('a').get('href')
            # Assemble the record and append it
            college_infos.append((area, college_id, college_name, college_site))
        print('Fetched %s records in total' % str(len(college_infos)))
        return college_infos

    # Insert the records into the database
    def insert_college(self, data):
        try:
            cur = self.conn.cursor()
            sql = "insert into college(area, college_id, college_name, college_site) values(%s, %s, %s, %s)"
            rows = cur.executemany(sql, tuple(data))
            self.conn.commit()
            self.conn.close()
            return rows
        except Exception as Err:
            print('Insert failed!!!')
            print('Error2:', str(Err))


if __name__ == "__main__":
    college = College()
    data = college.get_college(college.get_page())
    rows = college.insert_college(data)
    print('Inserted %s records in total' % str(rows))
    print("-" * 50)
    print("Done")
Multithreading: scraping each college's subject requirements for its majors
[Python]
#!C:\Program Files\Python36 python
# -*- coding: UTF-8 -*-
"""
@author: 东方不败
"""
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import re
import pymysql
from threading import Thread


class Subject:
    def __init__(self, param):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.url = "http://xkkm.sdzk.cn/zy-manager-web/gxxx/searchInfor"
        self.param = param

    def get_page(self):
        try:
            response = requests.post(self.url, data=self.param, headers=self.headers).content
            page = BeautifulSoup(response, 'html.parser')
            return page
        except Exception as Err:
            print('Error1:', str(Err))

    def get_subject(self, page):
        college_id = self.param['dm']
        subject_infos = []
        subjects = page.find('div', {'id': 'ccc'}).find_all('tr')
        for subject in subjects:
            data = subject.find_all('td')
            gradation = data[1].text
            classification = data[2].text.strip()
            subject = data[3].text
            # Extract the list of majors from the last cell
            temp = str(data[4]).replace("\n", "").replace("\t", "")
            item = re.findall(r"\w+<br/>", temp)
            item.remove(item[0])
            major = "/".join(item).replace("<br/>", "")
            # Assemble the record
            subject_infos.append((college_id, gradation, classification, subject, major))
        return subject_infos

    def insert_subject(self):
        page = self.get_page()
        data = self.get_subject(page)
        college_name = self.param['mc']
        db = DataBase()
        sql = "insert into major(college_id, gradation, classification, subject, major) values(%s,%s,%s,%s,%s)"
        rows = db.cur.executemany(sql, tuple(data))
        db.conn.commit()
        db.close()
        print(str(college_name), ":", str(rows), "records inserted")


class DataBase:
    host = "localhost"
    user = "root"
    password = ""
    database = "college_bak"

    def __init__(self):
        self.conn = pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database)
        self.cur = self.conn.cursor()

    def close(self):
        self.conn.close()


def get_college():
    db = DataBase()
    sql = 'select college_id,college_name from college'
    db.cur.execute(sql)
    result = db.cur.fetchall()
    db.close()
    return result


if __name__ == "__main__":
    college_data = list(get_college())
    # For testing, only take the first 5 colleges
    college_data = college_data[:5]
    counts = len(college_data)
    print('Fetching subject requirements for', str(counts), 'colleges')
    # Start the threads
    my_threads = []
    try:
        while True:
            # Number of threads started per batch
            for i in range(5):
                # pop() raises IndexError once the list is empty, which ends the loop via the except block
                data = college_data.pop()
                param = {'dm': data[0], 'mc': data[1]}
                subject = Subject(param)
                # Pass the bound method itself; calling it here would run the scrape in the main thread
                my_thread = Thread(target=subject.insert_subject)
                my_threads.append(my_thread)
                my_thread.start()
    except Exception as Err:
        print('Error2:', str(Err))
    finally:
        for thread in my_threads:
            thread.join()
        print('+' * 50)
        print('Done')
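A note on the threading pattern above: the while True / pop() loop ends by letting the IndexError from an empty list fall into the except block, which works but is easy to misread, and the range(5) batches start new threads without capping how many run at once overall. If you prefer, the same fan-out can be sketched with the standard-library thread pool; this is only an alternative I'm suggesting, not how the original script does it.

[Python]
# Alternative sketch using a thread pool instead of raw Thread objects (my addition)
from concurrent.futures import ThreadPoolExecutor, as_completed

def crawl_one(row):
    # row is one (college_id, college_name) record from get_college()
    param = {'dm': row[0], 'mc': row[1]}
    Subject(param).insert_subject()

college_data = list(get_college())[:5]   # same 5-college test slice as above
with ThreadPoolExecutor(max_workers=5) as pool:
    futures = [pool.submit(crawl_one, row) for row in college_data]
    for future in as_completed(futures):
        future.result()   # re-raise any worker exception so failures are visible
print('Done')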
pyecharts: visualizing the scraped data (for reference only; no in-depth analysis)
[Python]
#!C:\Program Files\Python36 python
# -*- coding: UTF-8 -*-
"""
@author: 东方不败
"""
from pyecharts import options as opts
from pyecharts.charts import Map, Bar, Pie
import pymysql


class DataBase:
    host = "localhost"
    user = "root"
    password = ""
    database = "college_bak"

    def __init__(self):
        self.conn = pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database)
        self.cur = self.conn.cursor()

    def close(self):
        self.conn.close()


def college_map():
    # Map of colleges per region, read from the college table
    db = DataBase()
    sql = "select area, count(id) from college group by area"
    db.cur.execute(sql)
    ret = db.cur.fetchall()
    db.close()
    college_map = Map(init_opts=opts.InitOpts(width="1400px", height="900px")) \
        .set_global_opts(title_opts=opts.TitleOpts(title="全国高校分布图"),
                         visualmap_opts=opts.VisualMapOpts(max_=150))
    college_map.add('高校数量', list(ret)).set_series_opts(
        label_opts=opts.LabelOpts(is_show=True, color="#00f", formatter="{b}:{c}"))
    college_map.render('college_map.html')


def subject_map():
    # Bar chart: how many majors require each subject (counts hardcoded here)
    subject_map = Bar().add_xaxis(["物理", "化学", "生物", "政治", "历史", "地理"]) \
        .add_yaxis("", [11931, 5414, 3153, 761, 1057, 1015]) \
        .set_global_opts(title_opts=opts.TitleOpts(title="学科统计图"))
    subject_map.render("subject_map.html")


def subject_pie():
    # Pie chart of the subjects' share
    subject_count = [("物理", 11931), ("化学", 5414), ("生物", 3153), ("政治", 761), ("历史", 1057), ("地理", 1015)]
    subject_pie = Pie().add("", subject_count).set_global_opts(title_opts=opts.TitleOpts(title="学科比例图")) \
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    subject_pie.render("subject_pie.html")


if __name__ == "__main__":
    college_map()
    subject_map()
    subject_pie()
    print('Done')
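One caveat on the bar and pie charts: the per-subject counts (11931 for 物理 and so on) are hardcoded. If you would rather compute them from the major table each run, a sketch along these lines should work; the LIKE match rests on my assumption that the subject column stores the required subjects as plain text, which I have not verified against the live data.

[Python]
# Sketch: derive the per-subject counts from the major table instead of hardcoding them (my addition)
subjects = ["物理", "化学", "生物", "政治", "历史", "地理"]
db = DataBase()
counts = []
for name in subjects:
    # LIKE match, because a single requirement string may list several subjects
    db.cur.execute("select count(*) from major where subject like %s", ('%' + name + '%',))
    counts.append(db.cur.fetchone()[0])
db.close()
print(dict(zip(subjects, counts)))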
Database creation
[SQL]
# Drop the database
-- drop database college;
# Create the database
create database college_bak default character set utf8;
use college_bak;
# College table
create table college(
    id int unsigned auto_increment comment 'serial number',
    area varchar(20) not null default '' comment 'region',
    college_id char(5) not null default '00000' comment 'college code',
    college_name varchar(20) not null default '' comment 'college name',
    college_site varchar(128) not null default '' comment 'college website',
    unique (college_id),
    primary key(id)  # no trailing ',' here, otherwise the statement fails
) engine=InnoDB default charset=utf8 comment 'college table';
# Major/subject table
create table major(
    id int unsigned auto_increment comment 'serial number',
    college_id char(5) not null default '00000' comment 'college code',
    gradation varchar(10) not null default '' comment 'level',
    classification varchar(128) not null default '' comment 'major name',
    subject varchar(50) not null default '' comment 'subject requirement',
    major varchar(128) not null default '' comment 'included majors',
    primary key(id),
    constraint foreign key(college_id) references college(college_id)
) engine=InnoDB default charset=utf8 comment 'major/subject table';
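If you load this schema from a file rather than pasting it into a client, keep the statement order as written: the foreign key in major references college(college_id), so the college table must exist first. The small check below confirms both tables came up as expected; it is my own addition and simply reuses the connection settings from the scripts above.

[Python]
# Sketch: verify the schema after running the SQL above (my addition, same connection settings as the scripts)
import pymysql

conn = pymysql.connect(host='localhost', user='root', password='', database='college_bak')
cur = conn.cursor()
cur.execute("show tables")
print([row[0] for row in cur.fetchall()])   # expect ['college', 'major']
cur.execute("describe major")
for row in cur.fetchall():
    print(row)   # college_id should be char(5), matching college.college_id for the foreign key
conn.close()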
Related images
College distribution map
Download the finished scripts (rename the .txt extension to .zip):
spider.txt
(8.12 KB, 40 downloads)
PS: just a self-taught code monkey, the code is fairly rough, so please go easy on me.