python爬虫、多线程、pyecharts数据展示

yoin 发表于 2019-11-11 14:51

基础爬虫：爬取高校信息

#!C:\Program Files\Python36 python
# -*- coding: UTF-8 -*-
"""
@author: 东方不败
"""
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import pymysql

class College:
    """Scrape the college list page and store the rows in MySQL.

    The database connection is opened in ``__init__`` and closed by
    ``insert_college``, so one instance serves a single
    fetch -> parse -> insert run.
    """

    # Positions of the fields among one table row's <td> cells.
    # NOTE(review): the original code had lost its subscripts (it read
    # ``data.text`` on the whole cell list); 0-3 is assumed from the order
    # of the assignments - confirm against the live page.
    AREA_COL = 0
    ID_COL = 1
    NAME_COL = 2
    SITE_COL = 3

    def __init__(self):
        """Prepare request headers, the target URL and the DB connection."""
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.url = "http://xkkm.sdzk.cn/zy-manager-web/html/xx.html"
        # Keyword arguments: PyMySQL 1.0+ rejects positional connect() args.
        self.conn = pymysql.connect(host='localhost', user='root', password='')
        self.conn.select_db('college_bak')

    def get_page(self):
        """Download ``self.url`` and return it as a BeautifulSoup object.

        Returns None (after printing the error) when the request or the
        parsing fails.
        """
        try:
            response = requests.get(self.url, headers=self.headers).content
            return BeautifulSoup(response, 'html.parser')
        except Exception as Err:
            print('Error1:', str(Err))
            return None

    def get_college(self, page):
        """Extract the college rows from ``page``.

        Args:
            page: parsed page as returned by ``get_page`` (may be None).

        Returns:
            A list of ``(area, college_id, college_name, college_site)``
            tuples; empty when the page or the table is missing.
        """
        college_infos = []
        table = None
        if page is not None:
            # The data rows live in <tbody class="scrollTbody">.
            table = page.find('tbody', {'class': 'scrollTbody'})
        rows = table.find_all('tr') if table is not None else []
        for row in rows:
            cells = row.find_all('td')
            if len(cells) <= self.SITE_COL:
                # Not a full data row; nothing to extract.
                continue
            link = cells[self.SITE_COL].find('a')
            # Fall back to '' so a row without a link still fits the
            # NOT NULL college_site column.
            college_site = (link.get('href') or '') if link is not None else ''
            college_infos.append((
                cells[self.AREA_COL].text,
                cells[self.ID_COL].text,
                cells[self.NAME_COL].text,
                college_site,
            ))
        print('总计获取%s条数据' % str(len(college_infos)))
        return college_infos

    def insert_college(self, data):
        """Bulk-insert ``data`` into the ``college`` table.

        Args:
            data: iterable of 4-tuples as produced by ``get_college``.

        Returns:
            The value returned by ``executemany``, or None when the insert
            failed (the error is printed, not raised).

        The connection is closed afterwards whether or not the insert
        succeeded.
        """
        sql = "insert into college(area, college_id, college_name, college_site) values(%s, %s, %s, %s)"
        try:
            with self.conn.cursor() as cur:
                rows = cur.executemany(sql, tuple(data))
            self.conn.commit()
            return rows
        except Exception as Err:
            print('数据插入失败！！！')
            print('Error2:', str(Err))
            return None
        finally:
            # Closing an already-closed connection raises, so check first.
            if self.conn.open:
                self.conn.close()

if __name__ == "__main__":
    # Fetch the list page, parse it, store the rows and report the count.
    scraper = College()
    records = scraper.get_college(scraper.get_page())
    inserted = scraper.insert_college(records)
    print('总计插入%s条数据' % str(inserted))
    print("-" * 50)
    print("Done")

多线程：爬取各高校不同专业对于选考科目的要求

#!C:\Program Files\Python36 python
# -*- coding: UTF-8 -*-
"""
@author: 东方不败
"""
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import re
import pymysql
from threading import Thread

class Subject:
    """Fetch one college's subject requirements and store them in MySQL.

    ``param`` is the form posted to the search endpoint; ``dm`` is used as
    the college code and ``mc`` as the college name.
    """

    # Positions of the fields among one table row's <td> cells.
    # NOTE(review): the original code had lost its subscripts (it read
    # ``data.text`` on the whole cell list); 0-3 is assumed from the order
    # of the assignments - confirm against the live page.
    GRADATION_COL = 0
    CLASSIFICATION_COL = 1
    SUBJECT_COL = 2
    MAJOR_COL = 3

    def __init__(self, param):
        """Store the POST form and prepare headers and the endpoint URL."""
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.url = "http://xkkm.sdzk.cn/zy-manager-web/gxxx/searchInfor"
        self.param = param

    def get_page(self):
        """POST ``self.param`` and return the response as BeautifulSoup.

        Returns None (after printing the error) when the request or the
        parsing fails.
        """
        try:
            response = requests.post(self.url, data=self.param, headers=self.headers).content
            return BeautifulSoup(response, 'html.parser')
        except Exception as Err:
            print('Error1:', str(Err))
            return None

    def get_subject(self, page):
        """Extract the requirement rows from ``page``.

        Args:
            page: parsed page as returned by ``get_page`` (may be None).

        Returns:
            A list of ``(college_id, gradation, classification, subject,
            major)`` tuples; empty when the page or the table is missing.
        """
        college_id = self.param['dm']
        subject_infos = []
        container = None
        if page is not None:
            container = page.find('div', {'id': 'ccc'})
        rows = container.find_all('tr') if container is not None else []
        for row in rows:
            cells = row.find_all('td')
            if len(cells) <= self.MAJOR_COL:
                # Not a full data row; nothing to extract.
                continue
            gradation = cells[self.GRADATION_COL].text
            classification = cells[self.CLASSIFICATION_COL].text.strip()
            requirement = cells[self.SUBJECT_COL].text
            # Pull the "<name><br/>" entries out of the raw cell markup.
            flat = str(cells[self.MAJOR_COL]).replace("\n", "").replace("\t", "")
            names = re.findall(r"\w+<br/>", flat)
            # NOTE(review): the original removed one match before joining
            # (``item.remove(item)``, subscript lost, always ValueError);
            # dropping the first match is assumed - confirm.
            major = "/".join(names[1:]).replace("<br/>", "")
            subject_infos.append((college_id, gradation, classification, requirement, major))
        return subject_infos

    def insert_subject(self):
        """Fetch, parse and insert this college's rows into ``major``.

        Does nothing further when the page could not be fetched
        (``get_page`` has already printed the error). Database errors
        propagate to the caller after the connection has been closed.
        """
        page = self.get_page()
        if page is None:
            return
        data = self.get_subject(page)
        college_name = self.param['mc']
        sql = "insert into major(college_id, gradation, classification, subject, major) values(%s,%s,%s,%s,%s)"
        db = DataBase()
        try:
            rows = db.cur.executemany(sql, tuple(data))
            db.conn.commit()
        finally:
            db.close()
        print(str(college_name), ":", str(rows), "条数据插入完成")

class DataBase:
    """Hold one PyMySQL connection and one cursor for the configured database.

    Call ``close()`` when done, or use the instance as a context manager
    (``with DataBase() as db: ...``) so it is closed even on errors.
    """

    # Connection settings shared by every instance.
    host = "localhost"
    user = "root"
    password = ""
    database = "college_bak"

    def __init__(self):
        """Open the connection and create the cursor exposed as ``cur``."""
        self.conn = pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database)
        self.cur = self.conn.cursor()

    def __enter__(self):
        """Return the instance itself for use inside a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the handle; exceptions from the block are not suppressed."""
        self.close()
        return False

    def close(self):
        """Close the cursor, then the connection."""
        try:
            self.cur.close()
        finally:
            # Release the connection even if closing the cursor failed.
            self.conn.close()

def get_college():
    """Return every ``(college_id, college_name)`` row of the ``college`` table.

    The database handle is closed even when the query raises.
    """
    db = DataBase()
    try:
        db.cur.execute('select college_id,college_name from college')
        return db.cur.fetchall()
    finally:
        db.close()

if __name__ == "__main__":
    college_data = list(get_college())
    # For testing: only process the first five colleges.
    college_data = college_data[:5]
    counts = len(college_data)
    print('正在获取', str(counts), '所高校的专业选课要求信息')
    # Number of colleges processed concurrently.
    batch_size = 5
    for start in range(0, counts, batch_size):
        my_threads = []
        # Each row is (college_id, college_name), as selected by get_college().
        for college_id, college_name in college_data[start:start + batch_size]:
            subject = Subject({'dm': college_id, 'mc': college_name})
            # Pass the bound method itself; calling it here would run the
            # work in the main thread and hand Thread a target of None.
            my_thread = Thread(target=subject.insert_subject)
            my_threads.append(my_thread)
            my_thread.start()
        # Wait for the whole batch before starting the next one.
        for thread in my_threads:
            thread.join()

    print('+' * 50)
    print('Done')

pyecharts：展示爬取的数据（仅供参考，并未深入分析）

#!C:\Program Files\Python36 python
# -*- coding: UTF-8 -*-
"""
@author: 东方不败
"""
from pyecharts import options as opts
from pyecharts.charts import Map, Bar, Pie
import pymysql

class DataBase:
    """Hold one PyMySQL connection and one cursor for the configured database.

    Call ``close()`` when done, or use the instance as a context manager
    (``with DataBase() as db: ...``) so it is closed even on errors.
    """

    # Connection settings shared by every instance.
    host = "localhost"
    user = "root"
    password = ""
    database = "college_bak"

    def __init__(self):
        """Open the connection and create the cursor exposed as ``cur``."""
        self.conn = pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database)
        self.cur = self.conn.cursor()

    def __enter__(self):
        """Return the instance itself for use inside a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the handle; exceptions from the block are not suppressed."""
        self.close()
        return False

    def close(self):
        """Close the cursor, then the connection."""
        try:
            self.cur.close()
        finally:
            # Release the connection even if closing the cursor failed.
            self.conn.close()

def college_map():
    """Render the per-area college counts as a map chart.

    Reads ``(area, count)`` rows from the ``college`` table and writes the
    chart to ``college_map.html``. The database handle is closed even when
    the query raises.
    """
    db = DataBase()
    try:
        db.cur.execute("select area, count(id) from college group by area")
        area_counts = list(db.cur.fetchall())
    finally:
        db.close()
    # Named ``chart`` so the local does not shadow this function's own name.
    chart = Map(init_opts=opts.InitOpts(width="1400px", height="900px"))
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="全国高校分布图"),
        visualmap_opts=opts.VisualMapOpts(max_=150),
    )
    chart.add('高校数量', area_counts)
    chart.set_series_opts(
        label_opts=opts.LabelOpts(is_show=True, color="#00f", formatter="{b}:{c}"))
    chart.render('college_map.html')

def subject_map():
    """Render the per-subject counts as a bar chart in ``subject_map.html``."""
    subjects = ["物理", "化学", "生物", "政治", "历史", "地理"]
    # NOTE(review): the original ``add_yaxis("", )`` call had no data
    # argument, which raises TypeError. The figures below are the ones
    # hard-coded for the same six subjects in subject_pie() - confirm they
    # are the intended series.
    counts = [11931, 5414, 3153, 761, 1057, 1015]
    chart = Bar()
    chart.add_xaxis(subjects)
    chart.add_yaxis("", counts)
    chart.set_global_opts(title_opts=opts.TitleOpts(title="学科统计图"))
    chart.render("subject_map.html")

def subject_pie():
    """Render the subject share pie chart to ``subject_pie.html``."""
    # Hard-coded (subject, count) pairs shown as pie slices.
    shares = [
        ("物理", 11931),
        ("化学", 5414),
        ("生物", 3153),
        ("政治", 761),
        ("历史", 1057),
        ("地理", 1015),
    ]
    chart = Pie()
    chart.add("", shares)
    chart.set_global_opts(title_opts=opts.TitleOpts(title="学科比例图"))
    # Slice labels show "name: value".
    chart.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    chart.render("subject_pie.html")

if __name__ == "__main__":
    # Render the three charts in order, then report completion.
    for render_chart in (college_map, subject_map, subject_pie):
        render_chart()
    print('Done')

数据库创建

# Drop the database (statement left disabled)
-- drop database college;

# Create the database
CREATE DATABASE college_bak DEFAULT CHARACTER SET utf8;

USE college_bak;

# Create the college table
CREATE TABLE college (
    id           INT UNSIGNED AUTO_INCREMENT COMMENT '序号',
    area         VARCHAR(20)  NOT NULL DEFAULT '' COMMENT '地区',
    college_id   CHAR(5)      NOT NULL DEFAULT '00000' COMMENT '学校代码',
    college_name VARCHAR(20)  NOT NULL DEFAULT '' COMMENT '学校名称',
    college_site VARCHAR(128) NOT NULL DEFAULT '' COMMENT '学校网站',
    UNIQUE (college_id),
    PRIMARY KEY (id) # no trailing ',' after the last definition, otherwise the statement fails
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT '院校表';

# Create the major / subject-requirement table
CREATE TABLE major (
    id             INT UNSIGNED AUTO_INCREMENT COMMENT '序号',
    college_id     CHAR(5)      NOT NULL DEFAULT '00000' COMMENT '学校代码',
    gradation      VARCHAR(10)  NOT NULL DEFAULT '' COMMENT '层次',
    classification VARCHAR(128) NOT NULL DEFAULT '' COMMENT '专业名称',
    subject        VARCHAR(50)  NOT NULL DEFAULT '' COMMENT '科目要求',
    major          VARCHAR(128) NOT NULL DEFAULT '' COMMENT '所含专业',
    PRIMARY KEY (id),
    # Each row must reference an existing college by its code.
    CONSTRAINT FOREIGN KEY (college_id) REFERENCES college (college_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT '专业科目表';

相关图片


成品下载：（请修改后缀txt为zip）

PS：野生猿一枚，代码较为粗糙，大神勿喷

838384855 发表于 2019-11-11 15:07

感谢分享

王星星 发表于 2019-11-11 15:09

fjy2001 发表于 2019-11-11 15:13

谢谢分享

在线小学生 发表于 2019-11-11 15:17

厉害厉害，正准备学习这个展示这块呢。

lvbenyu 发表于 2019-11-11 15:18

能改成爬取商品名称跟价格吗？
我悬赏里有，如果悬赏不够我可以再加

wuty2019 发表于 2019-11-11 15:19

感谢分享

sharokku4869 发表于 2019-11-11 15:21

感谢大佬分享！复制保存学习一下。

香辣鸡肉面 发表于 2019-11-11 15:30

大佬啊，膜拜了

lucky2009 发表于 2019-11-11 15:44

谢谢分享！

页: [1] 2 3

吾爱破解 - 52pojie.cn's Archiver

python爬虫、多线程、pyecharts数据展示