yoin · Posted on 2019-11-11 14:51
Basic crawler: scraping college information
[Python]
#!C:\Program Files\Python36 python
# -*- coding: UTF-8 -*-
"""
@author: 东方不败
"""
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import pymysql


class College:
    # Initialization: random User-Agent, target URL and database connection
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.url = "http://xkkm.sdzk.cn/zy-manager-web/html/xx.html"
        self.conn = pymysql.connect(host='localhost', user='root', password='')
        self.conn.select_db('college_bak')

    # Fetch the page and return a BeautifulSoup object
    def get_page(self):
        try:
            response = requests.get(self.url, headers=self.headers).content
            page = BeautifulSoup(response, 'html.parser')
            return page
        except Exception as Err:
            print('Error1:', str(Err))

    # Parse the page, collect the college records into college_infos and return it
    def get_college(self, page):
        college_infos = []
        # Locate the main table first, then take its rows
        colleges = page.find('tbody', {'class': 'scrollTbody'}).find_all('tr')
        # Walk the rows and pull out each college's fields
        for college in colleges:
            data = college.find_all('td')
            area = data[1].text
            college_id = data[2].text
            college_name = data[3].text
            college_site = data[5].find('a').get('href')
            # Assemble the record and append it
            college_infos.append((area, college_id, college_name, college_site))
        print('Fetched %s records in total' % str(len(college_infos)))
        return college_infos

    # Insert the records into the database
    def insert_college(self, data):
        try:
            cur = self.conn.cursor()
            sql = "insert into college(area, college_id, college_name, college_site) values(%s, %s, %s, %s)"
            rows = cur.executemany(sql, tuple(data))
            self.conn.commit()
            self.conn.close()
            return rows
        except Exception as Err:
            print('Insert failed!!!')
            print('Error2:', str(Err))


if __name__ == "__main__":
    college = College()
    data = college.get_college(college.get_page())
    rows = college.insert_college(data)
    print('Inserted %s records in total' % str(rows))
    print("-" * 50)
    print("Done")
Multithreading: scraping each college's subject requirements for its majors
[Python]
#!C:\Program Files\Python36 python
# -*- coding: UTF-8 -*-
"""
@author: 东方不败
"""
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import re
import pymysql
from threading import Thread


class Subject:
    def __init__(self, param):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.url = "http://xkkm.sdzk.cn/zy-manager-web/gxxx/searchInfor"
        self.param = param

    def get_page(self):
        try:
            response = requests.post(self.url, data=self.param, headers=self.headers).content
            page = BeautifulSoup(response, 'html.parser')
            return page
        except Exception as Err:
            print('Error1:', str(Err))

    def get_subject(self, page):
        college_id = self.param['dm']
        subject_infos = []
        subjects = page.find('div', {'id': 'ccc'}).find_all('tr')
        for subject in subjects:
            data = subject.find_all('td')
            gradation = data[1].text
            classification = data[2].text.strip()
            subject = data[3].text
            # Extract the list of majors from the last cell
            temp = str(data[4]).replace("\n", "").replace("\t", "")
            item = re.findall(r"\w+<br/>", temp)
            item.remove(item[0])
            major = "/".join(item).replace("<br/>", "")
            # Assemble the record
            subject_infos.append((college_id, gradation, classification, subject, major))
        return subject_infos

    def insert_subject(self):
        page = self.get_page()
        data = self.get_subject(page)
        college_name = self.param['mc']
        db = DataBase()
        sql = "insert into major(college_id, gradation, classification, subject, major) values(%s,%s,%s,%s,%s)"
        rows = db.cur.executemany(sql, tuple(data))
        db.conn.commit()
        db.close()
        print(str(college_name), ":", str(rows), "records inserted")


class DataBase:
    host = "localhost"
    user = "root"
    password = ""
    database = "college_bak"

    def __init__(self):
        self.conn = pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database)
        self.cur = self.conn.cursor()

    def close(self):
        self.conn.close()


def get_college():
    db = DataBase()
    sql = 'select college_id,college_name from college'
    db.cur.execute(sql)
    result = db.cur.fetchall()
    db.close()
    return result


if __name__ == "__main__":
    college_data = list(get_college())
    # For testing, only take the first 5 colleges
    college_data = college_data[:5]
    counts = len(college_data)
    print('Fetching subject requirements for', str(counts), 'colleges')
    # Start the threads
    my_threads = []
    try:
        while True:
            # Number of threads started per batch
            for i in range(5):
                # pop() raises IndexError once the list is empty, which ends the loop via the except block
                data = college_data.pop()
                param = {'dm': data[0], 'mc': data[1]}
                subject = Subject(param)
                # Pass the bound method itself; calling it here would run the scrape in the main thread
                my_thread = Thread(target=subject.insert_subject)
                my_threads.append(my_thread)
                my_thread.start()
    except Exception as Err:
        print('Error2:', str(Err))
    finally:
        for thread in my_threads:
            thread.join()
        print('+' * 50)
        print('Done')
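A note on the threading pattern above: the while True / pop() loop ends by letting the IndexError from an empty list fall into the except block, which works but is easy to misread, and the range(5) batches start new threads without capping how many run at once overall. If you prefer, the same fan-out can be sketched with the standard-library thread pool; this is only an alternative I'm suggesting, not how the original script does it.

[Python]
# Alternative sketch using a thread pool instead of raw Thread objects (my addition)
from concurrent.futures import ThreadPoolExecutor, as_completed

def crawl_one(row):
    # row is one (college_id, college_name) record from get_college()
    param = {'dm': row[0], 'mc': row[1]}
    Subject(param).insert_subject()

college_data = list(get_college())[:5]   # same 5-college test slice as above
with ThreadPoolExecutor(max_workers=5) as pool:
    futures = [pool.submit(crawl_one, row) for row in college_data]
    for future in as_completed(futures):
        future.result()   # re-raise any worker exception so failures are visible
print('Done')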
pyecharts: visualizing the scraped data (for reference only; no in-depth analysis)
[Python]
#!C:\Program Files\Python36 python
# -*- coding: UTF-8 -*-
"""
@author: 东方不败
"""
from pyecharts import options as opts
from pyecharts.charts import Map, Bar, Pie
import pymysql


class DataBase:
    host = "localhost"
    user = "root"
    password = ""
    database = "college_bak"

    def __init__(self):
        self.conn = pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database)
        self.cur = self.conn.cursor()

    def close(self):
        self.conn.close()


def college_map():
    # Map of colleges per region, read from the college table
    db = DataBase()
    sql = "select area, count(id) from college group by area"
    db.cur.execute(sql)
    ret = db.cur.fetchall()
    db.close()
    college_map = Map(init_opts=opts.InitOpts(width="1400px", height="900px")) \
        .set_global_opts(title_opts=opts.TitleOpts(title="全国高校分布图"),
                         visualmap_opts=opts.VisualMapOpts(max_=150))
    college_map.add('高校数量', list(ret)).set_series_opts(
        label_opts=opts.LabelOpts(is_show=True, color="#00f", formatter="{b}:{c}"))
    college_map.render('college_map.html')


def subject_map():
    # Bar chart: how many majors require each subject (counts hardcoded here)
    subject_map = Bar().add_xaxis(["物理", "化学", "生物", "政治", "历史", "地理"]) \
        .add_yaxis("", [11931, 5414, 3153, 761, 1057, 1015]) \
        .set_global_opts(title_opts=opts.TitleOpts(title="学科统计图"))
    subject_map.render("subject_map.html")


def subject_pie():
    # Pie chart of the subjects' share
    subject_count = [("物理", 11931), ("化学", 5414), ("生物", 3153), ("政治", 761), ("历史", 1057), ("地理", 1015)]
    subject_pie = Pie().add("", subject_count).set_global_opts(title_opts=opts.TitleOpts(title="学科比例图")) \
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    subject_pie.render("subject_pie.html")


if __name__ == "__main__":
    college_map()
    subject_map()
    subject_pie()
    print('Done')
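One caveat on the bar and pie charts: the per-subject counts (11931 for 物理 and so on) are hardcoded. If you would rather compute them from the major table each run, a sketch along these lines should work; the LIKE match rests on my assumption that the subject column stores the required subjects as plain text, which I have not verified against the live data.

[Python]
# Sketch: derive the per-subject counts from the major table instead of hardcoding them (my addition)
subjects = ["物理", "化学", "生物", "政治", "历史", "地理"]
db = DataBase()
counts = []
for name in subjects:
    # LIKE match, because a single requirement string may list several subjects
    db.cur.execute("select count(*) from major where subject like %s", ('%' + name + '%',))
    counts.append(db.cur.fetchone()[0])
db.close()
print(dict(zip(subjects, counts)))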
Database creation
[SQL]
# Drop the database
-- drop database college;
# Create the database
create database college_bak default character set utf8;
use college_bak;
# College table
create table college(
    id int unsigned auto_increment comment 'serial number',
    area varchar(20) not null default '' comment 'region',
    college_id char(5) not null default '00000' comment 'college code',
    college_name varchar(20) not null default '' comment 'college name',
    college_site varchar(128) not null default '' comment 'college website',
    unique (college_id),
    primary key(id)  # no trailing ',' here, otherwise the statement fails
) engine=InnoDB default charset=utf8 comment 'college table';
# Major/subject table
create table major(
    id int unsigned auto_increment comment 'serial number',
    college_id char(5) not null default '00000' comment 'college code',
    gradation varchar(10) not null default '' comment 'level',
    classification varchar(128) not null default '' comment 'major name',
    subject varchar(50) not null default '' comment 'subject requirement',
    major varchar(128) not null default '' comment 'included majors',
    primary key(id),
    constraint foreign key(college_id) references college(college_id)
) engine=InnoDB default charset=utf8 comment 'major/subject table';
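If you load this schema from a file rather than pasting it into a client, keep the statement order as written: the foreign key in major references college(college_id), so the college table must exist first. The small check below confirms both tables came up as expected; it is my own addition and simply reuses the connection settings from the scripts above.

[Python]
# Sketch: verify the schema after running the SQL above (my addition, same connection settings as the scripts)
import pymysql

conn = pymysql.connect(host='localhost', user='root', password='', database='college_bak')
cur = conn.cursor()
cur.execute("show tables")
print([row[0] for row in cur.fetchall()])   # expect ['college', 'major']
cur.execute("describe major")
for row in cur.fetchall():
    print(row)   # college_id should be char(5), matching college.college_id for the foreign key
conn.close()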
Related images
College distribution map
Download the finished scripts (rename the .txt extension to .zip):
spider.txt
(8.12 KB, 40 downloads)
PS: just a self-taught code monkey, the code is fairly rough, so please go easy on me.