本帖最后由 0xSui 于 2021-8-16 08:31 编辑
页面的代码结构有部分变动,我修改了对 div 和 class 的判断,现在可以正常识别题目了。
顺便查看了下,官网的题目(/m)是移动端页面,(/jq)是pc页面。
今天单位要求答题,打开链接准备作答时,页面提示电脑上无法答题,只能用微信扫码;看了下网页内容,发现是问卷星的答题卷。
题目和选项都直接写在页面里,所以就简单写了个爬虫:用 requests 下载页面、BeautifulSoup 解析,用 Faker 生成随机 User-Agent,抓取到的内容用 pymysql 存入 MySQL 数据库;
题目的选项个数不一定统一,所以数据大家使用的时候,可以直接把抓取到的题目都放到一个字段里面存着,后面要用的时候,直接读出来,遍历就行;
再就是,这网站暂时没有防爬机制,所以直接while循环遍历那种随机题目的地址,就能刷出来整个题库的题目(稍微改改代码就能爬整个网站的题库了);
代码比较简单,简单分享,仅供学习~
[Python] 纯文本查看 复制代码
# -*- coding: utf-8 -*-
import time
import requests
import random
from bs4 import BeautifulSoup
import pymysql
from faker import Faker
# Shared Faker instance (Chinese locale); ua() uses it to produce User-Agent strings.
fake = Faker(locale='zh_CN')
# MySQL connection settings read by db_insert(); edit them here.
# Database host
mysql_host = '127.0.0.1'
# Database name
mysql_db = 'wjx'
# Account (user name)
mysql_user = 'root'
# Password
# NOTE(review): hard-coded credential; consider loading it from the environment.
mysql_password = '123'
# Port
mysql_port = 3306
# NOTE(review): not referenced by any function visible in this listing.
pages = []
def ua(refer_str):
    """Build HTTP request headers with a Faker-generated User-Agent.

    Args:
        refer_str: Value for the ``Referer`` header. When it is empty or
            ``None`` no ``Referer`` header is added.

    Returns:
        dict: Headers containing ``User-Agent`` and, when a referer was
        given, ``Referer``.
    """
    headers = {'User-Agent': fake.user_agent()}
    # Truthiness check instead of len(): also tolerates None.
    if refer_str:
        headers['Referer'] = refer_str
    return headers
def db_insert(title, a, b, c):
    """Insert one question and its three answer options into ``questions``.

    A new connection is opened for every call using the module-level
    ``mysql_*`` settings and is always closed again, even when the insert
    or the commit raises.

    Args:
        title: Question text.
        a: Text of the first option.
        b: Text of the second option.
        c: Text of the third option.

    Raises:
        pymysql.MySQLError: If connecting, executing or committing fails.
    """
    # "insert ignore": presumably relies on a unique key on the table so that
    # re-scraped questions are skipped instead of raising -- verify the schema.
    sql_s = '''insert ignore into questions (title, answer_a, answer_b, answer_c) values(%s, %s, %s, %s)'''
    db = pymysql.connect(host=mysql_host, port=mysql_port, user=mysql_user,
                         password=mysql_password, db=mysql_db, charset='utf8')
    try:
        # Values are passed as parameters, never formatted into the SQL text.
        with db.cursor() as cursor:
            cursor.execute(sql_s, (title, a, b, c))
        db.commit()
    finally:
        # Release the connection on success and on failure alike.
        db.close()
def html_get(url, header, timeout=10, retry_delay=1, max_retries=None):
    """Download ``url`` and return the raw response body.

    Failed requests are retried. With the default ``max_retries=None`` the
    function keeps retrying until a response is received.

    Args:
        url: Address to fetch.
        header: Dict of HTTP headers sent with the request.
        timeout: Seconds to wait for the server before the attempt counts
            as failed, so a stalled connection cannot hang forever.
        retry_delay: Seconds to sleep between attempts, so a persistent
            failure does not turn into a busy loop.
        max_retries: Maximum number of retries after the first failure, or
            ``None`` for unlimited.

    Returns:
        bytes: The response content. The HTTP status code is printed but
        not checked.

    Raises:
        requests.RequestException: Only when ``max_retries`` is set and
            exhausted; the last error is re-raised.
    """
    # The session is closed when the ``with`` block exits.
    with requests.Session() as s:
        failures = 0
        while True:
            try:
                res_get = s.get(url, headers=header, timeout=timeout)
            except requests.RequestException as e:
                print(e)
                print('下载出错: %s' % url)
                failures += 1
                if max_retries is not None and failures > max_retries:
                    raise
                time.sleep(retry_delay)
                continue
            print(res_get.status_code)
            return res_get.content
def get_wjx(wj_url):
    """Scrape one questionnaire page and store its questions in MySQL.

    The page is downloaded with ``html_get``. Every
    ``div.field.ui-field-contain`` element is treated as one question: its
    title comes from the ``div.field-label`` element and its options from
    the ``div.label`` elements. Questions without options are skipped.

    Only the first three options are stored, because ``db_insert`` accepts
    exactly three answer columns. Questions with fewer options are padded
    with empty strings.

    Args:
        wj_url: URL of the questionnaire page.
    """
    html_str = html_get(wj_url, ua(refer_str='http://ks.wjx.top'))
    # Tolerate stray invalid bytes instead of aborting the whole scrape.
    soup = BeautifulSoup(html_str.decode('utf-8', errors='replace'), 'html.parser')
    fields = soup.find_all('div', attrs={'class': 'field ui-field-contain'})
    for item in fields:
        title_nodes = item.find_all('div', attrs={'class': 'field-label'})
        option_nodes = item.find_all('div', attrs={'class': 'label'})
        # Use the last title element, or an empty title when none is present.
        t = title_nodes[-1].get_text() if title_nodes else ''
        ss = [node.get_text() for node in option_nodes]
        if not ss:
            print('跳过空白题目')
            continue
        print(t)
        print(ss)
        # The number of options varies per question: pad or truncate to the
        # three answer columns that db_insert() writes.
        s_a, s_b, s_c = (ss + [''] * 3)[:3]
        db_insert(title=t, a=s_a, b=s_b, c=s_c)
def job(url='https://ks.wjx.top/jq/63596149.aspx', interval=3):
    """Scrape the questionnaire page repeatedly, forever.

    The function never returns; stop it by interrupting the process.
    NOTE(review): the forum post says the page serves random questions, so
    repeated fetches collect the question bank -- confirm for other URLs.

    Args:
        url: Questionnaire page passed to ``get_wjx`` on every round.
        interval: Seconds to sleep between two rounds.
    """
    print('开始抓取')
    while True:
        get_wjx(wj_url=url)
        time.sleep(interval)
if __name__ == "__main__":
    # Debug entry point: start the scraping loop when run as a script.
    job()
|