Python爬虫 爬取考研信息
准备考研,就自己捣鼓了一个爬取考研院校各专业信息的脚本。想考哪个省的,直接输入省份就 OK。代码不是很完善,有能力的大佬就自己动手改吧。
直接上代码↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
import requests
import re
import os
class Kaoyan():
    """Scrape per-major postgraduate admission details from school.kaoyan.cn.

    Workflow (see ``run``): ask for a province, list its schools, walk each
    school's subject page, fetch every major's detail page and append a
    cleaned-up text record to ``_<school name>.txt`` in the working directory.
    """

    # Seconds to wait for the server, so one stalled connection cannot hang
    # the whole crawl.
    _TIMEOUT = 10

    # Markup fragments removed from the extracted text before it is saved.
    _TAGS = ('</p>', '<p>', '<br/>', '<br>')

    def _fetch(self, url, params=None):
        """Return the body of a GET request to *url* as text."""
        return requests.get(url, params=params, timeout=self._TIMEOUT).text

    @staticmethod
    def _first(matches):
        """Return the first regex match, or '' when the field is missing."""
        return matches[0] if matches else ''

    def get_url(self, shen):
        """Fetch the school list for province *shen* and return it as text.

        The response text is passed through ``unicode_escape`` so that
        ``\\uXXXX`` sequences become readable characters.

        NOTE(review): only page 1 is requested; whether the endpoint
        paginates is not visible from this code.
        """
        url = 'https://school.kaoyan.cn/jd/school/lists'
        params = {
            'format': 'json',
            'page': 1,
            'province': shen,
            'class': '',
            'type': '',
        }
        return self._fetch(url, params).encode().decode('unicode_escape')

    def get_name(self, resp):
        """Return a ``{school name: school id}`` dict parsed from *resp*.

        Names and ids are extracted independently and paired by position.
        """
        list_name = re.findall('name":"(.*?)"', resp)
        school_id = re.findall('"school_id":"(.*?)"', resp)
        return dict(zip(list_name, school_id))

    def get_zhuanye(self, school_xx):
        """Crawl the subject page of every school in *school_xx*.

        *school_xx* maps school name -> school id (as built by ``get_name``).
        For each school the majors found on its subject page are handed to
        ``get_zhuanye_xinxi`` as a ``{major name: relative href}`` dict.
        """
        for school_name, school_id in school_xx.items():
            # The id of the current school goes into the query string (the
            # previous code interpolated the whole dict here).
            url = f'https://school.kaoyan.cn/jd/school/subject?school_id={school_id}'
            resp = self._fetch(url)
            # Sample of the markup being matched:
            #   href="/jd/school/specialdetail?id=281896"><p>major name</p>
            # The slices drop the first link and the last three paragraphs;
            # presumably those are not majors - TODO confirm against the page.
            href = re.findall('href="(.*?)"', resp)[1:]
            zhuanye = re.findall('<p>(.*?)<', resp)[:-3]
            school_zhuanye = dict(zip(zhuanye, href))
            print(f'开始获取:{school_name}')
            self.get_zhuanye_xinxi(school_zhuanye, school_name)

    def re_max(self, resp, name):
        """Extract one major's details from *resp* and append them to a file.

        The record (title, admission info, research direction, enrolment
        count, exam subjects, remarks, then a separator line) is appended to
        ``_<name>.txt``. Fields missing from the page are written as empty
        strings instead of raising ``IndexError``.

        The file is always written as UTF-8, so text that the platform's
        default encoding cannot represent is no longer silently dropped.
        """
        title = self._first(re.findall('class="promotipn-title">(.*?)<', resp, re.S))
        info = self._first(re.findall('<div class="inner-body".*?<p>(.*?)</div>', resp, re.S))
        # Fields of the per-major admission details section.
        direction = self._first(re.findall('研究方向:</span.*?class="right-text">(.*?)</span>', resp, re.S))
        headcount = self._first(re.findall('招生人数:</span.*?ss="right-text">(.*?)</span>', resp, re.S))
        subjects = self._first(re.findall('>考试科目:</sp.*?ss="right-text">(.*?)</span>', resp, re.S))
        remarks = self._first(re.findall('>备注信息:</spa.*? class="right-text">(.*?)</span>', resp, re.S))

        print(f'开始写入:{name}----{title}')
        record = ''.join([
            title.strip(), '\n',
            info.strip(), '\n',
            '专业招生详情', direction.strip(), '\n',
            headcount.strip(), '\n',
            subjects.strip(), '\n',
            remarks.strip(), '\n',
            '*' * 100, '\n',
        ])
        # Normalise line endings coming from the page, then strip leftover
        # markup in memory (no intermediate file to leak if something fails).
        record = record.replace('\r\n', '\n').replace('\r', '\n')
        for tag in self._TAGS:
            record = record.replace(tag, '')
        with open(f'_{name}.txt', 'a', encoding='utf-8') as f:
            f.write(record)

    def get_zhuanye_xinxi(self, zhuanye, name):
        """Fetch every major's detail page and save it via ``re_max``.

        *zhuanye* maps major name -> site-relative href; *name* is the school
        name used for the output file.
        """
        for path in zhuanye.values():
            # Join the site root with this major's href (the previous code
            # tried to concatenate the whole dict, which raises TypeError).
            resp = self._fetch('https://school.kaoyan.cn' + path)
            self.re_max(resp, name)

    def run(self):
        """Prompt for a province and crawl all of its schools."""
        shen = input('输入想获取的省份:')
        school_xx = self.get_name(self.get_url(shen))
        self.get_zhuanye(school_xx)
# Script entry point: start an interactive crawl only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    Kaoyan().run()
爬完之后发现也没多大用处
就这样吧!
过来学习一下
感谢分享
没想到看到我母校了,快逃
import requests
import re
import os
class Kaoyan():
    """Scrape per-major postgraduate admission details from school.kaoyan.cn.

    Workflow (see ``run``): ask for a province, list its schools, walk each
    school's subject page, fetch every major's detail page and append a
    cleaned-up text record to ``_<school name>.txt`` in the working directory.
    """

    # Seconds to wait for the server, so one stalled connection cannot hang
    # the whole crawl.
    _TIMEOUT = 10

    # Markup fragments removed from the extracted text before it is saved.
    _TAGS = ('</p>', '<p>', '<br/>', '<br>')

    def _fetch(self, url, params=None):
        """Return the body of a GET request to *url* as text."""
        return requests.get(url, params=params, timeout=self._TIMEOUT).text

    @staticmethod
    def _first(matches):
        """Return the first regex match, or '' when the field is missing."""
        return matches[0] if matches else ''

    def get_url(self, shen):
        """Fetch the school list for province *shen* and return it as text.

        The response text is passed through ``unicode_escape`` so that
        ``\\uXXXX`` sequences become readable characters.

        NOTE(review): only page 1 is requested; whether the endpoint
        paginates is not visible from this code.
        """
        url = 'https://school.kaoyan.cn/jd/school/lists'
        params = {
            'format': 'json',
            'page': 1,
            'province': shen,
            'class': '',
            'type': '',
        }
        return self._fetch(url, params).encode().decode('unicode_escape')

    def get_name(self, resp):
        """Return a ``{school name: school id}`` dict parsed from *resp*.

        Names and ids are extracted independently and paired by position.
        """
        list_name = re.findall('name":"(.*?)"', resp)
        school_id = re.findall('"school_id":"(.*?)"', resp)
        return dict(zip(list_name, school_id))

    def get_zhuanye(self, school_xx):
        """Crawl the subject page of every school in *school_xx*.

        *school_xx* maps school name -> school id (as built by ``get_name``).
        For each school the majors found on its subject page are handed to
        ``get_zhuanye_xinxi`` as a ``{major name: relative href}`` dict.
        """
        for school_name, school_id in school_xx.items():
            # The id of the current school goes into the query string (the
            # previous code interpolated the whole dict here).
            url = f'https://school.kaoyan.cn/jd/school/subject?school_id={school_id}'
            resp = self._fetch(url)
            # Sample of the markup being matched:
            #   href="/jd/school/specialdetail?id=281896"><p>major name</p>
            # NOTE(review): every href on the page is paired positionally
            # with the paragraphs (minus the last three); if the page carries
            # a leading non-major link the pairs are shifted - confirm
            # against the live page.
            href = re.findall('href="(.*?)"', resp)
            zhuanye = re.findall('<p>(.*?)<', resp)[:-3]
            school_zhuanye = dict(zip(zhuanye, href))
            print(f'开始获取:{school_name}')
            self.get_zhuanye_xinxi(school_zhuanye, school_name)

    def re_max(self, resp, name):
        """Extract one major's details from *resp* and append them to a file.

        The record (title, admission info, research direction, enrolment
        count, exam subjects, remarks, then a separator line) is appended to
        ``_<name>.txt``. ``re.findall`` returns lists, so the first match of
        each field is taken (the previous code called ``.strip()`` on the
        lists themselves, which raises ``AttributeError``); fields missing
        from the page are written as empty strings.

        The file is always written as UTF-8, so text that the platform's
        default encoding cannot represent is no longer silently dropped.
        """
        title = self._first(re.findall('class="promotipn-title">(.*?)<', resp, re.S))
        info = self._first(re.findall('<div class="inner-body".*?<p>(.*?)</div>', resp, re.S))
        # Fields of the per-major admission details section.
        direction = self._first(re.findall('研究方向:</span.*?class="right-text">(.*?)</span>', resp, re.S))
        headcount = self._first(re.findall('招生人数:</span.*?ss="right-text">(.*?)</span>', resp, re.S))
        subjects = self._first(re.findall('>考试科目:</sp.*?ss="right-text">(.*?)</span>', resp, re.S))
        remarks = self._first(re.findall('>备注信息:</spa.*? class="right-text">(.*?)</span>', resp, re.S))

        print(f'开始写入:{name}----{title}')
        record = ''.join([
            title.strip(), '\n',
            info.strip(), '\n',
            '专业招生详情', direction.strip(), '\n',
            headcount.strip(), '\n',
            subjects.strip(), '\n',
            remarks.strip(), '\n',
            '*' * 100, '\n',
        ])
        # Normalise line endings coming from the page, then strip leftover
        # markup in memory (no intermediate file to leak if something fails).
        record = record.replace('\r\n', '\n').replace('\r', '\n')
        for tag in self._TAGS:
            record = record.replace(tag, '')
        with open(f'_{name}.txt', 'a', encoding='utf-8') as f:
            f.write(record)

    def get_zhuanye_xinxi(self, zhuanye, name):
        """Fetch every major's detail page and save it via ``re_max``.

        *zhuanye* maps major name -> site-relative href; *name* is the school
        name used for the output file.
        """
        for path in zhuanye.values():
            # Join the site root with this major's href (the previous code
            # tried to concatenate the whole dict, which raises TypeError).
            resp = self._fetch('https://school.kaoyan.cn' + path)
            self.re_max(resp, name)

    def run(self):
        """Prompt for a province and crawl all of its schools."""
        shen = input('输入想获取的省份:')
        school_xx = self.get_name(self.get_url(shen))
        self.get_zhuanye(school_xx)
# Script entry point: start an interactive crawl only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    ky = Kaoyan()
    ky.run()

# Forum replies that the original paste fused onto the `ky.run()` line
# (which made that line a syntax error); quoted verbatim:
# 多谢分享
# 学习一下,谢谢分享啦
# 谢谢分享
# 哈哈哈哈哈哈,绝了,属于是无聊没事练练爬虫了,可以考虑做个ip池,挂个DL,这样子多少防着点被请喝茶。
# 过来学习一下,谢谢分享
页:
[1]
2