爬取教育部人文社科项目

灵海之森 发表于 2021-2-28 12:09

API接口：http://pub.sinoss.net/portal/webgate/CmdNormalList
思路概述：先按条件检索，从检索结果列表中获取每个项目的负责人和 id，再用这两个数据拼接出项目详情页的网址，逐个进行爬取。
第一步：根据条件检索概览页面。
#coding='utf-8'
import requests
import re
import xlwt

# Request headers sent with every search-page request; they present the
# crawler as a desktop Chrome browser whose referer is Baidu.
headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/39.0.2171.95 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,*/*;q=0.8'
    ),
    'Referer': 'https://www.baidu.com/',
    'Connection': 'keep-alive',
}

# A search-page URL is url_1 + <page number> + url_2.
url_1 = 'http://pub.sinoss.net/webgate/CmdSearchNormal?curr_page='
url_2 = (
    '&total_pages=269&project_name=&SUBJECT_name=0&people_name='
    '&univ=0&year=2000&project_sort=5'
)
url_0 = [*range(1, 270)]  # page numbers 1..269 (same count as total_pages)
urls = []  # receives one complete search URL per page number

def require(url, timeout=(30, 50)):
    """Download a page and return its source text.

    Args:
        url: Address to request; the module-level ``headers`` are sent
            with it.
        timeout: ``(connect, read)`` timeout in seconds passed to
            ``requests.get``. Without a timeout a server that stops
            answering would block the crawl indefinitely.

    Returns:
        The response body decoded to text (``response.text``).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    print(response.status_code)  # HTTP status code
    print(response.encoding)  # encoding requests chose for decoding
    return response.text

# Turn every page number into a complete search URL.
urls.extend(url_1 + str(page_no) + url_2 for page_no in url_0)

def get_infor(one_url, html=None):
    """Collect the applicant name and project id of every project on a page.

    Args:
        one_url: URL of one search-result page.
        html: Optional page source that has already been downloaded. When
            it is given, ``one_url`` is not requested.

    Returns:
        A list of ``[name, project_id]`` rows (both strings), one per
        project link found inside the result table, in page order.
    """
    if html is None:
        html = require(one_url)

    # Project links live inside the 910px-wide result table.
    result_1 = re.findall(
        '<table width="910" border="0" cellspacing="1" cellpadding="5" '
        'bgcolor="E4EEFE" Align=center>(.*?)</table>',
        html,
        re.S,
    )
    print('result_1是：')
    print(result_1)

    # Each link looks like
    # href="/portal/webgate/CmdProjectView?proj_id=212334&applier=严法善"
    hrefs = []
    for table in result_1:
        hrefs.extend(
            re.findall('<a class=s href="(.*?)" target=_blank>', table, re.S)
        )
    print('i_2是：')
    print(hrefs)

    rows = []
    for href in hrefs:
        # Read both query parameters explicitly so each row pairs one name
        # with its own id, and digits inside a name cannot leak into the id.
        id_match = re.search(r'proj_id=(\d+)', href)
        name_match = re.search(r'applier=([^&]*)', href)
        if id_match is None or name_match is None:
            continue  # not a project-detail link
        rows.append([name_match.group(1).strip(), id_match.group(1)])

    print("姓名是")
    print([row[0] for row in rows])
    print("id是")
    print([row[1] for row in rows])
    return rows

"""保存为excel"""
# Workbook that receives the results; row 0 carries the column titles.
f = xlwt.Workbook()
sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
for header_col, header_title in enumerate(('姓名', 'id')):
    sheet1.write(0, header_col, header_title)

if __name__ == "__main__":
    # Crawl every search page; each call returns the rows of one page.
    alls = [get_infor(one_url) for one_url in urls]

    row = 1  # row 0 already holds the column titles
    for page_rows in alls:  # one entry per page
        for data in page_rows:  # one entry per project
            for col, value in enumerate(data):
                # Write the single cell value, not the whole row list.
                sheet1.write(row, col, value)
            row += 1
    f.save('2000年网址.xls')
   #保存所有

第二步：依次组配单项目详情网址，爬取具体数据。
#coding='utf-8'
import requests
import re
import time as t
import xlwt
import xlrd
import os
import pandas as pd
from openpyxl import Workbook
from urllib.parse import quote
import random

def extract(inpath, l):
    """Read one column from the first worksheet of an ``.xls`` workbook.

    Args:
        inpath: Path of the workbook to open.
        l: Zero-based index of the column to read (in the file written by
            the first script, column 0 is the name and column 1 is the id).

    Returns:
        A list with the value of column ``l`` for every row from row 1
        downwards; row 0 is skipped because it is the header.
    """
    data = xlrd.open_workbook(inpath, encoding_override='gb2312')
    # sheets() returns a list of worksheets; the data is in the first one.
    table = data.sheets()[0]
    return [table.row_values(row)[l] for row in range(1, table.nrows)]

# Request headers for the project-detail pages: a desktop Chrome signature,
# the project list page as referer, and no connection reuse.
headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/39.0.2171.95 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,*/*;q=0.8'
    ),
    'Referer': 'http://pub.sinoss.net/portal/webgate/CmdNormalList',
    'Connection': 'close',
}

# A project-detail URL is url_1 + <project id> + url_2 + <encoded name>.
url_1 = 'http://pub.sinoss.net/portal/webgate/CmdProjectView?proj_id='
url_2 = '&applier='
url_3 = extract('2000年网址.xls', 1)  # project ids
url_4 = extract('2000年网址.xls', 0)  # applicant names, URL-encoded below

urls = []  # one detail-page URL per project

for proj_id, applier in zip(url_3, url_4):
    # Pair each id with its own name; the name is percent-encoded from its
    # GBK bytes rather than sent as raw text.
    urls.append(
        url_1 + str(proj_id) + url_2 + quote(str(applier).encode('gbk'))
    )

def require(url):
    """Download a page, retrying every 5 seconds until a request succeeds.

    The request itself is a plain ``requests.get`` call, so no session or
    adapter setup is needed here.

    Args:
        url: Address to request; the module-level ``headers`` are sent
            with it.

    Returns:
        The response body decoded to text (``response.text``).
    """
    while True:
        try:
            response = requests.get(
                url, headers=headers, timeout=(30, 50), verify=False
            )
        # Only network-level failures are retried. A bare ``except`` would
        # also swallow KeyboardInterrupt and make this loop unstoppable.
        except requests.exceptions.RequestException as err:
            print("Connection refused by the server..")
            print(err)
            print("Let me sleep for 5 seconds")
            print("ZZzzzz...")
            t.sleep(5)
            print("Was a nice sleep, now let me continue...")
        else:
            return response.text

def get_infor(one_url, html=None):
    """Extract the cleaned field values of one project-detail page.

    Args:
        one_url: URL of a project-detail page.
        html: Optional page source that has already been downloaded. When
            it is given, ``one_url`` is not requested.

    Returns:
        A list of strings, one per value cell of the detail table, with
        spaces, tabs and line breaks removed.
    """
    if html is None:
        html = require(one_url)

    # The project fields live inside the 908px-wide detail table.
    tables = re.findall(
        '<table width="908" border="0" cellspacing="1" cellpadding="5" '
        'bgcolor="E4EEFE" Align=center>(.*?)</table>',
        html,
        re.S,
    )

    i_3 = []  # cleaned cell values
    for table in tables:
        # Match on the real table text, not on str(list): the list repr
        # escapes quotes/backslashes and turns line breaks into literal
        # "\r\n" sequences.
        cells = re.findall(
            '<td bgcolor="#F3F3F3" align=left height=23>(.*?)</td>',
            table,
            re.S,
        )
        for cell in cells:
            cell = re.sub(r'[ \t\r\n]+', '', cell)
            # A match can run on into the following header cell (presumably
            # when a value cell has no closing tag); drop that tail.
            cell = cell.replace(
                '</tr><tr><tdbgcolor="#3E7CC2"class=talign=center>'
                '<fontcolor="#FFFFFF"><b>批准经费</b></font>',
                '',
            )
            i_3.append(cell)
    print(i_3)
    return i_3

"""保存为excel"""
# Workbook that receives the project details; row 0 carries the titles.
f = xlwt.Workbook()
sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
for header_col, header_title in enumerate((
    '立项年份',
    '所属院校',
    '项目名称',
    '项目来源',
    '项目编号',
    '负责人',
    '成果类型',
    '批准经费',
)):
    sheet1.write(0, header_col, header_title)

if __name__ == "__main__":
    alls = []  # one list of cleaned field values per project
    for one_url in urls:
        print(one_url)
        alls.append(get_infor(one_url))
        # Random 1-3 second pause between requests to slow the crawl down.
        t.sleep(random.randint(1, 3))

    # Row 0 already holds the column titles, so data starts at row 1.
    for row, fields in enumerate(alls, start=1):
        for col, value in enumerate(fields):
            # Write the single field value, not the whole list.
            sheet1.write(row, col, value)
    f.save('2000年度教育部人文社会科学研究一般项目立项一览表.xls')
#保存所有

爬取时需要注意等待时间长一点，不然网站会拒绝请求。

yeast 发表于 2021-2-28 12:15

谢谢，正好需要搜集

xiaobaisky 发表于 2021-2-28 12:23

学习了，谢谢！

liuhaifei520sdo 发表于 2021-2-28 12:25

学习了，谢谢！

status_0 发表于 2021-2-28 12:31

谢谢学习了

领头羊 发表于 2021-2-28 12:31

学习学习，谢谢！

sunjianxin33 发表于 2021-2-28 12:33

学习学习，正需要这种技术

hui20200721 发表于 2021-2-28 12:33

学习学习

风乘云集水浒卡 发表于 2021-2-28 12:36

学习学习

上百度 发表于 2021-2-28 12:44

学习了，感谢

页: [1] 2

吾爱破解 - 52pojie.cn's Archiver

爬取教育部人文社科项目