import requests
from bs4 import BeautifulSoup
import re
import json
import time
# Standard request headers for the mobile 51job listing pages: Cookie,
# Referer, Host etc. were captured from a real browser session.
# The User-Agent placeholder string must be replaced with your own.
headers = {
    'User-Agent': '这个需要替换成你自己的',
    'Accept-Encoding': 'gzip, deflate',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Connection': 'keep-alive',
    'Host': 'm.51job.com',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    # NOTE(review): original Referer contained "°ree=99" — mojibake of the
    # "&degree=99" query parameter (the "&deg" ran into an HTML entity);
    # restored here. Confirm against a fresh browser capture.
    'Referer': 'https://m.51job.com/search/joblist.php?keyword=python&keywordtype=2&funtype=0000&indtype=00&jobarea=000000&jobterm=99&cotype=99&issuedate=9&saltype=99&degree=99&landmark=0&workyear=99&cosize=99&radius=-1&lonlat=0%2C0',
    'Cookie': 'guid=f20363b27d082fd30e051677795994d9; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60040000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60240000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; m_search=keyword%3Dpython%26%7C%26areacode%3D040000; partner=51jobhtml5',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    # Fixed: the real Fetch-Metadata header is 'Sec-Fetch-User'
    # (original said 'Sec-Fetch-Use').
    'Sec-Fetch-User': '?1',
}
def get_page(code, keyword, pagenumber):
    """Crawl ``pagenumber`` pages of 51job mobile search results and scrape
    every job-detail URL found on them.

    code: 51job area code as a string (e.g. '040000').
    keyword: search keyword (e.g. 'python').
    pagenumber: number of listing pages to fetch (int or numeric string).

    Returns the last ``requests.Response`` fetched, or None when
    ``pagenumber`` is 0 (the original crashed with NameError in that case).
    """
    response = None
    for page in range(1, int(pagenumber) + 1):
        url = ('https://m.51job.com/search/joblist.php'
               '?jobarea={}&keyword={}&keywordtype=2'
               '&lonlat=0%2C0&radius=-1&pageno={}').format(code, keyword, page)
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content.decode('utf-8'), 'html5lib')
        # Data cleaning: pick the detail-page URLs out of the listing markup.
        detail_urls = re.findall('href="(.*?)"><b class="jobid"', str(soup))
        print(detail_urls)
        for count, detail_url in enumerate(detail_urls, start=1):
            try:
                get_info(detail_url)
                print('已经读取', count, '份信息')
            except Exception as e:
                # One broken detail page must not abort the whole crawl.
                print(e)
        time.sleep(5)  # be polite between listing pages
    return response
def insert_db(info_dict):
    """Insert one scraped job posting into the MySQL ``job_info`` table.

    info_dict: dict of string fields produced by ``get_info``; missing keys
    default to ''.  The table is created automatically on first use.
    Raises the underlying DB error after rolling back; the session is
    always closed.
    """
    import time
    from sqlalchemy import Column, Integer, String
    from sqlalchemy import create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import sessionmaker

    # NOTE(review): engine + ORM model are rebuilt on every call — wasteful
    # but preserved from the original layout; hoist to module level if this
    # becomes a hot path.
    engine = create_engine(
        'mysql+pymysql://root:root@localhost:3306/51job?charset=utf8',
        echo=True)
    Base = declarative_base()

    class table_info(Base):
        __tablename__ = 'job_info'
        id = Column(Integer(), primary_key=True)
        job_id = Column(String(100), comment='职位ID')
        company_name = Column(String(100), comment='企业名称')
        company_type = Column(String(100), comment='企业类型')
        company_scale = Column(String(100), comment='企业规模')
        company_trade = Column(String(100), comment='企业经营范围')
        company_welfare = Column(String(1000), comment='企业福利')
        job_name = Column(String(3000), comment='职位名称')
        job_pay = Column(String(100), comment='职位薪酬')
        job_years = Column(String(100), comment='工龄要求')
        job_education = Column(String(100), comment='学历要求')
        job_member = Column(String(100), comment='招聘人数')
        job_location = Column(String(3000), comment='上班地址')
        job_describe = Column(String(3000), comment='工作描述')
        job_date = Column(String(100), comment='发布日期')
        recruit_sources = Column(String(100), comment='招聘来源')
        log_date = Column(String(100), comment='记录日期')

    Base.metadata.create_all(engine)  # no-op when the table already exists

    session = sessionmaker(bind=engine)()
    try:
        session.add(table_info(
            job_id=info_dict.get('job_id', ''),
            company_name=info_dict.get('company_name'),
            company_type=info_dict.get('company_type', ''),
            company_trade=info_dict.get('company_trade', ''),
            company_scale=info_dict.get('company_scale', ''),
            job_name=info_dict.get('job_name', ''),
            job_pay=info_dict.get('job_pay', ''),
            job_years=info_dict.get('job_years', ''),
            job_education=info_dict.get('job_education', ''),
            job_member=info_dict.get('job_member', ''),
            job_location=info_dict.get('job_location', ''),
            job_describe=info_dict.get('job_describe', ''),
            recruit_sources=info_dict.get('recruit_sources', ''),
            job_date=info_dict.get('job_date', ''),
            company_welfare=info_dict.get('company_welfare'),
            log_date=time.strftime('%Y-%m-%d', time.localtime()),
        ))
        session.commit()
    except Exception:
        session.rollback()  # don't leave a half-open transaction behind
        raise
    finally:
        session.close()  # original leaked the session on any failure
def get_info(url):
    """Fetch one 51job mobile job-detail page, scrape its fields and store
    them via ``insert_db``.

    Returns the scraped dict, or None when ``url`` is not an m.51job.com
    page or scraping/insertion failed.  (The original executed
    ``return temp_dict`` even on the non-matching branch, where the name
    was never bound — a guaranteed NameError; fixed here.)
    """
    print(url)
    if 'm.51job.com' not in url:
        print('读取结束')
        return None

    # The detail-page request differs from the listing request: different
    # Host (msearch.51job.com) and the captured Cookie must be sent.
    headers = {
        'User-Agent': '同上',
        'Accept-Encoding': 'gzip, deflate',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Connection': 'keep-alive',
        'Host': 'msearch.51job.com',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        # NOTE(review): "°ree=99" restored to "&degree=99" (HTML-entity
        # mojibake in the captured Referer) — confirm against a fresh capture.
        'Referer': 'https://m.51job.com/search/joblist.php?keyword=python&keywordtype=2&funtype=0000&indtype=00&jobarea=000000&jobterm=99&cotype=99&issuedate=9&saltype=99&degree=99&landmark=0&workyear=99&cosize=99&radius=-1&lonlat=0%2C0',
        'Cookie': 'guid=f20363b27d082fd30e051677795994d9; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60040000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60240000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; m_search=keyword%3Dpython%26%7C%26areacode%3D040000; partner=51jobhtml5',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        # Fixed typo: real header name is 'Sec-Fetch-User'.
        'Sec-Fetch-User': '?1',
    }

    def _scrape(extract, default='kong'):
        # Best-effort field extraction: any missing/changed markup yields
        # the 'kong' ("empty") placeholder instead of aborting the page.
        try:
            return extract()
        except Exception:
            return default

    temp_dict = None
    try:
        r = requests.get(url, headers=headers)
        print(r)
        time.sleep(1.5)
        soup = BeautifulSoup(r.content.decode('utf-8'), 'html5lib')

        # Data cleaning.
        temp_dict = {'job_id': 'python'}
        temp_dict['job_name'] = _scrape(lambda: soup.find('div', class_='jt').find('p').getText())
        temp_dict['job_date'] = _scrape(lambda: soup.find('div', class_='jt').find('span').getText())
        temp_dict['job_pay'] = _scrape(lambda: soup.find('p', class_='jp').getText())
        temp_dict['company_name'] = _scrape(lambda: soup.find('p', class_='c_444').getText())

        # "type | scale | trade" — pad so a short real value can't IndexError.
        qy_info = _scrape(lambda: soup.find('div', class_='at').getText(),
                          'kong0|kong1|kong2')
        parts = qy_info.split('|') + ['kong', 'kong']
        temp_dict['company_type'] = parts[0]
        temp_dict['company_scale'] = parts[1]
        temp_dict['company_trade'] = parts[2]

        temp_dict['job_location'] = _scrape(lambda: soup.find('a', class_='arr a2').find('span').getText())
        temp_dict['job_member'] = _scrape(lambda: soup.find('div', class_='jd').find('span').getText())
        temp_dict['job_education'] = _scrape(lambda: soup.find('span', class_='s_x').getText())
        temp_dict['job_years'] = _scrape(lambda: soup.find('span', class_='s_n').getText())
        # NOTE(review): '//t' looks like it was meant to be '\t' — kept as-is
        # to preserve behavior; confirm intent.
        temp_dict['job_describe'] = _scrape(
            lambda: soup.find('article').getText().replace('//t', '')
            + soup.find('br').getText().replace('//t', ''))
        temp_dict['company_welfare'] = _scrape(lambda: soup.find('div', class_='welfare').find('span').getText())

        insert_db(temp_dict)
    except Exception:
        print('出错 跳出')
        return None
    return temp_dict
print(get_page('040000','python','2'))