Scraping 51job with Python
This is the first time I'm posting something I made on 52pojie, so the approach may be crude. The main part of this post is scraping 51job job-detail pages and importing them into a database.
But first, there are two questions I'd like to ask:
1. I was scraping a video site earlier and found the final playback URL of a certain film. My intent was to download it locally. After opening that URL in a browser, F12 shows a series of segment files in .ts format, which should be what I ultimately need to download. But comparing the .ts URLs, there is a numeric part whose origin I could never trace. I'd appreciate an idea, or even just a direction.
2. Following the approach in a book, I scraped QQ Music and successfully got part of 许嵩's songs. My plan was to go one level up and walk the singer list, which needs each singer's mid (a singer ID of sorts). The IDs are visible in F12, but when I clean the response with BeautifulSoup, part of the page's HTML is missing from the response, and it's exactly the part I need. I've thought about finding the singer info page by page by hand, saving it to a file, and having my earlier script read that, but that feels a bit cheap. Pointers would be appreciated.
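(My working guess on question 2: pages like the singer list are filled in by JavaScript after the initial HTML loads, which would explain why the response BeautifulSoup sees is missing exactly that part. The data usually travels in a separate JSON request that can be found under F12 → Network → XHR and called directly. A minimal sketch of the idea, using httpbin.org as a stand-in endpoint since I don't know QQ Music's real API:)

```
import requests

# When a page is filled in by JavaScript, the missing HTML usually arrives via a
# separate JSON request visible under F12 -> Network -> XHR; calling that URL
# directly sidesteps the incomplete-HTML problem entirely.
# httpbin.org merely echoes the request back as JSON, standing in for the real API.
r = requests.get('https://httpbin.org/get',
                 params={'page': 1, 'keyword': 'singer'},
                 headers={'User-Agent': 'Mozilla/5.0'})
data = r.json()      # already structured data, no HTML parsing needed
print(data['args'])  # -> {'keyword': 'singer', 'page': '1'}
```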
Now for the main part: a record of scraping 51job job details and importing them into a database, with the corresponding source code attached (plus the city codes in text format).
The overall idea: look at the URL and work out its components, then clean the data level by level to extract what you want, rebuild the URLs to visit, clean the responses into dict form, and finally insert into the database. The data contains Chinese characters, and storing those in the database needs an extra setting (charset=utf8 in the connection string, as in the code below), otherwise it errors out.
Pitfalls I hit: once I added the headers I didn't run into much; I think the key is that the Cookie absolutely has to be there.
I added a lot of try blocks as error tolerance. Some may be unnecessary, but the job-detail pages don't all share a fixed layout: a find() call can come up empty and then raise, so the try blocks really are needed.
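(One way to keep those try blocks from multiplying is a small helper; safe_find is my own name for this sketch, not part of the script below:)

```
from bs4 import BeautifulSoup

def safe_find(soup, *args, default='kong', **kwargs):
    """Return the text of soup.find(...), or `default` when the tag is missing."""
    tag = soup.find(*args, **kwargs)
    return tag.getText() if tag is not None else default

soup = BeautifulSoup('<p class="jp">6-8k/month</p>', 'html.parser')
print(safe_find(soup, 'p', class_='jp'))         # -> 6-8k/month
print(safe_find(soup, 'div', class_='missing'))  # -> kong
```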
I was fairly lost when inserting into the database, since I've used it very little: first connect, then create the mapping, then... commit, something like that (I'm still not entirely clear on it).
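(For anyone equally fuzzy on that flow, here is a minimal sketch of the same SQLAlchemy steps, using in-memory SQLite so it runs without a MySQL server:)

```
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///:memory:')  # stand-in for the MySQL URL
Base = declarative_base()

class Job(Base):                     # 1. declare the class <-> table mapping
    __tablename__ = 'job'
    id = Column(Integer, primary_key=True)
    name = Column(String(100))

Base.metadata.create_all(engine)     # 2. create the table
Session = sessionmaker(bind=engine)  # 3. a session factory bound to the engine
session = Session()
session.add(Job(name='python'))      # 4. stage a row in the session
session.commit()                     # 5. write it to the database
print(session.query(Job).count())    # -> 1
session.close()
```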
All in all, I've only scraped three sites so far, and this is the only one that was reasonably successful.
```
import requests
from bs4 import BeautifulSoup
import re
import json
import time
# Standard headers — Cookie, Referer, Host, etc.; inspect your own request in DevTools.
# The User-Agent doesn't need to change between requests, but the placeholder below
# must be filled with a real, plain-ASCII UA string, or the request fails with a
# UnicodeEncodeError when the headers are encoded.
headers = {
    'User-Agent': 'REPLACE WITH YOUR OWN UA STRING',
    'Accept-Encoding': 'gzip, deflate',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Connection': 'keep-alive',
    'Host': 'm.51job.com',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Referer': 'https://m.51job.com/search/joblist.php?keyword=python&keywordtype=2&funtype=0000&indtype=00&jobarea=000000&jobterm=99&cotype=99&issuedate=9&saltype=99&degree=99&landmark=0&workyear=99&cosize=99&radius=-1&lonlat=0%2C0',
'Cookie': 'guid=f20363b27d082fd30e051677795994d9; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60040000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60240000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; m_search=keyword%3Dpython%26%7C%26areacode%3D040000; partner=51jobhtml5',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1'
}
def get_page(code, keyword, pagenumber):
    # Walk the paginated search results and scrape every job-detail URL on each page
    for p in range(int(pagenumber)):
        url = ('https://m.51job.com/search/joblist.php?jobarea=' + str(code)
               + '&keyword=' + str(keyword)
               + '&keywordtype=2&lonlat=0%2C0&radius=-1&pageno=' + str(p + 1))
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.content.decode('utf-8'), 'html5lib')
        # Data cleaning: pick the detail-page URLs out of the list page
        find_p = re.findall('href="(.*?)"><b class="jobid"', str(soup))
        a = 1
        print(find_p)
        for i in find_p:
            try:
                info_dict = get_info(i)
                print('Read', a, 'postings so far')
                a = a + 1
                # if info_dict:
                #     insert_db(info_dict)
            except Exception as e:
                print(e)
        time.sleep(5)  # be polite between list pages
    return r

def insert_db(info_dict):
    import time
    from sqlalchemy import Column, Integer, String
    from sqlalchemy import create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import sessionmaker
    # charset=utf8 on the connection URL is the setting that lets Chinese text
    # into MySQL without encoding errors
    engine = create_engine('mysql+pymysql://root:root@localhost:3306/51job?charset=utf8', echo=True)
    Base = declarative_base()

    class table_info(Base):
        __tablename__ = 'job_info'
        id = Column(Integer(), primary_key=True)
        job_id = Column(String(100), comment='job ID')
        company_name = Column(String(100), comment='company name')
        company_type = Column(String(100), comment='company type')
        company_scale = Column(String(100), comment='company scale')
        company_trade = Column(String(100), comment='company line of business')
        company_welfare = Column(String(1000), comment='company benefits')
        job_name = Column(String(3000), comment='job title')
        job_pay = Column(String(100), comment='salary')
        job_years = Column(String(100), comment='years of experience required')
        job_education = Column(String(100), comment='education required')
        job_member = Column(String(100), comment='number of openings')
        job_location = Column(String(3000), comment='work address')
        job_describe = Column(String(3000), comment='job description')
        job_date = Column(String(100), comment='date posted')
        recruit_sources = Column(String(100), comment='recruiting source')
        log_date = Column(String(100), comment='date recorded')

    Base.metadata.create_all(engine)  # create the table if it doesn't exist yet
    DBSession = sessionmaker(bind=engine)
    SQLsession = DBSession()
    new_data = table_info(
        job_id=info_dict.get('job_id', ''),
        company_name=info_dict.get('company_name'),
        company_type=info_dict.get('company_type', ''),
        company_trade=info_dict.get('company_trade', ''),
        company_scale=info_dict.get('company_scale', ''),
        job_name=info_dict.get('job_name', ''),
        job_pay=info_dict.get('job_pay', ''),
        job_years=info_dict.get('job_years', ''),
        job_education=info_dict.get('job_education', ''),
        job_member=info_dict.get('job_member', ''),
        job_location=info_dict.get('job_location', ''),
        job_describe=info_dict.get('job_describe', ''),
        recruit_sources=info_dict.get('recruit_sources', ''),
        job_date=info_dict.get('job_date', ''),
        company_welfare=info_dict.get('company_welfare'),
        log_date=time.strftime('%Y-%m-%d', time.localtime(time.time()))
    )
    SQLsession.add(new_data)
    SQLsession.commit()
    SQLsession.close()

def get_info(url):
    print(url)
    # The detail pages are served from a different host (msearch.51job.com),
    # so the Host and Cookie headers differ from the list-page request
    headers = {
        'User-Agent': 'REPLACE WITH YOUR OWN UA STRING',  # same note as above
        'Accept-Encoding': 'gzip, deflate',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Connection': 'keep-alive',
        'Host': 'msearch.51job.com',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Referer': 'https://m.51job.com/search/joblist.php?keyword=python&keywordtype=2&funtype=0000&indtype=00&jobarea=000000&jobterm=99&cotype=99&issuedate=9&saltype=99&degree=99&landmark=0&workyear=99&cosize=99&radius=-1&lonlat=0%2C0',
'Cookie': 'guid=f20363b27d082fd30e051677795994d9; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60040000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60040000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60240000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; m_search=keyword%3Dpython%26%7C%26areacode%3D040000; partner=51jobhtml5',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1'
    }
    temp_dict = {}  # initialized up front so the final return works even on errors
    try:
        if "m.51job.com" in url:
            r = requests.get(url, headers=headers)
            print(r)
            time.sleep(1.5)
            soup = BeautifulSoup(r.content.decode('utf-8'), 'html5lib')
            # Data cleaning: each field gets its own try because the page
            # layouts aren't uniform and any find() may come up empty
            temp_dict['job_id'] = 'python'
            try:
                temp_dict['job_name'] = soup.find('div', class_='jt').find('p').getText()
            except:
                temp_dict['job_name'] = 'kong'
            try:
                temp_dict['job_date'] = soup.find('div', class_='jt').find('span').getText()
            except:
                temp_dict['job_date'] = 'kong'
            try:
                temp_dict['job_pay'] = soup.find('p', class_='jp').getText()
            except:
                temp_dict['job_pay'] = 'kong'
            try:
                temp_dict['company_name'] = soup.find('p', class_='c_444').getText()
            except:
                temp_dict['company_name'] = 'kong'
            try:
                qy_info = soup.find('div', class_='at').getText()
            except:
                qy_info = 'kong0|kong1|kong2'
            # The "at" block reads "type | scale | trade": split once and
            # index each piece, guarding against short lists
            qy_parts = qy_info.split('|')
            temp_dict['company_type'] = qy_parts[0] if len(qy_parts) > 0 else 'kong'
            temp_dict['company_scale'] = qy_parts[1] if len(qy_parts) > 1 else 'kong'
            temp_dict['company_trade'] = qy_parts[2] if len(qy_parts) > 2 else 'kong'
            try:
                temp_dict['job_location'] = soup.find('a', class_='arr a2').find('span').getText()
            except:
                temp_dict['job_location'] = 'kong'
            try:
                temp_dict['job_member'] = soup.find('div', class_='jd').find('span').getText()
            except:
                temp_dict['job_member'] = 'kong'
            try:
                temp_dict['job_education'] = soup.find('span', class_='s_x').getText()
            except:
                temp_dict['job_education'] = 'kong'
            try:
                temp_dict['job_years'] = soup.find('span', class_='s_n').getText()
            except:
                temp_dict['job_years'] = 'kong'
            try:
                # strip tab characters out of the description text
                temp_dict['job_describe'] = soup.find('article').getText().replace('\t', '')
            except:
                temp_dict['job_describe'] = 'kong'
            try:
                temp_dict['company_welfare'] = soup.find('div', class_='welfare').find('span').getText()
            except:
                temp_dict['company_welfare'] = 'kong'
            insert_db(temp_dict)
            # with open('temp.text', 'w+') as f:
            #     f.write(str(temp_dict))
        else:
            print('done reading')
    except:
        print('error, moving on')
    return temp_dict

print(get_page('040000', 'python', '2'))  # jobarea code, keyword, number of pages
```

killua2011 posted on 2020-6-26 17:51:
How do I fix this error? (traceback below)

My reply: I haven't run into that one myself. When I hit an error, I usually start by translating the message; if that doesn't give me any ideas, I search for it in a browser — someone has almost always hit it before, especially on CSDN.

The full traceback from the question:

```
Traceback (most recent call last):
File "D:/PycharmProjects/untitled/网络爬取/51job.py", line 213, in <module>
print(get_page('040000','python','2'))
File "D:/PycharmProjects/untitled/网络爬取/51job.py", line 30, in get_page
r=requests.get(url,headers=headers)
File "D:\Python\Python37\lib\site-packages\requests\api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "D:\Python\Python37\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "D:\Python\Python37\lib\site-packages\requests\sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "D:\Python\Python37\lib\site-packages\requests\sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "D:\Python\Python37\lib\site-packages\requests\adapters.py", line 449, in send
timeout=timeout
File "D:\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 672, in urlopen
chunked=chunked,
File "D:\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
conn.request(method, url, **httplib_request_kw)
File "D:\Python\Python37\lib\http\client.py", line 1252, in request
self._send_request(method, url, body, headers, encode_chunked)
File "D:\Python\Python37\lib\http\client.py", line 1293, in _send_request
self.putheader(hdr, value)
File "D:\Python\Python37\lib\http\client.py", line 1225, in putheader
values = one_value.encode('latin-1')
UnicodeEncodeError: 'latin-1' codec can't encode characters in position 0-10: ordinal not in range(256)
```
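(A note on the likely cause — my inference from the traceback, not something confirmed in the thread: Python's http.client encodes every header value as latin-1, so any Chinese text left in a header fails with exactly this error, and "position 0-10" matches an eleven-character string such as the original User-Agent placeholder. A minimal sketch of the failure and the fix:)

```
import requests

# Non-ASCII text in any header value cannot be latin-1 encoded
headers = {'User-Agent': '这个需要替换成你自己的'}
try:
    requests.get('https://m.51job.com', headers=headers)
except UnicodeEncodeError as e:
    print(e)  # 'latin-1' codec can't encode characters in position 0-10 ...

# The fix: fill in a real, plain-ASCII User-Agent string instead
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
```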
Another reply: Those .ts files you found are video segments. You just need to capture an m3u8 playlist URL, then feed it to any m3u8 downloader and the download takes care of itself.

Moderator note: I've edited the formatting of the code section of your post; please check whether anything looks off.
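(To make that suggestion concrete — and it also answers question 1 at the top, since the mysterious number in each .ts URL comes straight from the playlist rather than needing to be reconstructed — here is a minimal sketch, assuming an unencrypted stream and a placeholder playlist URL:)

```
import requests
from urllib.parse import urljoin

# Placeholder: substitute the real playlist address captured from DevTools (F12)
m3u8_url = 'https://example.com/video/index.m3u8'

playlist = requests.get(m3u8_url).text
# Lines that aren't comments (#...) are the segment URLs, often relative paths
segments = [line for line in playlist.splitlines()
            if line and not line.startswith('#')]

# Plain MPEG-TS segments can simply be concatenated into one playable file
with open('video.ts', 'wb') as out:
    for seg in segments:
        out.write(requests.get(urljoin(m3u8_url, seg)).content)
```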