爬取offershow自2020年以来的校招爆料信息
偶然看到offershow的校招岗位信息,发现这类信息对于毕业生来说挺有用的,而且这些内容来自小程序。于是尝试用python进行练手爬取。仅供学习。
```python
# coding=utf-8
import json
from concurrent.futures import ThreadPoolExecutor
import requests
import xlwt
# Module-level accumulator of raw response bodies: get() appends one entry per
# request, and port() later reads the entries to build the spreadsheet.
response = []
def getone(datalistOne):
    """Fetch one offer-detail record and return the response body as text.

    Args:
        datalistOne: URL-encoded form body (``id=...&access_token=...``) that
            is POSTed to the ``offer_detail`` endpoint.

    Returns:
        The response text after backslash escape sequences have been expanded
        with the ``unicode_escape`` codec.

    Raises:
        requests.RequestException: if the request fails or times out.
        UnicodeEncodeError: if the body contains characters outside Latin-1.
    """
    headers = {
        'Host': 'www.ioffershow.com',
        'content-type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) '
                      'Mobile/15E148 MicroMessenger/8.0.7(0x18000730) NetType/4G Language/zh_CN',
        'Referer': 'https://servicewechat.com/wx67fbba6cd94591e4/44/page-frame.html',
    }
    # NOTE(review): verify=False disables TLS certificate verification; kept
    # because the original relies on it, but confirm it is still required.
    # The timeout keeps one stalled connection from blocking a worker forever.
    temp_response = requests.post(
        'https://www.ioffershow.com//webapi/v2/offer_detail',
        headers=headers,
        data=datalistOne,
        verify=False,
        timeout=30,
    )
    # Expand the escape sequences in the body so the stored text is readable.
    # NOTE(review): this also expands escapes such as \n and \", so the result
    # is not guaranteed to remain valid JSON - confirm against real responses.
    return temp_response.text.encode('latin-1').decode('unicode_escape')
def get(start_id=29314, stop_id=65506):
    """Download offer details for a range of ids and dump them to data.txt.

    Every fetched body is appended to the module-level ``response`` list, and
    the whole list is then written to ``data.txt``, one entry followed by a
    newline each.

    Args:
        start_id: First offer id to request (inclusive).
        stop_id: Offer id at which to stop (exclusive).

    Raises:
        Whatever ``getone`` raises for a failed request; the exception
        propagates from the result iteration and aborts the crawl.
    """
    requests.packages.urllib3.disable_warnings()
    # NOTE(review): the access token is hard-coded (and ends with a trailing
    # space); presumably it expires - confirm before re-running.
    payload_template = "id={0}&access_token=%24ytkzhLIvv5%2BsYwytrpIDkg26d4HQpxjr6pCoffershowzju1qaz.1625656632929.2da42dc3bbfbf7800a3b354ebe97c4b3 "
    # Original note: observed ids were min=29314, max=65505 as of 2021-07
    # ("cut 21587" in the original note; its meaning is unclear).
    datalist = [payload_template.format(offer_id) for offer_id in range(start_id, stop_id)]
    # The context manager shuts the pool down even if a request raises.
    with ThreadPoolExecutor(max_workers=20) as executor:
        # executor.map yields results in submission order.
        for count, result in enumerate(executor.map(getone, datalist), start=1):
            print("第%d条" % count)
            response.append(result)
    with open('data.txt', 'w', encoding='utf-8') as file:
        for body in response:
            file.write(body)
            file.write('\n')
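# Record fields used by port(): company, position, salary, city, remark,
# hangye (industry), xueli (education), score (credibility), time (publish
# time), number (view count).
# NOTE(review): the original note paired "credibility" with `number`; the
# mapping above follows what port() actually writes - confirm against the API.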
def port():
    """Export the crawled records in ``response`` to the workbook 校招.xls.

    Row 0 holds the column headers; the entry at index ``i`` of ``response``
    is written to row ``i + 1``. An entry that cannot be parsed or lacks an
    expected key is reported on stdout (its index, then the error) and the
    export continues with the next entry.
    """
    # (header label, key inside the record's 'info' object), in column order.
    columns = (
        ('公司', 'company'),
        ('职位', 'position'),
        ('地点', 'city'),
        ('工资', 'salary'),
        ('可信度', 'score'),
        ('行业', 'hangye'),
        ('学历', 'xueli'),
        ('发布时间', 'time'),
        ('浏览量', 'number'),
        ('备注信息', 'remark'),
    )
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(sheetname="校招")
    for col, (title, _key) in enumerate(columns):
        sheet.col(col).width = 3333
        sheet.write(0, col, label=title)
    for index, raw in enumerate(response):
        try:
            # Parse this entry only. The original passed the whole list to
            # json.loads, which raised TypeError for every row.
            # strict=False tolerates raw control characters inside strings.
            info = json.loads(raw, strict=False)['info']
            for col, (_title, key) in enumerate(columns):
                sheet.write(index + 1, col, label=info[key])
        except Exception as e:
            # Deliberately broad: one bad record must not abort the export.
            print(index)
            print(e)
    workbook.save('校招.xls')
if __name__ == "__main__":
    # Crawl first: port() reads the module-level `response` list that get()
    # fills, so the export has to run second.
    get()
    port()
```
感谢分享 学习了。 能搞到多少数据呢? chaihuibin 发表于 2021-8-4 16:52
能搞到多少数据呢?
大概几万条 已收藏。感谢分享!
页:
[1]