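# NOTE: forum code-viewer banner from the original paste ("[Asm] plain-text view / copy code"); it is not part of the program.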
import requests
from lxml import etree
import time
from threading import Thread
import json
import pandas as pd
# HTTP request headers sent with every request (a desktop Chrome User-Agent).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/75.0.3770.100 Safari/537.36'
    ),
}
def get_html(url):
    """Fetch ``url`` with the module-level ``headers`` and return the body text.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The response body decoded as UTF-8. If the request fails with an
        ``EnvironmentError`` (alias of ``OSError``), the exception object is
        returned instead of being raised, so callers must check the type of
        the result before using it as text.
    """
    try:
        # Without a timeout requests waits indefinitely, so one stalled
        # connection would block the calling thread forever. A timeout error
        # is reported through the same "return the exception" path below.
        r = requests.get(url, headers=headers, timeout=10)
        # Force UTF-8 decoding rather than relying on the guessed encoding.
        r.encoding = 'utf8'
        print('开始采集')
        # Fixed pause after each request (presumably to throttle the crawl).
        time.sleep(2)
        return r.text
    except EnvironmentError as e:
        # Kept for backward compatibility: the error is handed back to the
        # caller as a value rather than propagated.
        return e
def get_xpath(html):
    """Parse one page of the job-query JSON response and append it to the CSV.

    Despite the name, no XPath is involved: ``html`` is decoded as JSON and
    every entry under ``Data -> Posts`` becomes one row appended to
    ``腾讯招聘.csv`` (no header row, no index column).

    Args:
        html: JSON text of one result page. Input that cannot be decoded
            (for example an exception object passed in place of text, or a
            non-JSON body) is reported and skipped instead of raising.

    Returns:
        None. The only effect is the rows appended to the CSV file.

    Raises:
        KeyError: If a post lacks one of the expected fields.
    """
    try:
        data = json.loads(html)
    except (TypeError, ValueError) as exc:
        # TypeError: input is not str/bytes; ValueError: malformed JSON
        # (json.JSONDecodeError is a ValueError subclass).
        print('跳过无法解析的响应:', exc)
        return

    # 'Data' or 'Posts' may be missing or null (presumably when the page
    # index is past the last page - TODO confirm); treat that as "no rows".
    posts = None
    if isinstance(data, dict) and isinstance(data.get('Data'), dict):
        posts = data['Data'].get('Posts')
    if not posts:
        return

    # (column label, JSON field) pairs, in output column order.
    fields = (
        ('职位', 'RecruitPostName'),   # job title
        ('地址', 'LocationName'),      # location
        ('分级', 'BGName'),            # presumably the business group
        ('部门', 'ProductName'),       # product name
        ('类型', 'CategoryName'),      # job category
        ('发布时间', 'LastUpdateTime'),  # last update time
    )
    columns = {label: [post[key] for post in posts] for label, key in fields}
    tp = pd.DataFrame(columns)
    # NOTE(review): appends are not synchronized here; if several threads
    # call this at once, rows from different pages land in arbitrary order.
    tp.to_csv('腾讯招聘.csv', encoding='utf8', mode='a', index=False, header=False)
def main(start_url, end_url):
    """Fetch and store every result page with index in [start_url, end_url).

    Each page is downloaded with ``get_html`` and its text is passed to
    ``get_xpath``.

    Args:
        start_url: First page index to request (inclusive).
        end_url: Page index to stop at (exclusive).
    """
    # Query URL with a single ``{}`` placeholder for the pageIndex parameter.
    url_template = (
        'https://careers.tencent.com/tencentcareer/api/post/Query'
        '?timestamp=1566200594583&countryId=&cityId=&bgIds=&productId='
        '&categoryId=40001001,40001002,40001003,40001004,40001005,40001006,'
        '40002001,40002002,40003001,40003002,40003003,40004,'
        '40005001,40005002,40006,40007,40008,40009,40010,40011'
        '&parentCategoryId=&attrId=&keyword='
        '&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    )
    for page_index in range(start_url, end_url):
        page_text = get_html(url=url_template.format(page_index))
        get_xpath(html=page_text)
if __name__ == '__main__':
    # Multithreading: one worker per half-open range of page indexes.
    page_ranges = [(0, 100), (100, 200), (200, 300), (300, 400), (400, 492)]
    workers = [Thread(target=main, args=bounds) for bounds in page_ranges]
    # Start every worker first so they run concurrently...
    for worker in workers:
        worker.start()
    # ...then wait for all of them to finish.
    for worker in workers:
        worker.join()