本帖最后由 double07 于 2021-3-17 12:34 编辑
[Python] 纯文本查看 复制代码
#导入模块
import requests
import pandas
from lxml import etree
# Module-level accumulator: getpath() appends one dict per job posting to it.
data_list: list = []
#获取网页内容
def gethtml(url, cookie=None, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        cookie: Optional raw ``Cookie`` header value (for example copied from
            a logged-in browser session). Without it the site may serve
            different content than the browser shows.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded with ``bytes.decode()`` (UTF-8).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'}
    if cookie:
        headers['Cookie'] = cookie
    # Without a timeout requests can block indefinitely on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of silently parsing an error page.
    response.raise_for_status()
    return response.content.decode()
#获取网页数据
def getpath(r):
    """Extract job postings from a search-result page.

    Each ``<li>`` under ``ul.sojob-list`` becomes one dict, which is appended
    to the module-level ``data_list``.

    Args:
        r: HTML source of the result page.

    Returns:
        The module-level ``data_list`` (the same list object on every call,
        so it also holds rows appended by earlier calls).
    """
    # Output column -> XPath relative to one posting's <li>; insertion order
    # determines the column order of the exported table.
    fields = {
        '职位': "./div/div/h3/a/text()",
        '月薪': './div/div[1]/p[1]/span[1]/text()',
        '发布时间': './div/div[1]/p[2]/time/text()',
        '招聘企业': './div/div[2]/p[1]/a/text()',
        '工作地点': './/*[@class="area"]/text()',
        '链接': "./div/div/h3/a/@href",
    }
    html = etree.HTML(r)
    # NOTE(review): etree.HTML appears to return None for input with nothing
    # parsable (version-dependent) - guard so .xpath is not called on None.
    if html is None:
        return data_list
    for item in html.xpath('//div//ul[@class="sojob-list"]/li'):
        row = {}
        for column, path in fields.items():
            matches = item.xpath(path)
            # A posting that lacks one node must not abort the whole scrape.
            row[column] = matches[0] if matches else None
        data_list.append(row)
    return data_list
#翻页
def next_page(pages=1, key='运营分析总监'):
    """Build the list of search-result URLs to scrape.

    Args:
        pages: Number of result pages to generate; ``curPage`` runs from 0
            to ``pages - 1``.
        key: Search keyword placed in the ``key`` query parameter.

    Returns:
        A list of URL strings, one per page (empty when *pages* is 0).
    """
    url_np = 'https://www.liepin.com/zhaopin/?&key={}&curPage={}'
    return [url_np.format(key, i) for i in range(pages)]
#主程序
def run_liep():
    """Scrape every page from next_page() and export the rows to Excel.

    Writes ``./liepin.xlsx`` (without the index column).

    Returns:
        A pandas DataFrame with one row per scraped posting.

    NOTE: getpath() accumulates into the module-level ``data_list``, so
    calling this function twice in one process repeats the earlier rows.
    """
    # Start with an empty list so an empty URL list yields an empty table
    # rather than an unbound-name error.
    rows = []
    for url in next_page():
        # getpath() returns the cumulative list, so the value left after the
        # last iteration already contains the rows of every page.
        rows = getpath(gethtml(url))
    table = pandas.DataFrame(rows)
    table.to_excel('./liepin.xlsx', index=False)
    return table
if __name__ == "__main__":
    # Run the scrape only when executed as a script, then print the table.
    print(run_liep())
用这段代码爬取到的数据,与在浏览器中打开网页看到的不一样,是哪里出了问题?
已解决:在请求中增加 cookie 即可。