# 不要用内置名称 list 作为变量名(list 是内置类型而不是关键字),否则会遮蔽内置的 list,可能导致难以排查的问题。
# 导入模块
import requests
import pandas
import time
from lxml import etree
# Module-level page counter. run_liep() assigns its own local ``p``, so the
# code shown here never reads this one.
p = 0

# Shared accumulator: parse_url() appends one dict per job posting to it.
data_list = []

# Search filters substituted into the query string by next_page().
# NOTE(review): the meaning of the individual codes ("3", "30", "200") is
# defined by the remote site and is not visible here - confirm before changing.
dqs = ""
pubtime = "3"
salary = "30"
industries = "200"
position = ''

# Upper bound (exclusive) of the page indices that next_page() generates.
curPage = 1

# HTTP headers sent with every request made by this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/89.0.4389.90 Safari/537.36'
    ),
}
# 获取网页内容
def gethtml(url):
    """Download ``url`` and return its body as text with the site origin removed.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body decoded with ``bytes.decode()`` (UTF-8 by default),
        with every occurrence of ``https://www.liepin.com`` removed so that
        absolute links to the site become site-relative.

    Raises:
        requests.RequestException: If the request fails or times out.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # requests waits indefinitely when no timeout is given; bound the wait so
    # one unresponsive server cannot hang the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    r_response = response.content.decode().replace("https://www.liepin.com", "")
    return r_response
# 获取网页数据
def _first_text(node, path):
    """Return the stripped first XPath match of ``path`` on ``node``, or ''."""
    # xpath() returns a list; a listing that lacks the node yields [] and
    # indexing it directly would raise IndexError and abort the whole crawl.
    matches = node.xpath(path)
    return matches[0].strip() if matches else ''


def parse_url(r):
    """Extract the job postings from one search-result page.

    For every ``<li>`` under ``<ul class="sojob-list">`` a dict is built from
    the listing itself, the posting's detail page is downloaded, and the
    non-blank text nodes matched on that page are stored as a list. Each dict
    is appended to the module-level ``data_list``.

    Args:
        r: HTML text of a search-result page, with links already made
            site-relative (the site origin is prepended again here).

    Returns:
        The module-level ``data_list`` itself, i.e. the postings accumulated
        over all calls so far, not only those of this page.

    Raises:
        requests.RequestException: If a detail page cannot be fetched in time.
    """
    html = etree.HTML(r)
    for item in html.xpath('//ul[@class="sojob-list"]/li'):
        # Keys are the output column names: position, hiring company,
        # work location, monthly salary, publish time.
        record = {
            '职位': _first_text(item, './div/div[1]/h3/a/text()'),
            '招聘企业': _first_text(item, './div/div[2]/p[1]/a/text()'),
            '工作地点': _first_text(item, './/*[@class="area"]/text()'),
            '月薪': _first_text(item, './div/div[1]/p[1]/span[1]/text()'),
            '发布时间': _first_text(item, './div/div[1]/p[2]/time/text()'),
        }

        duties = []
        href = _first_text(item, "./div/div/h3/a/@href")
        if href:  # without a link there is no detail page to fetch
            detail = requests.get(
                "https://www.liepin.com" + href, headers=headers, timeout=10
            )
            detail_tree = etree.HTML(detail.content.decode())
            duties = [
                text.strip()
                for text in detail_tree.xpath('//div[3]/div/text()')
                if text.strip() != ''
            ]
        # Job duties: the non-blank text fragments taken from the detail page.
        record['工作职责'] = duties

        data_list.append(record)

    print(data_list)
    return data_list
# 翻页
def next_page():
    """Build the search-result URLs to crawl.

    The module-level ``dqs``, ``pubtime``, ``salary``, ``industries`` and
    ``position`` values are substituted into the query string, and one URL is
    produced for each page index in ``range(0, curPage)``.

    Returns:
        A list of URL strings, empty when ``curPage`` is 0 or negative.
    """
    # Two corruptions of the query string are repaired here:
    #   * "dqs{}=" had the placeholder before "=", so a non-empty dqs changed
    #     the parameter name instead of supplying its value;
    #   * "15°radeFlag=0" was "&degradeFlag=0" whose "&deg" had been decoded
    #     as an HTML entity, fusing two parameters into one.
    url_np = (
        'https://www.liepin.com/zhaopin/?compkind=&dqs={}&pubTime={}'
        '&pageSize=40&salary={}%24&compTag=&sortFlag=15&degradeFlag=0'
        '&compIds=&subIndustry=&jobKind=&industries={}&compscale='
        '&key={}&curPage={}'
    )
    url_list = [
        url_np.format(dqs, pubtime, salary, industries, position, page_index)
        for page_index in range(0, curPage)
    ]
    return url_list
# 主程序
def run_liep():
    """Crawl every search-result page and export the postings to Excel.

    Each URL from ``next_page()`` is downloaded with ``gethtml()`` and parsed
    with ``parse_url()``. The collected rows are turned into a DataFrame and
    written to ``./liepin.xlsx`` without the index column.

    Returns:
        The ``pandas.DataFrame`` that was written; empty when there were no
        pages to crawl.
    """
    # Start from an empty result so that zero pages gives an empty table
    # instead of a NameError on an unbound variable.
    rows = []
    for page_no, page_url in enumerate(next_page(), start=1):
        # Pause before every page, not just once up front, so that
        # multi-page crawls are throttled as well.
        time.sleep(1)
        print('正在获取第{}页数据'.format(page_no))
        # parse_url() returns the shared accumulator, so the value from the
        # last page already contains the rows of all earlier pages.
        rows = parse_url(gethtml(page_url))

    gp = pandas.DataFrame(rows)
    gp.to_excel('./liepin.xlsx', index=False)
    return gp
# Script entry point: run the crawl and print the resulting table.
# (Indentation restored and the stray trailing "|" from the paste removed.)
if __name__ == '__main__':
    print(run_liep())