本帖最后由 lz270978971 于 2019-9-19 09:32 编辑
继上个版本后,又花了点时间,增加了翻页的小功能,还有优化的空间。
等有时间了,再继续优化一下。
现在来看下源码:[Python] 纯文本查看 复制代码
# -*- coding: utf-8 -*-
# @Time    : 2019/9/18 5:24 PM
# @Author  : python-小智!!
# @FileName: zhilian_1.py
# @Software: IntelliJ IDEA
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from pyquery import PyQuery as pq
class ZhiLian:
    """Scrape job postings from zhaopin.com (Chengdu) using headless Chrome.

    Typical use: iterate ``data_processing()`` for per-job tuples, then
    call ``close()`` to shut the browser down.
    """

    def __init__(self):
        # Run Chrome in headless mode so no browser window is shown.
        self.chrome_options = Options()
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=self.chrome_options)

    def home_url(self, search='python'):
        """
        Submit a job search and return the rendered result page's HTML.

        :param search: keyword to search for (defaults to ``python``);
                       the landing URL is hard-wired to Chengdu.
        :return: outerHTML of the search-result page as a string
        """
        self.driver.get("https://www.zhaopin.com/chengdu")
        # find_element_by_class_name was removed in Selenium 4; use By instead.
        element = self.driver.find_element(By.CLASS_NAME, "zp-search__input")
        element.send_keys(f"{search}")
        element.send_keys(Keys.ENTER)
        # The search opens a new tab; switch the driver to it.
        self.driver.switch_to.window(self.driver.window_handles[1])
        # Wait for the client-side JS to render before grabbing the DOM.
        time.sleep(4)
        return self.driver.find_element(By.XPATH, "//*").get_attribute("outerHTML")

    def get_content(self):
        """
        Page through the search results, yielding each page as a PyQuery doc.

        Stops after yielding the first page that carries the site's
        "no results" marker (``.contentpile__jobcontent__notext``),
        matching the original behavior of yielding that final page too.

        :return: generator of pyquery.PyQuery documents, one per result page
        """
        # Drive the browser to the search results so current_url is the
        # paginated results URL ("<host>?<query>").
        self.home_url()
        host, para = self.driver.current_url.split("?")
        page = 0
        done = False
        while not done:
            page += 1
            page_url = f"{host}?p={page}&{para}"
            print(page_url)
            self.driver.get(page_url)
            time.sleep(3)  # allow JS rendering to finish
            doc = pq(self.driver.find_element(By.XPATH, "//*").get_attribute("outerHTML"))
            # The "notext" block appears when a page has no job listings.
            done = bool(doc.find(".contentpile__jobcontent__notext"))
            yield doc

    def data_processing(self):
        """
        Extract job fields from every result page.

        :return: generator of ``(jobname, companyname, salary, demand)``
                 string tuples; newlines inside *demand* are joined with commas
        """
        for doc in self.get_content():
            contents = doc(".contentpile__content__wrapper")
            for content in contents.items():
                jobname = content(".contentpile__content__wrapper__item__info__box__jobname__title").text()
                companyname = content(".contentpile__content__wrapper__item__info__box__cname").text()
                # NOTE: "saray" mirrors the site's own (misspelled) CSS class.
                saray = content(".contentpile__content__wrapper__item__info__box__job__saray").text()
                demand = content(".contentpile__content__wrapper__item__info__box__job__demand").text()
                yield jobname, companyname, saray, ",".join(demand.split("\n"))

    def close(self):
        """Quit the Chrome driver; call once scraping is finished."""
        self.driver.quit()
def main():
    """Run the scraper and print each extracted job record."""
    spider = ZhiLian()
    for data in spider.data_processing():
        print(data)


# Guard the entry point so importing this module does not launch a browser.
if __name__ == "__main__":
    main()
|