Python 使用selenium爬取拉勾网Python职位信息（爬虫）

baihuhu 发表于 2019-11-11 21:09

本帖最后由 baihuhu 于 2019-11-11 21:15 编辑

整体思路：
1 使用我们最近讲的 selenium 模块模拟浏览器进行爬取
2 网页解析使用 xpath（底层为 C 语言，效率高）
3 保存为 csv 数据

需要的模块：

import random

import time

import csv

from urllib.parse import quote

from lxml import etree

from selenium import webdriver

其中 selenium 和 lxml 需要使用 pip install 命令进行安装
class LaGoSpider(object):
    """Selenium-driven crawler for Python job postings on lagou.com.

    Listing pages are walked city by city. Every posting found on a listing
    page is opened in a second browser tab, parsed with lxml XPath, appended
    to ``self.data_list`` and written to ``lagou_quanguo.csv``.
    """

    def __init__(self):
        """Start a headless Chrome session and create the empty result list."""
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        # Presumably disables image loading to speed up page loads.
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        # NOTE(review): machine-specific chromedriver location; adjust per host.
        self.driver = webdriver.Chrome(r'D:\外安装软件\selenium1\chromedriver_win32\chromedriver.exe', options=options)
        self.data_list = []

    def address_url(self):
        """Crawl every result page of every configured city.

        The search URL is built by URL-quoting the city name into
        ``self.baseurl``. Each result page is handed to
        :meth:`position_url_parse`, then the "next page" control is clicked
        until it is missing or disabled.
        """
        self.citys = ['全国', '北京', '深圳', '广州', '杭州', '成都', '南京', '上海', '厦门', '西安', '长沙']
        self.baseurl = 'https://www.lagou.com/jobs/list_python?px=default&city={}'
        for self.city in self.citys:
            self.url = self.baseurl.format(quote(self.city))
            self.driver.get(self.url)
            print('正在爬取<%s>' % self.city)
            while True:
                self.position_url_parse(self.driver.page_source)
                # find_elements (plural) returns [] instead of raising when the
                # pager is absent, e.g. for a city with a single result page.
                pagers = self.driver.find_elements_by_xpath('//span[contains(@class, "pager_next")]')
                # NOTE(review): assumes the site marks the last page by adding
                # "pager_next_disabled" to the button's class - confirm.
                if not pagers or 'pager_next_disabled' in (pagers[0].get_attribute('class') or ''):
                    print('<%s爬取完毕>' % self.city)
                    break
                # Click through JavaScript; the element is passed as arguments[0].
                self.driver.execute_script("arguments[0].click()", pagers[0])
                print('----------------爬取下一页--------------')
                time.sleep(random.randint(3, 5))

    def position_url_parse(self, source):
        """Extract each posting's detail URL from a listing page and visit it.

        :param source: HTML text of one listing page.
        """
        html = etree.HTML(source)
        for item in html.xpath('//ul[@class="item_con_list"]//li'):
            links = item.xpath('.//a[@class="position_link"]//@href')
            if not links:
                # An <li> without a posting link has nothing to visit.
                continue
            self.request_urls(links[0])
            time.sleep(random.randint(1, 3))

    def request_urls(self, list_url):
        """Open one posting in a new tab, parse it, then return to the listing.

        :param list_url: URL of the posting. A list/tuple of URLs is also
            accepted (only the first entry is used); an empty one is ignored.
        """
        if isinstance(list_url, (list, tuple)):
            if not list_url:
                return
            list_url = list_url[0]
        listing_handle = self.driver.current_window_handle
        # Pass the URL as a script argument so quotes in it cannot break the JS.
        self.driver.execute_script('window.open(arguments[0])', list_url)
        # The tab opened last is expected to be the final handle in the list.
        self.driver.switch_to.window(self.driver.window_handles[-1])
        try:
            self.parse_position(self.driver.page_source)
            time.sleep(random.randint(1, 3))
        finally:
            # Close the detail tab and go back even if parsing failed, so the
            # crawl does not continue inside the wrong window.
            self.driver.close()
            self.driver.switch_to.window(listing_handle)
        time.sleep(random.randint(1, 3))

    @staticmethod
    def _first_text(values):
        """Return the first XPath string result, stripped, or '' when empty."""
        return values[0].strip() if values else ''

    @staticmethod
    def _job_request_field(html, position):
        """Return the text of the ``position``-th span of the job_request row.

        Slashes used as visual separators on the page are removed.
        """
        parts = html.xpath('//dd[@class="job_request"]/p/span[%d]/text()' % position)
        return ''.join(parts).replace('/', '').strip()

    def parse_position(self, source):
        """Scrape one posting's detail page and store it as a CSV row.

        :param source: HTML text of the posting's detail page.
        """
        self.data = {}
        html = etree.HTML(source)

        company = self._first_text(html.xpath('//dl[@class="job_company"]/dt/a/img/@alt'))
        print(company)
        self.data['公司'] = company
        self.data['名称'] = self._first_text(
            html.xpath('//div[@class="position-content-l"]//span[@class="name"]/text()'))
        self.data['薪资'] = self._first_text(
            html.xpath('//dd[@class="job_request"]/p/span[@class="salary"]/text()'))

        # NOTE(review): assumes the job_request spans are ordered
        # salary / city / experience / education - verify against the page.
        self.data['城市'] = self._job_request_field(html, 2)
        self.data['经验'] = self._job_request_field(html, 3)
        self.data['学历'] = self._job_request_field(html, 4)

        self.data['职位诱惑'] = self._first_text(html.xpath('//*[@id="job_detail"]/dd/p/text()'))

        description = ''.join(html.xpath('//div[@class="job-detail"]//p//text()'))
        # Drop section headings and dash bullets from the description text.
        for noise in ('岗位职责: ', '岗位要求：', '岗位职责：', '工作职责：', '项目背景：', '-'):
            description = description.replace(noise, '')
        self.data['职位描述'] = description.strip()

        self.data_list.append(self.data)
        self.csv_()

    def csv_(self):
        """Rewrite ``lagou_quanguo.csv`` with a header and all collected rows.

        The file is opened in ``'w'`` mode, so every call replaces its previous
        content with the current ``self.data_list``.
        """
        header = ['公司', '名称', '薪资', '城市', '经验', '学历', '职位诱惑', '职位描述']
        with open('lagou_quanguo.csv', 'w', encoding='utf-8', newline='') as fb:
            writer = csv.DictWriter(fb, header)
            writer.writeheader()
            writer.writerows(self.data_list)

if __name__ == '__main__':
    # Script entry point: build the spider and crawl every configured city.
    LG = LaGoSpider()
    try:
        LG.address_url()
    finally:
        # Always shut the browser down so no headless Chrome process is left
        # running after the crawl finishes or fails.
        LG.driver.quit()

6e25h 发表于 2019-11-11 22:26

dlyanxiaomei 发表于 2019-11-11 22:11
说实话，蛮想学的，有教程么

dalipan.com搜上海黑马python37期，学完网络编程前面的直接跳到后面爬虫的就可以了。

baihuhu 发表于 2019-11-12 09:54

6e25h 发表于 2019-11-11 22:25
除了selenium还有其他办法能够解决动态加载之类的吗。。。
感觉selenium好没魔法
（爬虫新人）

那你得把js学好

yuan1028 发表于 2019-11-11 21:38

感谢分享，有用

去年夏天我还在 发表于 2019-11-11 21:38

牢底坐穿警告{:17_1068:}

dlyanxiaomei 发表于 2019-11-11 22:11

说实话，蛮想学的，有教程么

6e25h 发表于 2019-11-11 22:25

除了selenium还有其他办法能够解决动态加载之类的吗。。。
感觉selenium好没魔法
（爬虫新人）

黑猫的猫 发表于 2019-11-11 22:30

python有点看不懂。。。

iwxiao 发表于 2019-11-11 22:38

爬虫学得好，牢饭吃得早哈哈哈哈

MerlintheWizard 发表于 2019-11-12 10:31

python抓取数据是真方便啊

页: [1] 2

吾爱破解 - 52pojie.cn's Archiver

Python 使用selenium爬取拉勾网Python职位信息（爬虫）