Python: Using selenium to scrape Python job listings from Lagou (web scraping)
Overall approach:
1. Use the selenium module we covered recently to drive a browser and scrape the pages.
2. Parse the pages with XPath through lxml (backed by C libraries, so parsing is fast).
3. Save the data as CSV.
Required modules:

import random
import time
import csv
from urllib.parse import quote
from lxml import etree
from selenium import webdriver

Of these, selenium and lxml are third-party packages and need to be installed with pip.
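The install command is:

pip install selenium lxml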
class LaGoSpider(object):
    '''
    Wrapped in a class to keep things organized.
    '''
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # run Chrome without opening a window
        # skip loading images to speed up page loads
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        self.driver = webdriver.Chrome(r'D:\外安装软件\selenium1\chromedriver_win32\chromedriver.exe', options=options)
        self.data_list = []
    def address_url(self):
        '''
        Build the target URL for each city and walk through its result pages.
        '''
        self.citys = ['全国', '北京', '深圳', '广州', '杭州', '成都', '南京', '上海', '厦门', '西安', '长沙']
        self.baseurl = 'https://www.lagou.com/jobs/list_python?px=default&city={}'
        for self.city in self.citys:
            self.url = self.baseurl.format(quote(self.city))
            self.driver.get(self.url)
            print('Scraping <%s>' % self.city)
            while True:
                source = self.driver.page_source
                self.position_url_parse(source)
                next_page = self.driver.find_element_by_xpath('//span[contains(@class, "pager_next")]')
                # the next-page button gets the "pager_next_disabled" class on the last page
                if 'pager_next_disabled' in next_page.get_attribute('class'):
                    print('<%s> done' % self.city)
                    break
                else:
                    self.driver.execute_script("arguments[0].click()", next_page)
                    print('---------------- next page ----------------')
                    time.sleep(random.randint(3, 5))
    def position_url_parse(self, source):
        '''
        Pull the URL of every job posting out of the list page.
        '''
        html = etree.HTML(source)
        lis = html.xpath('//ul[@class="item_con_list"]//li')
        for li in lis:
            position_url = li.xpath('.//a[@class="position_link"]//@href')
            if position_url:  # xpath returns a list; take the first href
                self.request_urls(position_url[0])
                time.sleep(random.randint(1, 3))
    def request_urls(self, list_url):
        # open the detail page in a new tab and switch to it
        self.driver.execute_script('window.open("%s")' % list_url)
        self.driver.switch_to.window(self.driver.window_handles[-1])
        source = self.driver.page_source
        self.parse_position(source)
        time.sleep(random.randint(1, 3))
        self.driver.close()
        # back to the list-page tab
        self.driver.switch_to.window(self.driver.window_handles[0])
        time.sleep(random.randint(1, 3))
    def parse_position(self, source):
        '''
        Extract the details of a single job posting.
        '''
        self.data = {}
        html = etree.HTML(source)
        company = html.xpath('//dl[@class="job_company"]/dt/a/img/@alt')
        print(company)
        self.data['公司'] = company[0] if company else ''
        name = html.xpath('//div[@class="position-content-l"]//span[@class="name"]/text()')
        self.data['名称'] = name[0] if name else ''
        salary = html.xpath('//dd[@class="job_request"]/p/span[@class="salary"]/text()')
        self.data['薪资'] = salary[0].strip() if salary else ''
        # the job_request spans are, in order: salary / city / experience / education / job type
        # (order assumed from the page layout), so index into the list rather than
        # joining everything into one identical string for all three fields
        spans = html.xpath('//dd[@class="job_request"]/p/span/text()')
        self.data['城市'] = spans[1].replace('/', '').strip() if len(spans) > 1 else ''
        self.data['经验'] = spans[2].replace('/', '').strip() if len(spans) > 2 else ''
        self.data['学历'] = spans[3].replace('/', '').strip() if len(spans) > 3 else ''
        zhihuo = html.xpath('//*[@id="job_detail"]/dd/p/text()')
        self.data['职位诱惑'] = ''.join(zhihuo).strip()
        zhimiao = ''.join(html.xpath('//div[@class="job-detail"]//p//text()')).replace('岗位职责: ', '').replace('岗位职责:', '').replace('岗位要求:', '').replace('工作职责:', '').replace('项目背景:', '').replace('-', '').strip()
        self.data['职位描述'] = zhimiao
        self.data_list.append(self.data)
        self.csv_()
    def csv_(self):
        '''
        Save the collected data as CSV.
        '''
        header = ['公司', '名称', '薪资', '城市', '经验', '学历', '职位诱惑', '职位描述']
        # mode 'w' rewrites the whole file on each call; data_list keeps growing, so nothing is lost
        with open('lagou_quanguo.csv', 'w', encoding='utf-8', newline='') as fb:
            writer = csv.DictWriter(fb, header)
            writer.writeheader()
            writer.writerows(self.data_list)
if __name__ == '__main__':
    LG = LaGoSpider()
    LG.address_url()
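A side note on the fixed time.sleep pauses in the code: selenium also offers explicit waits, which block only until the content you need has actually appeared. A minimal sketch, reusing the list-page XPath from above (the 10-second timeout is an arbitrary choice):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.lagou.com/jobs/list_python?px=default&city=%E5%85%A8%E5%9B%BD')
# block until at least one job <li> is present, then it is safe to read page_source
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, '//ul[@class="item_con_list"]//li'))
)
source = driver.page_source
driver.quit()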
dlyanxiaomei replied on 2019-11-11 22:11:
Honestly, I'd really like to learn this. Is there a tutorial?

Reply: Search dalipan.com for 上海黑马python37期; once you finish the network-programming part you can jump straight to the scraping chapters.

6e25h replied on 2019-11-11 22:25:
Besides selenium, is there any other way to handle dynamically loaded content? selenium feels so unmagical. (New to scraping.)

Reply: Then you need to learn JavaScript properly.

Other replies: Thanks for sharing, useful. / "Learn scraping well, eat prison food early" jail warnings, hahaha. / Python is a bit hard for me to read. / Grabbing data with Python really is convenient.
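On the question of alternatives to selenium: list pages like this usually fetch their job data through a background JSON request, which you can find in the browser's developer tools (Network tab, XHR) and then call directly with requests. A rough sketch; the endpoint, form fields, and cookie handling below are assumptions based on how the site behaved at the time and may well have changed:

import requests

list_url = 'https://www.lagou.com/jobs/list_python?px=default&city=%E5%85%A8%E5%9B%BD'
# assumed XHR endpoint, as seen in the browser's Network tab
ajax_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
headers = {'User-Agent': 'Mozilla/5.0', 'Referer': list_url}

session = requests.Session()
session.get(list_url, headers=headers)  # visit the list page first to pick up the cookies the site checks
resp = session.post(ajax_url, headers=headers,
                    data={'first': 'true', 'pn': 1, 'kd': 'python'})  # pn = page number, kd = keyword
print(resp.json())  # the job list comes back as JSON instead of rendered HTML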