Preface
After finishing a crawler tutorial on Bilibili, I wanted to scrape something relatively useful to consolidate the basics. Posting it here on 吾爱 lets me look it up later, and also shares it so others can learn from it. It isn't written very well, so please bear with me.
Overview
The pages scraped this time are the individual Beijing bus routes. Without further ado, here is the source code:
[Python]
# Author: duokebei
# Created: 2022/11/3 17:16
import random
import time
import traceback

import requests
from lxml import etree

href = 'https://beijing.8684.cn'
adress = ['/list1', '/list2', '/list3', '/list4', '/list5', '/list6', '/list7', '/list8', '/list9', '/listB',
          '/listC', '/listD', '/listF', '/listG', '/listH', '/listK', '/listL', '/listM', '/listP', '/listS',
          '/listT', '/listX', '/listY', '/listZ']


# Build the URL of listing page n
def page_url(n):
    url = href + adress[n]
    return url


# Fetch a page through a randomly chosen proxy
def request(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }
    # Free proxies go stale quickly; replace these with ones that still work
    proxies_pool = [
        {'http': 'http://223.96.90.216:8085'},
        {'http': 'http://121.13.252.58:41564'}
    ]
    proxies = random.choice(proxies_pool)
    res = requests.get(url=url, headers=headers, proxies=proxies, timeout=10)
    return res.text


# Get the link of every route on a listing page
def routeLink(content):
    etree_html = etree.HTML(content)
    link = etree_html.xpath("//div[@class='list clearfix']/a/@href")
    return link


# Get the stop list of a single bus route
def route(content):
    etree_html = etree.HTML(content)
    path = etree_html.xpath('//div[@class="bus-lzlist mb15"]//li/a/@aria-label')
    return path


# Get the name of every route on a listing page
def routeName(content):
    etree_html = etree.HTML(content)
    name = etree_html.xpath("//div[@class='list clearfix']/a/@title")
    return name


# Build the URL of the k-th route's detail page
def page_url_02(link, k):
    url = href + link[k]
    return url


def routeSize(link):
    return len(link)


# Get the category labels (which digit/letter the routes start with)
def parse(content):
    etree_html = etree.HTML(content)
    name = etree_html.xpath("//div[@class='category']/a/text()")
    return name


for j in range(len(adress)):
    try:
        url_01 = page_url(j)                  # build the listing-page URL
        content_01 = request(url_01)          # fetch its HTML once per page
        name = parse(content_01)              # category labels
        route_link = routeLink(content_01)    # links of all routes on this page
        route_name = routeName(content_01)    # names of all routes on this page
        print("Beijing bus routes starting with {}:".format(name[j]))
        for i in range(routeSize(route_link)):
            url_02 = page_url_02(route_link, i)   # build the route detail URL
            content_02 = request(url_02)          # fetch the route page
            journey = route(content_02)           # extract the stop list
            print("{}: {}".format(route_name[i], journey))
            time.sleep(10)                        # be polite between requests
    except Exception:
        print(traceback.format_exc())
Addendum
The final output is shown in the screenshot below; you can save the results to a local file yourself.
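If you do want to persist the results, here is a minimal sketch of one way to do it. It is my own addition, not part of the original script: the save_routes helper and the routes.csv filename are just placeholders, and it assumes you collect (route name, stop list) pairs into a list inside the main loop instead of only printing them.
[Python]
import csv

# Hypothetical helper: write (route name, stop list) pairs to a CSV file
def save_routes(rows, filename='routes.csv'):
    # utf-8-sig so Excel displays the Chinese stop names correctly
    with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['route', 'stops'])
        for name, journey in rows:
            # journey is a list of stop labels; join it into a single cell
            writer.writerow([name, ' -> '.join(journey)])

# Usage: in the main loop, do rows.append((route_name[i], journey))
# instead of (or in addition to) printing, then call save_routes(rows) at the end.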
I know this isn't written very well; I'd appreciate it if you point out my mistakes. Thank you!