Scraping whole-unit rental listings (excluding branded apartments) for a chosen city on 58同城, via proxy IPs
The proxy IP provider used by this program is 代理精灵.
In the source code, the sensitive appid and appkey values in 代理精灵's add-whitelist and delete-whitelist API URLs are replaced with ??????. You can fill in your own values in the code that creates the ProxyIpServer object.
I am not a computer-science major; I taught myself Python and am basically a hobbyist beginner. I am sharing this to exchange ideas and learn, and suggestions for improvement are welcome.
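For reference, a minimal sketch of what the add-whitelist URL looks like once real credentials are substituted; YOUR_APPID, YOUR_APPKEY, and my_ip are illustrative placeholders of mine, not values from the provider:

    add_api = ("http://www.jinglingdaili.com/Users-whiteIpAddNew.html"
               "?appid=YOUR_APPID&appkey=YOUR_APPKEY&type=dt&whiteip=" + my_ip + "&index=")

where my_ip is your machine's public IP address. The full source code follows.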
"""
爬取58同城指定城市整租房信息(不含:品牌公寓)
__________________________________________
关于使用代{过}{滤}理IP的问题
1、稳定高匿的代{过}{滤}理IP需要收费。本程序使用的代{过}{滤}理IP提供商是代{过}{滤}理精灵。
2、稳定高匿的代{过}{滤}理IP对于爬虫项目的成功实施非常关键
3、使用代{过}{滤}理IP程序会报错:使用代{过}{滤}理IP报错:ValueError: check_hostname requires server_hostname
解决办法:将urllib3降级。pip3 install urllib3==1.25.11
__________________________________________
V1.0版本:
1、统一设置:请求的间隔时间,代{过}{滤}理IP的API调用间隔,请求出错的尝试次数
2、去重问题:定义了代{过}{滤}理IP去重间隔,提高代{过}{滤}理IP利用效率
3、CSV文件:如果爬取的字符中有半角逗号,那么替换成全角逗号
__________________________________________
"""
import time
import random
import urllib3
from lxml import etree
from selenium import webdriver
import requests
# Suppress the warnings emitted because SSL verification is disabled
urllib3.disable_warnings()
# Proxy IPs that have already been used are appended to ip_pool_list
ip_pool_list = list()
# Proxy IPs fetched from the provider's API
proxy_ip_list = list()
# Holds the scraped rental records; the first row is the CSV header
house_rent_info = list()
house_rent_info.append([
    "Province/Autonomous Region/Municipality", "City", "District branch", "District sub-branch",
    "Complex name", "Listing update time", "Listing title", "Lease type",
    "Price value", "Price unit", "Area value", "Area unit", "Floor plan",
    "Renovation", "Detailed address", "Listing URL"
])
# Class wrapping the proxy IP provider's attributes and API actions
class ProxyIpServer:
    def __init__(self, name, add_white_ip_api, del_white_ip_api, proxy_ip_api, time_interval):
        self.name = name
        self.add_white_ip_api = add_white_ip_api
        self.del_white_ip_api = del_white_ip_api
        self.proxy_ip_api = proxy_ip_api
        self.time_interval = time_interval

    def add_white_ip(self):
        # Add the local IP to the provider's whitelist
        r = requests.get(url=self.add_white_ip_api, timeout=5)
        return r.json()

    def del_white_ip(self):
        # Remove whitelist entries from the provider
        r = requests.get(url=self.del_white_ip_api, timeout=5)
        return r.json()

    def get_proxy_ips(self):
        # Honor the provider's minimum call interval, then fetch a batch of proxy IPs
        time.sleep(self.time_interval)
        r = requests.get(url=self.proxy_ip_api, timeout=5)
        return r.json()
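# A minimal usage sketch (assumes real appid/appkey values in the URLs; the
# provider's JSON replies are expected to carry a "msg" field, as used in
# main_job() below):
#     pis = ProxyIpServer("代理精灵", add_api, del_api, ip_api, 5)
#     print(pis.add_white_ip()["msg"])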
# Get the client IP as seen by the outside world.
# Note: returns a tuple (client IP, success flag).
def get_client_ip(proxy_ip=""):
    url = "https://httpbin.org/ip"
    if proxy_ip == "":
        # No proxy: ask httpbin directly for our own IP
        r = requests.get(url=url, verify=False, timeout=5)
        return r.json()["origin"], True
    else:
        try:
            ip = proxy_ip.split(":")[0]
            port = proxy_ip.split(":")[1]
            proxies = {"https": "https://{0:}:{1:}".format(ip, port), "http": "http://{0:}:{1:}".format(ip, port)}
            r = requests.get(url=url, verify=False, proxies=proxies, timeout=5)
            return r.json()["origin"], True
        except Exception:
            # Proxy unreachable or request failed
            return "0.0.0.0", False
# Check whether a proxy IP is usable: reachable, highly anonymous, and not reused too soon.
def check_proxy_ip(proxy_ip, local_ip, used_ips_interval):
    mask_ip = proxy_ip.split(":")[0]
    client_ip, client_ip_success_connect = get_client_ip(proxy_ip)
    if not client_ip_success_connect:
        print("\xa0\xa0Proxy IP {} connect flag: {}".format(proxy_ip, client_ip_success_connect), end=";")
        flag1 = flag2 = False
    else:
        print("\xa0\xa0Proxy IP {} connect flag: {}".format(proxy_ip, client_ip_success_connect), end=";")
        # Check that the proxy is highly anonymous (it must not leak our real IP)
        if client_ip != local_ip:
            flag1 = True
        else:
            flag1 = False
        print("Proxy IP anonymity flag: {}".format(flag1), end=";")
        # Check that enough other IPs have been used since this one last appeared
        global ip_pool_list
        if mask_ip not in ip_pool_list:
            ip_pool_list.append(mask_ip)
            flag2 = True
            print("Proxy IP full-dedup flag: {}".format(flag2), end=";")
        else:
            if ip_pool_list[::-1].index(mask_ip) >= used_ips_interval:
                ip_pool_list.append(mask_ip)
                flag2 = True
            else:
                flag2 = False
            print("Proxy IP relative-dedup flag: {}".format(flag2), end=";")
    print("Proxy IP usable flag: {}".format(flag1 and flag2))
    return flag1 and flag2
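# Worked example of the reuse interval (hypothetical): with
# ip_pool_list == ["a", "b", "c", "d"] and used_ips_interval == 10, seeing "a"
# again gives ip_pool_list[::-1].index("a") == 3, i.e. only 3 IPs have been
# used since "a" last appeared; 3 >= 10 is False, so "a" is rejected for now.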
# Fetch a list of proxy IPs from the provider
def get_proxy_ips_list(proxy_ip_server, local_ip):
    try:
        ls = list()
        dic_list = proxy_ip_server.get_proxy_ips()["data"]
        for dic in dic_list:
            ls.append(dic["IP"] + ":" + str(dic["Port"]))  # field names returned by 代理精灵's API
    except Exception:
        ls = list()
    # Fall back to the local IP if the list is empty or the call failed
    if len(ls) == 0:
        ls.append(local_ip)
    print("\xa0\xa0Proxy IPs fetched this round: [{}]".format(" , ".join(ls)))
    return ls
# Hand out a single proxy IP, refilling the list from the provider when it runs out
def get_proxy_ip(proxy_ip_server, local_ip):
    global proxy_ip_list
    if len(proxy_ip_list) == 0:
        proxy_ip_list = get_proxy_ips_list(proxy_ip_server, local_ip)
    print("\xa0\xa0Unused proxy IPs: [{}]".format(" , ".join(proxy_ip_list)))
    return proxy_ip_list.pop(0)
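# pop(0) hands the batch out first-in first-out, so each proxy IP from the
# provider is consumed exactly once before a fresh batch is requested.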
# Fetch a URL. Returns a tuple (response object, success flag).
# request_times is the number of attempts allowed per URL.
def get_response(url, proxy_ip_server, local_ip, request_times, sleep_seconds, used_ips_interval):
    # Pool of request headers to pick from at random
    ua_list = [
        {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0'},
        {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1'},
        {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0'},
        {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"}
    ]
    # Separator line for console output
    border = "\xa0\xa0" + "*" * 100
    print(border)
    print("\xa0\xa0URL: {}".format(url))
    while True:
        proxy_ip = get_proxy_ip(proxy_ip_server, local_ip)
        # No proxy available (the provider returned nothing), so request with the local IP
        if proxy_ip == local_ip:
            for n in range(1, request_times + 1):
                time.sleep(sleep_seconds)
                try:
                    print("\xa0\xa0Local IP {} attempt {}/{}".format(proxy_ip, n, request_times), end=";")
                    r = requests.get(url=url, headers=random.choice(ua_list), timeout=5)
                    if r.status_code == 200:
                        print("request succeeded!\n" + border)
                        return r, True
                except Exception:
                    print("request failed!")
                if n == request_times:
                    print(border)
                    return "", False
        else:
            # Use this proxy only while it passes the checks; otherwise draw a new one
            while check_proxy_ip(proxy_ip, local_ip, used_ips_interval):
                for n in range(1, request_times + 1):
                    time.sleep(sleep_seconds)
                    try:
                        ip = proxy_ip.split(":")[0]
                        port = proxy_ip.split(":")[1]
                        print("\xa0\xa0Proxy IP {} attempt {}/{}".format(ip, n, request_times), end=";")
                        proxies = {"https": "https://{0:}:{1:}".format(ip, port), "http": "http://{0:}:{1:}".format(ip, port)}
                        r = requests.get(url=url, headers=random.choice(ua_list), proxies=proxies, verify=False, timeout=5)
                        if r.status_code == 200:
                            print("request succeeded!\n" + border)
                            return r, True
                    except Exception:
                        print("request failed!")
                    if n == request_times:
                        print(border)
                        return "", False
# Collect province and city names plus each city's URL from 58同城's city index page.
# The parameter is the index page URL.
def get_cities(url):
    # Load the page with Selenium (a real browser) so the full city list is rendered
    driver = webdriver.Chrome()
    driver.get(url)
    html_source = driver.page_source
    dic_cities = dict()
    letters = etree.HTML(html_source).xpath("//div[@class='content-letter']")
    for letter in letters:
        content_provinces = letter.xpath("div[@class='content-province']")
        for content_province in content_provinces:
            province_name = content_province.xpath("div[@class='content-province-title']/text()")[0]
            dic_cities[province_name] = dict()
            content_cities = content_province.xpath("div[@class='content-cities']")
            for content_city in content_cities:
                city_names = content_city.xpath("a/text()")
                city_urls = content_city.xpath("a/@href")
                for i in range(len(city_urls)):
                    city_urls[i] = "https:" + city_urls[i]
                for city_name, city_url in zip(city_names, city_urls):
                    dic_cities[province_name][city_name] = list()
                    dic_cities[province_name][city_name].append(city_url)
    driver.quit()
    # Add the four municipalities, which the index page does not list under a province
    dic_cities["直辖市"] = {
        "北京": ["https://bj.58.com/"], "天津": ["https://tj.58.com/"],
        "上海": ["https://sh.58.com/"], "重庆": ["https://cq.58.com/"]
    }
    # Drop the non-mainland entries ("其他" = other, "海外" = overseas)
    del dic_cities["其他"]
    del dic_cities["海外"]
    return dic_cities
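# The returned structure looks like this (entries hypothetical except the
# municipalities added above):
#     {"江苏": {"南京": ["https://nj.58.com/"], ...},
#      "直辖市": {"北京": ["https://bj.58.com/"], ...}, ...}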
# Prompt for user input; returns a tuple (province name, city name, city URL).
# The parameter is the dict returned by get_cities().
def get_inputs(dic):
    province = ""
    city = ""
    print("\xa0\xa0Province/autonomous region/municipality reference table:", end="\n\n")
    print("\xa0\xa0", end="")
    for i in range(len(list(dic.keys()))):
        print(i, list(dic.keys())[i], end="、")
        if (i + 1) % 10 == 0:
            print("\n\xa0\xa0", end="")
    flag = True
    while flag:
        print("\n")
        number_province = input("\xa0\xa0Enter a province/autonomous region/municipality number: ")
        if number_province in [str(i) for i in range(len(list(dic.keys())))]:
            province = list(dic.keys())[int(number_province)]
            flag = False
        else:
            print("\xa0\xa0That number is not in the reference table; please try again!")
    print("\n\xa0\xa0Reference table of cities in {}:".format(province), end="\n\n")
    print("\xa0\xa0", end="")
    for i in range(len(list(dic[province].keys()))):
        print(i, list(dic[province].keys())[i], end="、")
        if (i + 1) % 10 == 0:
            print("\n\xa0\xa0", end="")
    flag = True
    while flag:
        print("\n")
        number_city = input("\xa0\xa0Enter the number of a city in {}: ".format(province))
        if number_city in [str(i) for i in range(len(list(dic[province].keys())))]:
            city = list(dic[province].keys())[int(number_city)]
            flag = False
        else:
            print("\xa0\xa0That number is not in the reference table; please try again!")
    return province, city, dic[province][city][0]
# Read the total number of index pages from the pager on a listing page
def get_page_count(r):
    return int(etree.HTML(r.text).xpath("//div[@class='pager']//span/text()")[-2])
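# This assumes the pager's last two <span> texts are the final page number
# followed by a "next page" label, e.g. (hypothetical) [..., "70", "下一页"],
# so index -2 is the page count.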
# Collect the detail-page URLs of the rental listings on one index page.
# The parameter is a Response object.
def get_house_cell_urls_in_one_page(r):
    list_url = list()
    house_cells = etree.HTML(r.text).xpath("//li")
    for house_cell in house_cells:
        # Skip <li> elements that are not listing cells
        house_cell_url = house_cell.xpath("div[@class='des']/h2/a/@href")
        if house_cell_url:
            list_url.append(house_cell_url[0])
    return list_url
# Extract the data from one listing detail page; returns the fields as a list
def get_house_cell_detail(r):
    html = etree.HTML(r.text)
    # Page title, printed for debugging only
    html_title = html.xpath("//title/text()")[0].strip()
    print("\xa0\xa0Detail page title: {} (debug only)".format(html_title))
    # Listing fields
    house_title = html.xpath("//div[@class='house-title']/h1/text()")[0].strip().replace(",", ",")
    print("\xa0\xa0Listing title: {}".format(house_title))
    house_update_time = html.xpath("//div[@class='house-title']/p/text()")[1].strip().replace("\xa0", "").replace(" ", "").replace("\n", "")
    print("\xa0\xa0Listing update time: {}".format(house_update_time))
    house_price = html.xpath("//div/span/b/text()")[0].strip()
    house_price_per = html.xpath("//div/span/text()")[0].strip().replace(" ", "").replace("\n", "")
    print("\xa0\xa0Listing price: {}{}".format(house_price, house_price_per))
    house_rent_mode = html.xpath("//div//li/span/text()")[0].strip()
    print("\xa0\xa0Lease type: {}".format(house_rent_mode))
    house_unit = html.xpath("//div//li/span/text()")[0].split("\xa0\xa0")[0].strip()
    print("\xa0\xa0Floor plan: {}".format(house_unit))
    house_area = html.xpath("//div//li/span/text()")[0].split("\xa0\xa0")[1].strip()[0:-1].strip()
    house_area_per = html.xpath("//div//li/span/text()")[0].split("\xa0\xa0")[1].strip()[-1]
    print("\xa0\xa0Area: {}{}".format(house_area, house_area_per))
    house_decorate = html.xpath("//div//li/span/text()")[0].split("\xa0\xa0")[2].strip()
    print("\xa0\xa0Renovation: {}".format(house_decorate))
    house_location = html.xpath("//div//li/span/a/text()")[0].strip()
    print("\xa0\xa0Complex: {}".format(house_location))
    house_division_branch = html.xpath("//div//li/span/a/text()")[0].strip()
    print("\xa0\xa0District branch: {}".format(house_division_branch))
    try:
        house_division_subbranch = html.xpath("//div//li/span/a/text()")[1].strip()
    except Exception:
        house_division_subbranch = "NULL"
    print("\xa0\xa0District sub-branch: {}".format(house_division_subbranch))
    house_detail_address = html.xpath("//div//li/span/text()")[0].strip().replace(",", ",")
    print("\xa0\xa0Detailed address: {}".format(house_detail_address))
    print("\n")
    # Province name, city name, and listing URL are filled in by the caller
    province_name = ""
    city_name = ""
    house_cell_url = ""
    line = [
        province_name, city_name, house_division_branch, house_division_subbranch, house_location, house_update_time,
        house_title, house_rent_mode, house_price, house_price_per, house_area, house_area_per, house_unit,
        house_decorate, house_detail_address, house_cell_url
    ]
    return line
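# Note: the field order in `line` must stay in step with the header row
# appended to house_rent_info at the top of the script; both become rows of
# the same CSV file.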
# The main scraping routine; fills house_rent_info with the rental records
def main_job(tuple_city):
    # ***************** Initial settings *******************
    # Get the local IP
    local_ip = get_client_ip()[0]
    # Interval between requests, in seconds (adjust the randint bounds to taste;
    # with (0, 0) the interval is always 0)
    sleep_seconds = random.randint(0, 0) / 100
    # Number of attempts allowed per URL when requests fail
    request_times = 20
    # Reuse interval for proxy IPs
    used_ips_interval = 10
    # ******* Create the ProxyIpServer object *******
    pis_name = "代理精灵"
    pis_add_white_ip_api = "http://www.jinglingdaili.com/Users-whiteIpAddNew.html?appid=??????&appkey=??????&type=dt&whiteip=" + local_ip + "&index="
    pis_del_white_ip_api = "http://www.jinglingdaili.com/Users-whiteIpDelNew.html?appid=??????&appkey=??????&type=dt&whiteip=all"
    pis_proxy_ip_api = "http://t.ipjldl.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=0&fa=0&fetch_key=&groupid=0&qty=4&time=100&pro=&city=&port=1&format=json&ss=5&css=&dt=1&specialTxt=3&specialJson=&usertype=2"
    pis_time_interval = 5  # minimum interval, in seconds, between proxy-IP API calls
    pis = ProxyIpServer(pis_name, pis_add_white_ip_api, pis_del_white_ip_api, pis_proxy_ip_api, pis_time_interval)
    # Add the local IP to the provider's whitelist
    msg_add_white_ip = pis.add_white_ip()["msg"]
    print("\xa0\xa0{}: IP whitelist: {}.\n".format(pis_name, msg_add_white_ip))
    # ** Scrape the rental data and append it to house_rent_info **
    # Build the city's rental listing URL
    house_rent_city_url = tuple_city[2] + "zufang/"
    # Get the total number of index pages for this city
    r = get_response(house_rent_city_url, pis, local_ip, request_times, sleep_seconds, used_ips_interval)[0]
    page_count = get_page_count(r)
    # Walk through the index pages
    for page in range(1, page_count + 1):
        # Build the URL of one index page
        house_rent_city_page_url = house_rent_city_url + "pn" + str(page) + "/"
        # Collect the listing URLs on this index page
        urls = list()
        for q in range(1, request_times + 1):
            try:
                print("\xa0\xa0Fetching this index page, attempt {}/{}:".format(q, request_times))
                r = get_response(house_rent_city_page_url, pis, local_ip, request_times, sleep_seconds, used_ips_interval)[0]
                urls = get_house_cell_urls_in_one_page(r)
                if len(urls) > 0:
                    print("\xa0\xa0Listing URLs on this index page fetched successfully!")
                    break
                else:
                    if q < request_times:
                        print("\xa0\xa0Failed to fetch the listing URLs on this index page! Retrying in 15 seconds!")
                        time.sleep(15)
                    else:
                        print("\xa0\xa0Failed to fetch the listing URLs on this index page! Skipping it!")
            except Exception:
                if q < request_times:
                    print("\xa0\xa0Failed to fetch the listing URLs on this index page! Retrying in 15 seconds!")
                    time.sleep(15)
                else:
                    print("\xa0\xa0Failed to fetch the listing URLs on this index page! Skipping it!")
        # Walk through the listing URLs on this index page
        for i in range(len(urls)):
            print("\xa0\xa0Position: province {}, city {}, page [{}/{}], page URL: {}. Record [{}/{}] on this page.".format(tuple_city[0], tuple_city[1], page, page_count, house_rent_city_page_url, i + 1, len(urls)))
            print("\xa0\xa0Listing detail URL: {}".format(urls[i]))
            # Fetch the detail page
            for j in range(1, request_times + 1):
                try:
                    print("\xa0\xa0Fetching this detail page, attempt {}/{}:".format(j, request_times))
                    r = get_response(urls[i], pis, local_ip, request_times, sleep_seconds, used_ips_interval)[0]
                    line = get_house_cell_detail(r)
                    line[0] = tuple_city[0]
                    line[1] = tuple_city[1]
                    line[-1] = urls[i]
                    global house_rent_info
                    house_rent_info.append(line)
                    break
                except Exception:
                    if j < request_times:
                        print("\xa0\xa0Failed to fetch this detail page! Retrying in 15 seconds!")
                        time.sleep(15)
                    else:
                        print("\xa0\xa0Failed to fetch this detail page! Skipping it!")
    print("\xa0\xa0Finished scraping rental listings for {} {} on 58同城.\n".format(tuple_city[0], tuple_city[1]))
    # Remove the local IP from the provider's whitelist
    msg_del_white_ip = pis.del_white_ip()["msg"]
    print("\xa0\xa0{}: IP whitelist: {}.\n".format(pis_name, msg_del_white_ip))
def main():
    # ***************** Choose a city *******************
    # 58同城 city index page URL
    url_sort = "https://www.58.com/changecity.html?fullpath=0"
    print("1. Fetching 58同城 mainland city URLs...\n")
    dic_cities = get_cities(url_sort)
    print("\xa0\xa0Done fetching 58同城 mainland city URLs!\n")
    print("2. Choosing a city...\n")
    tuple_city = get_inputs(dic_cities)
    province_name, city_name, city_url = tuple_city
    print("\n\xa0\xa0You chose [{}] [{}].\n".format(province_name, city_name))
    # ***************** Scrape the rental data *******************
    print("3. Scraping 58同城 {} {} rental listings:\n".format(province_name, city_name))
    main_job(tuple_city)
    # ************ Write the scraped data to a file **************
    print("4. Writing 58同城 {} {} rental listings to a file:\n".format(province_name, city_name))
    with open("58同城{}{}整租房信息.csv".format(province_name, city_name), 'w+', encoding="utf-8") as f:
        for line in house_rent_info:
            f.write(",".join(line) + "\n")
    print("\xa0\xa0Done writing 58同城 {} {} rental listings!\n".format(province_name, city_name))
if __name__ == '__main__':
    main()
Copying this to study it.
Could you release a ready-made build? This newbie can't make sense of it.
The comments are really detailed.
I can't follow the code, but the comments really are thorough.
How do you actually use this?
Bro, could you help me set this up? We could work together long-term. Q32274590