使用代理爬取58同城指定城市整租房信息(不含：品牌公寓)

panison 发表于 2021-8-16 20:14

本帖最后由 panison 于 2021-8-16 20:18 编辑

本程序使用的代理IP提供商是代理精灵。
源代码中将代理精灵的添加白名单、删除白名单API地址中的敏感数据appid、appkey的值使用??????代替。可在【创建ProxyIpServer对象】相关代码中修改。
本人非计算机专业，自学的Python。算是爱好编程的小白。分享是为了交流学习。也欢迎大家提出改进意见。

"""
爬取58同城指定城市整租房信息(不含：品牌公寓)
__________________________________________
关于使用代理IP的问题
1、稳定高匿的代理IP需要收费。本程序使用的代理IP提供商是代理精灵。
2、稳定高匿的代理IP对于爬虫项目的成功实施非常关键
3、使用代理IP时程序可能报错：ValueError: check_hostname requires server_hostname
解决办法：将urllib3降级。pip3 install urllib3==1.25.11
__________________________________________
V1.0版本：
1、统一设置：请求的间隔时间,代理IP的API调用间隔,请求出错的尝试次数
2、去重问题：定义了代理IP去重间隔，提高代理IP利用效率
3、CSV文件：如果爬取的字符中有半角逗号，那么替换成全角逗号
__________________________________________
"""
import time
import random
import urllib3
from lxml import etree
from selenium import webdriver
import requests

# Suppress the warnings urllib3 raises for requests sent with certificate verification turned off.
urllib3.disable_warnings()

# Host parts of the proxy addresses that have been accepted for use, in order of use.
ip_pool_list = []

# Proxy addresses obtained from the provider's API that have not been handed out yet.
proxy_ip_list = []

# Collected rental records; the first row is the column header of the exported file.
house_rent_info = [
    [
        "省/自治区/直辖市",
        "市",
        "区域分支",
        "区域次级分支",
        "小区名称",
        "房源更新时间",
        "房源标题",
        "房源租赁方式",
        "房源价格数值",
        "房源价格单位",
        "房源面积数值",
        "房源面积单位",
        "房源户型",
        "房源装修情况",
        "房源详细地址",
        "房源URL",
    ],
]

class ProxyIpServer:
    """Small client for a proxy-IP provider's HTTP API.

    Attributes:
        name: Display name of the provider.
        add_white_ip_api: URL that adds an IP to the provider's whitelist.
        del_white_ip_api: URL that removes IPs from the provider's whitelist.
        proxy_ip_api: URL that returns a batch of proxy addresses.
        time_interval: Seconds to sleep before every proxy-list request.
    """

    def __init__(self, name, add_white_ip_api, del_white_ip_api, proxy_ip_api, time_interval):
        self.name = name
        self.add_white_ip_api = add_white_ip_api
        self.del_white_ip_api = del_white_ip_api
        self.proxy_ip_api = proxy_ip_api
        self.time_interval = time_interval

    @staticmethod
    def _get_json(url):
        """GET *url* with a 5-second timeout and return the decoded JSON body."""
        return requests.get(url=url, timeout=5).json()

    def add_white_ip(self):
        """Call the add-to-whitelist API and return its JSON reply."""
        return self._get_json(self.add_white_ip_api)

    def del_white_ip(self):
        """Call the remove-from-whitelist API and return its JSON reply."""
        return self._get_json(self.del_white_ip_api)

    def get_proxy_ips(self):
        """Sleep ``time_interval`` seconds, then return the proxy-list API's JSON reply."""
        # The pause comes first so that back-to-back calls are always spaced out.
        time.sleep(self.time_interval)
        return self._get_json(self.proxy_ip_api)

def get_client_ip(proxy_ip=""):
    """Ask httpbin.org which public IP the request appears to come from.

    Args:
        proxy_ip: ``"host:port"`` of a proxy to send the probe through, or
            ``""`` (the default) to connect directly.

    Returns:
        A ``(ip, ok)`` tuple. Through a proxy, any failure (malformed address,
        network error, unexpected reply) yields ``("0.0.0.0", False)``.

    Raises:
        Whatever ``requests`` / JSON decoding raises when the *direct* probe
        fails; only the proxied probe is treated as best-effort.
    """
    url = "https://httpbin.org/ip"
    if proxy_ip == "":
        r = requests.get(url=url, verify=False, timeout=5)
        return r.json()["origin"], True
    try:
        host, port = proxy_ip.split(":")[:2]
        # The proxy itself is addressed with http:// for both target schemes.
        # An https:// proxy URL makes urllib3 >= 1.26 attempt TLS to the proxy,
        # which fails with "check_hostname requires server_hostname".
        proxy_url = "http://{}:{}".format(host, port)
        proxies = {"https": proxy_url, "http": proxy_url}
        r = requests.get(url=url, verify=False, proxies=proxies, timeout=5)
        return r.json()["origin"], True
    except Exception:
        # Deliberately broad: an unusable proxy is an expected outcome here.
        return "0.0.0.0", False

def check_proxy_ip(proxy_ip, local_ip, used_ips_interval):
    """Decide whether *proxy_ip* may be used for the next request.

    A proxy is usable when the probe through it succeeds, the IP seen by the
    probe target differs from *local_ip*, and its host has either never been
    used or was last used at least *used_ips_interval* accepted uses ago.
    A host that passes the reuse test is appended to ``ip_pool_list`` even if
    the anonymity test failed.

    Args:
        proxy_ip: Proxy address as ``"host:port"``.
        local_ip: This machine's own public IP.
        used_ips_interval: Minimum number of pool entries that must follow the
            host's last use before it may be reused.

    Returns:
        ``True`` if the proxy is usable, otherwise ``False``.
    """
    mask_ip = proxy_ip.split(":")[0]
    # Probe once and keep both results, so the IP and the success flag always
    # describe the same request.
    client_ip, connected = get_client_ip(proxy_ip)
    print("\xa0\xa0代理IP{}连接标志:{}".format(proxy_ip, connected), end=";")
    if not connected:
        flag1 = flag2 = False
    else:
        # Anonymity test: the target must not see our own address.
        flag1 = client_ip != local_ip
        print("代理IP高匿标志:{}".format(flag1), end=";")
        if mask_ip not in ip_pool_list:
            ip_pool_list.append(mask_ip)
            flag2 = True
            print("代理IP完全去重标志:{}".format(flag2), end=";")
        else:
            # Position in the reversed pool = number of uses since this host
            # was last accepted.
            uses_since_last = ip_pool_list[::-1].index(mask_ip)
            flag2 = uses_since_last >= used_ips_interval
            if flag2:
                ip_pool_list.append(mask_ip)
            print("代理IP相对去重标志:{}".format(flag2), end=";")
    usable = flag1 and flag2
    print("代理IP可用标志:{}".format(usable))
    return usable

def get_proxy_ips_list(proxy_ip_server, local_ip):
    """Fetch a batch of proxy addresses from the provider.

    Args:
        proxy_ip_server: Object whose ``get_proxy_ips()`` returns a mapping
            with a ``"data"`` list of ``{"IP": ..., "Port": ...}`` entries.
        local_ip: Fallback placed in the result when no proxy is available.

    Returns:
        A non-empty list of ``"host:port"`` strings, or ``[local_ip]`` when the
        provider call fails or returns no entries.
    """
    try:
        entries = proxy_ip_server.get_proxy_ips()["data"]
        ls = [entry["IP"] + ":" + str(entry["Port"]) for entry in entries]
    except Exception:
        # Best effort: any provider/network/schema problem means "no proxies".
        ls = []
    if not ls:
        # The local IP acts as a marker telling the caller to connect directly.
        ls.append(local_ip)
    print("\xa0\xa0本次获取的代理IP列表:[{}]".format(" , ".join(ls)))
    return ls

def get_proxy_ip(proxy_ip_server, local_ip):
    """Return the next unused proxy address, refilling the queue when empty.

    Args:
        proxy_ip_server: Provider client passed on to ``get_proxy_ips_list``.
        local_ip: Fallback address passed on to ``get_proxy_ips_list``.

    Returns:
        The first entry of the module-level ``proxy_ip_list``, which is removed
        from that list.
    """
    global proxy_ip_list
    if not proxy_ip_list:
        proxy_ip_list = get_proxy_ips_list(proxy_ip_server, local_ip)
    print("\xa0\xa0其中未用的代理IP列表:[{}]".format(" , ".join(proxy_ip_list)))
    return proxy_ip_list.pop(0)

# Request headers to pick from at random for each attempt.
_UA_HEADERS = (
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0'},
    {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"},
)


def _request_with_retries(url, request_times, sleep_seconds, label, border, **request_kwargs):
    """Request *url* up to *request_times* times through one route.

    Returns ``(response, True)`` on the first HTTP 200, ``("", False)`` when
    the final attempt raised, and ``None`` when the attempts ran out and the
    last one completed with a non-200 status.
    """
    for n in range(1, request_times + 1):
        time.sleep(sleep_seconds)
        print("\xa0\xa0{}第{}/{}次发起请求".format(label, n, request_times), end=";")
        try:
            r = requests.get(url=url, headers=random.choice(_UA_HEADERS), timeout=5, **request_kwargs)
        except Exception:
            print("请求失败!")
            if n == request_times:
                print(border)
                return "", False
            continue
        if r.status_code == 200:
            print("请求成功!\n" + border)
            return r, True
    return None


def get_response(url, proxy_ip_server, local_ip, request_times, sleep_seconds, used_ips_interval):
    """Fetch *url*, rotating through proxies until a definitive result is reached.

    Args:
        url: Address to request.
        proxy_ip_server: Provider client used to obtain proxy addresses.
        local_ip: This machine's public IP; when the proxy queue yields it,
            the request is sent directly.
        request_times: Maximum attempts per route.
        sleep_seconds: Pause before every attempt.
        used_ips_interval: Reuse distance handed to ``check_proxy_ip``.

    Returns:
        ``(response, True)`` on HTTP 200, or ``("", False)`` when the last
        attempt on a route raised. Routes that only produce non-200 replies
        are abandoned and the next address is tried.
    """
    border = "\xa0\xa0" + "*" * 100
    print(border)
    print("\xa0\xa0URL:{}".format(url))
    while True:
        proxy_ip = get_proxy_ip(proxy_ip_server, local_ip)
        if proxy_ip == local_ip:
            # No proxy available: connect directly.
            outcome = _request_with_retries(
                url, request_times, sleep_seconds, "本机IP" + proxy_ip, border
            )
            if outcome is not None:
                return outcome
            continue
        # The proxy is re-validated before every round of attempts.
        while check_proxy_ip(proxy_ip, local_ip, used_ips_interval):
            host, port = proxy_ip.split(":")[:2]
            # http:// is used for the proxy hop in both entries; an https://
            # proxy URL makes urllib3 >= 1.26 try TLS to the proxy itself.
            proxy_url = "http://{}:{}".format(host, port)
            outcome = _request_with_retries(
                url, request_times, sleep_seconds, "代理IP" + host, border,
                proxies={"https": proxy_url, "http": proxy_url}, verify=False,
            )
            if outcome is not None:
                return outcome

def get_cities(url):
    """Load the 58.com city-switch page in Chrome and collect city home pages.

    Args:
        url: Address of the city index page.

    Returns:
        ``{province: {city: [city_url]}}``. A ``"直辖市"`` entry with the four
        municipalities is added, and the ``"其他"`` / ``"海外"`` groups are
        removed if present.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        html_source = driver.page_source
    finally:
        # Always close the browser, even if loading the page fails.
        driver.quit()

    # NOTE(review): the pasted source had lost its subscripts here
    # (``dic_cities = dict()``, ``city_urls = "https:" + city_urls``); the
    # nesting below is reconstructed from the shape of the "直辖市" entry —
    # confirm against the author's original.
    dic_cities = {}
    for letter in etree.HTML(html_source).xpath("//div[@class='content-letter']"):
        for content_province in letter.xpath("div[@class='content-province']"):
            province_name = content_province.xpath("div[@class='content-province-title']/text()")[0]
            cities = dic_cities[province_name] = {}
            for content_city in content_province.xpath("div[@class='content-cities']"):
                city_names = content_city.xpath("a/text()")
                # Presumably the hrefs are protocol-relative ("//xx.58.com/") — verify.
                city_urls = ["https:" + href for href in content_city.xpath("a/@href")]
                for city_name, city_url in zip(city_names, city_urls):
                    cities[city_name] = [city_url]

    # Add the municipalities under one group.
    dic_cities["直辖市"] = {
        "北京": ["https://bj.58.com/"], "天津": ["https://tj.58.com/"],
        "上海": ["https://sh.58.com/"], "重庆": ["https://cq.58.com/"]
    }
    # Drop the overseas groups; tolerate their absence from the page.
    dic_cities.pop("其他", None)
    dic_cities.pop("海外", None)
    return dic_cities

def _choose_from_menu(names, prompt):
    """Print *names* as a numbered list (ten per row) and return the one picked on stdin."""
    print("\xa0\xa0", end="")
    for i, name in enumerate(names):
        print(i, name, end="、")
        if (i + 1) % 10 == 0:
            print("\n\xa0\xa0", end="")
    by_index = {str(i): name for i, name in enumerate(names)}
    while True:
        print("\n")
        answer = input(prompt)
        if answer in by_index:
            return by_index[answer]
        print("\xa0\xa0输入序号不在参照表范围，请重新输入！")


def get_inputs(dic):
    """Let the user pick a province-level group and then one of its cities.

    Args:
        dic: ``{province: {city: [city_url]}}`` as returned by ``get_cities``.

    Returns:
        ``(province, city, city_url)`` where ``city_url`` is the first URL
        stored for the chosen city.
    """
    # NOTE(review): the pasted source indexed ``dic`` without ``[province]`` /
    # ``[city]`` (stripped by the forum); restored so that the second menu
    # lists the chosen province's cities and a URL string is returned.
    print("\xa0\xa0[省/自治区/直辖市名称参照表:", end="\n\n")
    province = _choose_from_menu(list(dic), "\xa0\xa0请输入[省/自治区/直辖市序号:")
    print("\n\xa0\xa0{}所属城市名称参照表:".format(province), end="\n\n")
    city = _choose_from_menu(list(dic[province]), "\xa0\xa0请输入{}所属城市序号:".format(province))
    return province, city, dic[province][city][0]

def get_page_count(r):
    """Return the number of index pages read from the pager of response *r*.

    The value is the second-to-last ``<span>`` text found under
    ``div.pager``, converted to ``int``.
    """
    pager_labels = etree.HTML(r.text).xpath("//div[@class='pager']//span/text()")
    last_page_label = pager_labels[-2]
    return int(last_page_label)

def get_house_cell_urls_in_one_page(r):
    """Collect the detail-page URLs listed on one index page.

    Args:
        r: Response object of an index page (its ``text`` is parsed as HTML).

    Returns:
        A list with the first ``div.des/h2/a`` href of every ``<li>`` that has
        one; list items without such a link are skipped.
    """
    list_url = []
    for house_cell in etree.HTML(r.text).xpath("//li"):
        hrefs = house_cell.xpath("div[@class='des']/h2/a/@href")
        # "//li" matches every list item on the page, so a missing link must
        # not abort the whole page.
        if hrefs:
            list_url.append(hrefs[0])
    return list_url

def get_house_cell_detail(r):
    """Extract one rental record from a detail-page response.

    Args:
        r: Response object of a detail page (its ``text`` is parsed as HTML).

    Returns:
        A 16-item list in the column order of the ``house_rent_info`` header.
        Province, city and URL (items 0, 1 and -1) are left as ``""`` for the
        caller to fill in.

    Raises:
        IndexError: If a required element is missing from the page.
    """
    # Parse once; the original re-parsed the document for every field.
    tree = etree.HTML(r.text)

    # Page title, printed for debugging only.
    html_title = tree.xpath("//title/text()")[0].strip()
    print("\xa0\xa0详情页标题:{}(此信息调试用)".format(html_title))

    # Half-width commas are replaced so the value cannot split a CSV column.
    house_title = tree.xpath("//div[@class='house-title']/h1/text()")[0].strip().replace(",", "，")
    print("\xa0\xa0房源标题:{}".format(house_title))
    house_update_time = tree.xpath("//div[@class='house-title']/p/text()")[1].strip().replace("\xa0", "").replace(" ", "").replace("\n", "")
    print("\xa0\xa0房源更新时间:{}".format(house_update_time))
    house_price = tree.xpath("//div/span/b/text()")[0].strip()
    house_price_per = tree.xpath("//div/span/text()")[0].strip().replace(" ", "").replace("\n", "")
    print("\xa0\xa0房源价格:{}{}".format(house_price, house_price_per))

    # NOTE(review): rent mode, layout, area, decoration and address are all
    # read from the first match of the same XPath, and community/district from
    # the first match of another. The selectors look as if predicates were
    # lost in the paste; they are kept verbatim — confirm against the live page.
    span_text = tree.xpath("//div//li/span/text()")[0]
    house_rent_mode = span_text.strip()
    print("\xa0\xa0房源租赁方式:{}".format(house_rent_mode))
    spec_parts = span_text.split("\xa0\xa0")
    house_unit = spec_parts[0].strip()
    print("\xa0\xa0房源户型:{}".format(house_unit))
    area_text = spec_parts[1].strip()
    # The last character is taken as the unit, the rest as the number.
    house_area = area_text[0:-1].strip()
    house_area_per = area_text[-1]
    print("\xa0\xa0房源面积:{}{}".format(house_area, house_area_per))
    house_decorate = spec_parts[2].strip()
    print("\xa0\xa0房源装修情况:{}".format(house_decorate))

    link_texts = tree.xpath("//div//li/span/a/text()")
    house_location = link_texts[0].strip()
    print("\xa0\xa0房源所在小区:{}".format(house_location))
    house_division_branch = link_texts[0].strip()
    print("\xa0\xa0区域分支:{}".format(house_division_branch))
    # The second-level district is optional.
    house_division_subbranch = link_texts[1].strip() if len(link_texts) > 1 else "NULL"
    print("\xa0\xa0区域次级分支:{}".format(house_division_subbranch))
    house_detail_address = span_text.strip().replace(",", "，")
    print("\xa0\xa0房源详细地址:{}".format(house_detail_address))
    print("\n")

    # Province, city and URL are filled in by the caller.
    province_name = ""
    city_name = ""
    house_cell_url = ""
    return [
        province_name, city_name, house_division_branch, house_division_subbranch, house_location,
        house_update_time, house_title, house_rent_mode, house_price, house_price_per,
        house_area, house_area_per, house_unit, house_decorate, house_detail_address, house_cell_url,
    ]

def main_job(tuple_city):
    """Scrape every whole-rent listing of one city into ``house_rent_info``.

    The local IP is added to the proxy provider's whitelist first and removed
    again when the job ends, including when it ends with an exception.

    Args:
        tuple_city: ``(province, city, city_url)`` as returned by ``get_inputs``.

    Returns:
        None. Rows are appended to the module-level ``house_rent_info``.
    """
    # ---- settings ----
    local_ip = get_client_ip()[0]
    # Pause before each request (currently always 0).
    sleep_seconds = random.randint(0, 0) / 100
    # Attempts per URL before giving up.
    request_times = 20
    # Reuse distance for proxy hosts.
    used_ips_interval = 10

    # ---- proxy provider (replace the ?????? placeholders with real credentials) ----
    pis_name = "代理IP提供商"
    pis_add_white_ip_api = "http://www.jinglingdaili.com/Users-whiteIpAddNew.html?appid=??????&appkey=??????&type=dt&whiteip=" + local_ip + "&index="
    pis_del_white_ip_api = "http://www.jinglingdaili.com/Users-whiteIpDelNew.html?appid=??????&appkey=??????&type=dt&whiteip=all"
    pis_proxy_ip_api = "http://t.ipjldl.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=0&fa=0&fetch_key=&groupid=0&qty=4&time=100&pro=&city=&port=1&format=json&ss=5&css=&dt=1&specialTxt=3&specialJson=&usertype=2"
    pis_time_interval = 5  # seconds between proxy-list API calls
    pis = ProxyIpServer(pis_name, pis_add_white_ip_api, pis_del_white_ip_api, pis_proxy_ip_api, pis_time_interval)

    def fetch(url):
        """Return the response object for *url* (``""`` when the fetch failed)."""
        return get_response(url, pis, local_ip, request_times, sleep_seconds, used_ips_interval)[0]

    msg_add_white_ip = pis.add_white_ip()["msg"]
    print("\xa0\xa0{}:IP白名单{}。\n".format(pis_name, msg_add_white_ip))

    try:
        house_rent_city_url = tuple_city[2] + "zufang/"
        page_count = get_page_count(fetch(house_rent_city_url))

        for page in range(1, page_count + 1):
            house_rent_city_page_url = house_rent_city_url + "pn" + str(page) + "/"

            # Collect the detail URLs of this index page, retrying on failure.
            urls = []
            for q in range(1, request_times + 1):
                print("\xa0\xa0获取该索引页信息:第{}/{}次:".format(q, request_times))
                try:
                    urls = get_house_cell_urls_in_one_page(fetch(house_rent_city_page_url))
                except Exception:
                    # Best effort: a failed fetch or parse counts as "no URLs".
                    urls = []
                if urls:
                    print("\xa0\xa0单个索引页中整租房记录的URL列表获取成功!")
                    break
                if q < request_times:
                    print("\xa0\xa0单个索引页中整租房记录的URL列表获取失败!15秒后重新获取!")
                    time.sleep(15)
                else:
                    print("\xa0\xa0单个索引页中整租房记录的URL列表获取失败!将略过此索引页!")

            # Visit each detail page. The pasted source passed the whole
            # ``urls`` list here because its ``[i]`` subscript had been stripped.
            for i, detail_url in enumerate(urls, start=1):
                print("\xa0\xa0整租房页面记录定位:{}省{}市，当前第[{}/{}]页，页面URL:{}。当前页第[{}/{}]条数据。".format(tuple_city[0], tuple_city[1], page, page_count, house_rent_city_page_url, i, len(urls)))
                print("\xa0\xa0整租房页面详情地址:{}".format(detail_url))
                for j in range(1, request_times + 1):
                    print("\xa0\xa0获取该详情页信息:第{}/{}次:".format(j, request_times))
                    try:
                        line = get_house_cell_detail(fetch(detail_url))
                    except Exception:
                        # Best effort: retry, and finally skip, a page that
                        # cannot be fetched or parsed.
                        if j < request_times:
                            print("\xa0\xa0获取该详情页信息失败!15秒后重新获取!")
                            time.sleep(15)
                        else:
                            print("\xa0\xa0获取该详情页信息失败!将略过此详情页!")
                        continue
                    line[0] = tuple_city[0]
                    line[1] = tuple_city[1]
                    line[-1] = detail_url
                    house_rent_info.append(line)
                    break
        print("\xa0\xa058同城{}{}整租房信息获取完毕。\n".format(tuple_city[0], tuple_city[1]))
    finally:
        # Never leave the local IP on the provider's whitelist.
        msg_del_white_ip = pis.del_white_ip()["msg"]
        print("\xa0\xa0{}:IP白名单{}。\n".format(pis_name, msg_del_white_ip))

def main():
    """Run the scraper: load the city list, prompt for a city, scrape it, export CSV.

    The output file is named ``58同城<province><city>整租房信息.csv`` and is
    written as UTF-8 to the current working directory.
    """
    import csv  # needed only for the export step

    # ---- city list ----
    url_sort = "https://www.58.com/changecity.html?fullpath=0"

    print("1、获取58同城境内城市URL地址……\n")
    dic_cities = get_cities(url_sort)
    print("\xa0\xa058同城境内城市URL地址获取完毕!\n")

    print("2、选择指定城市……\n")
    tuple_city = get_inputs(dic_cities)
    province_name, city_name, _ = tuple_city
    print("\n\xa0\xa0您选择的是[{}][{}]。\n".format(province_name, city_name))

    # ---- scrape ----
    print("3、获取58同城{}{}整租房信息:\n".format(province_name, city_name))
    main_job(tuple_city)

    # ---- export ----
    print("4、将58同城{}{}整租房信息写入文件:\n".format(province_name, city_name))
    with open("58同城{}{}整租房信息.csv".format(province_name, city_name), "w", encoding="utf-8", newline="") as f:
        # csv.writer quotes any field that contains a comma, quote or newline,
        # so such a value can no longer shift the columns of its row.
        csv.writer(f, lineterminator="\n").writerows(house_rent_info)
    print("\xa0\xa058同城{}{}整租房信息写入完毕!\n".format(province_name, city_name))

# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()

githubi 发表于 2021-8-16 22:46

复制一下，研究研究

风在这里停 发表于 2021-8-17 00:07

可不可以出个成品，小白表示看不懂啊

Vickie9 发表于 2021-8-17 00:37

备注的好详细

bachelor66 发表于 2021-8-17 09:21

虽然看不懂，但备注的真的很详细。

洪达盛 发表于 2021-9-11 18:07

怎么用啊这个

yanjiangkun 发表于 2022-8-5 12:33

老哥，能帮我弄一下这个不，可以长期合作，Q32274590

页: [1]

吾爱破解 - 52pojie.cn's Archiver

使用代理爬取58同城指定城市整租房信息(不含：品牌公寓)