# 最近研究房产数据,抓取了安居客的数据,但发现部分数据有乱码,如何改进代码?
# (Forum question: scraped Anjuke housing data, some fields come back garbled — how to fix the decoding?)
import time
import chardet
import pandas as pd
import requests
from lxml import etree
# --- module-level state shared by main() ---
p = 0  # running count of records saved so far (printed as progress)
data_list = []  # accumulates one dict per scraped community
st = time.strftime("%Y-%m-%d", time.localtime())  # date stamp used in the output filename

header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
    # NOTE(review): the original post stripped the cookie value, leaving a
    # syntax error. Paste a real logged-in cookie here — without one anjuke
    # is likely to serve a captcha/verification page instead of data.
    'cookie': '',
}
def gethtml_detail(url):
    """Fetch *url* through a rotating proxy and return the decoded HTML.

    Retries up to 4 times, requesting a FRESH proxy on each attempt (the
    original fetched one proxy before the loop and kept reusing it even
    after deleting it from the pool).  Returns None when every attempt
    fails.

    Decoding prefers the charset requests infers from the page
    (``apparent_encoding``) and only falls back to raw chardet sniffing —
    sniffing alone can misdetect GB2312/GBK pages and is the likely cause
    of the garbled text (乱码) the original code produced.
    """
    retry_count = 4
    while retry_count > 0:
        proxy = get_proxy().get("proxy")  # new proxy per attempt
        try:
            response = requests.get(
                url,
                headers=header,
                # Route both schemes through the proxy; the original only
                # mapped "http", so https URLs silently bypassed the proxy.
                proxies={"http": "http://{}".format(proxy),
                         "https": "http://{}".format(proxy)},
                timeout=10,  # don't hang forever on a dead proxy
            )
            encoding = (response.apparent_encoding
                        or chardet.detect(response.content)['encoding']
                        or 'utf-8')
            text = response.content.decode(encoding, 'ignore')
            return text.replace("变卖价", "起拍价")
        except Exception:
            retry_count -= 1
            delete_proxy(proxy)  # drop the failing proxy from the pool
    return None
def area_link(html):
    """Extract the district links from the community index page.

    Parses *html*, reads the hrefs of the district navigation list and
    returns them as a list, skipping the first entry (the "all" link).
    Returns [] when the xpath matches nothing (e.g. a captcha page).
    """
    tree = etree.HTML(html)
    hrefs = tree.xpath('//*[@id="__layout"]/div/section/section[2]/div[1]/section/div/ul/li/a/@href')  # 链接列表 (link list)
    # Drop the first href (the aggregate/"all districts" entry).
    return list(hrefs[1:])
def area_second_content_link(html):
    """Extract the community detail-page links from one listing page.

    Parses *html* and returns the hrefs of every community card on the
    page as a list; [] when nothing matches (e.g. a captcha page).
    """
    tree = etree.HTML(html)
    hrefs = tree.xpath('//*[@id="__layout"]/div/section/section[3]/section/div[2]/a/@href')  # 链接列表 (link list)
    return list(hrefs)
# Proxy-pool API helpers (local proxy_pool service on port 5010).
def get_proxy():
    """Ask the local proxy-pool service for one proxy; returns its JSON dict."""
    # timeout keeps the whole scraper from hanging if the pool service is down
    return requests.get("http://127.0.0.1:5010/get/", timeout=5).json()
def delete_proxy(proxy):
    """Tell the proxy-pool service to remove a proxy that failed."""
    # timeout for the same reason as get_proxy: never block on a dead service
    requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy), timeout=5)
# 主程序 (main program)
def main():
    """Scrape every community detail page of the Jiangbei district listing.

    Walks listing pages p1..p29, follows each community link, extracts the
    basic-info fields into the global ``data_list`` and re-writes the Excel
    file after every record so a mid-run crash loses nothing.
    """
    global p
    # url = 'https://chongqing.anjuke.com/community'
    # html = gethtml_detail(url)
    # area_links = area_link(html)
    # for area in area_links:
    base_url = 'https://chongqing.anjuke.com/community/jiangbei/'
    page_urls = [base_url + "p{}/".format(page_no) for page_no in range(1, 30)]
    for page_url in page_urls:
        page_html = gethtml_detail(page_url)
        if page_html is None:  # every proxy retry failed — skip this page
            continue
        for detail_url in area_second_content_link(page_html):
            detail_html = gethtml_detail(detail_url)
            if detail_html is None:
                continue
            tree = etree.HTML(detail_html)

            def first(xp):
                """First stripped xpath hit, or '' when the field is absent
                (the original indexed [0] blindly and crashed on any page
                missing a dd entry)."""
                hits = tree.xpath(xp)
                return hits[0].strip() if hits else ''

            record = {}
            record['索引'] = ''
            record['小区名称'] = first('/html/body/div[2]/div[3]/div[1]/a/@title')
            record['小区地址'] = first('/html/body/div[2]/div[3]/div[1]/h1/span/text()')
            record['物业费'] = first('//*[@id="basic-infos-box"]/dl/dd[2]/text()')
            record['竣工时间'] = first('//*[@id="basic-infos-box"]/dl/dd[5]/text()')
            record['容积率'] = first('//*[@id="basic-infos-box"]/dl/dd[7]/text()')
            record['绿化率'] = first('//*[@id="basic-infos-box"]/dl/dd[8]/text()')
            record['开发商'] = first('//*[@id="basic-infos-box"]/dl/dd[9]/text()')
            record['物业公司'] = first('//*[@id="basic-infos-box"]/dl/dd[10]/text()')
            record['所属商圈'] = first('//*[@id="basic-infos-box"]/dl/dd[11]/text()')
            community_link = first('/html/body/div[2]/div[3]/div[2]/a[1]/@href')
            # SECURITY: the original ran eval() on scraped text — never eval
            # untrusted input.  Parse the trailing id segment safely instead.
            id_segment = community_link.rstrip('/').split('/')[-1]
            record['小区id'] = int(id_segment) if id_segment.isdigit() else id_segment
            record['小区链接'] = community_link
            data_list.append(record)

            df = pd.DataFrame(data_list)
            # vectorised 1-based index instead of a per-row .at loop
            df['索引'] = range(1, len(df) + 1)
            df.to_excel("C:/Users/Administrator/Desktop/Python/安居客/重庆住宅小区数据" + st + ".xlsx", index=False)
            p = p + 1
            print('第%s条数据已保存' % p)
            time.sleep(4)  # throttle requests to reduce the chance of a ban

if __name__ == '__main__':
    main()