好友
阅读权限10
听众
最后登录1970-1-1
|
如题,用贝壳网的新房信息做可视化分析
代码编写平台:jupyter notebook
选用数据:贝壳网广州地区的房源信息
第一部分:爬取房源信息代码
这部分代码网上有很多
[Python] 纯文本查看 复制代码 import random
import requests
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
# Pool of browser User-Agent strings; one entry is chosen with random.choice()
# when the request headers are built.
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
# URL path segment for each Guangzhou district. spider() appends the value to
# its listing base URL; the key is used as the name of the CSV file it writes.
# Note that the key 'huangpu' maps to the path 'huangpugz/', not 'huangpu/'.
region = {
'nansha': 'nansha/',
'liwan': 'liwan/',
'yuexiu': 'yuexiu/',
'haizhu': 'haizhu/',
'tianhe': 'tianhe/',
'baiyun': 'baiyun/',
'huangpu': 'huangpugz/',
'panyu': 'panyu/',
'huadou': 'huadou/',
'zengcheng': 'zengcheng/',
'conghua': 'conghua/'
}
# Pick one User-Agent at random. The choice is made once, when this statement
# runs, so every request that passes `headers` sends the same User-Agent.
headers = {"User-Agent": random.choice(USER_AGENTS)}
def _fetch_soup(url):
    """Download *url* with the shared headers and parse it with the lxml parser."""
    response = requests.get(url, timeout=10, headers=headers)
    return BeautifulSoup(response.content, "lxml")


def _parse_listing(house_elem):
    """Extract ``(name, price, total, have_vr)`` from one listing element.

    Returns None when the listing has no name link, so the caller can skip it
    instead of aborting the whole district.
    """
    name_node = house_elem.find('a', class_='name')
    if name_node is None:
        return None
    loupan = name_node.text.replace("\n", "")

    # Unit price; '0' is the placeholder when the page shows no price.
    price_node = house_elem.find('span', class_="number")
    price = price_node.text.strip() if price_node is not None else '0'

    # Total price range with the Chinese unit labels stripped; '0' if absent.
    total_node = house_elem.find('div', class_="second")
    if total_node is None:
        total = '0'
    else:
        total = total_node.text.strip()
        for label in (u'总价', u'/套起', '(万/套)'):
            total = total.replace(label, '')

    # VR support is signalled by the presence of the VR icon element. The flag
    # is assigned for every listing, so it can never carry over a stale value
    # from the previous one.
    vr_node = house_elem.find('li', class_="icon vr vr-animation-forever")
    have_vr = 0 if vr_node is None else 1

    return loupan, price, total, have_vr


def spider(regions):
    """Crawl new-home listings and save one CSV file per district.

    Args:
        regions: a key of the module-level ``region`` dict (for example
            ``'tianhe'``) to crawl a single district, or ``'all'`` to crawl
            every district. Any other value matches nothing and the function
            returns without doing any work.

    For each crawled district, ``<key>.csv`` is written to the current working
    directory (UTF-8 with BOM, no index column). Each row holds the project
    name, unit price, total price and a 0/1 VR flag, in that order.

    Raises:
        Whatever ``requests.get`` raises on network failure, and
        AttributeError/TypeError/ValueError if the result counter is missing
        from the first page or is not an integer.
    """
    main_url = 'https://gz.fang.ke.com/loupan/'
    per_page = 10
    listing_class = "resblock-list post_ulog_exposure_scroll has-results"

    for key, values in region.items():
        if regions not in (key, 'all'):
            continue

        all_list = []
        soup = _fetch_soup(main_url + values)
        result_count = int(soup.find('span', class_="value").string)
        # Ceiling division: a partially filled last page still has to be
        # fetched. round() would drop it (14 results -> 1 page).
        page = -(-result_count // per_page)

        for i in range(1, page + 1):
            sleep(1.7)  # throttle between page requests
            soup = _fetch_soup(main_url + values + f'pg{i}')
            house_elements = soup.find_all('li', class_=listing_class)
            # The site stops returning listings after roughly 50 pages; stop
            # paginating as soon as a page comes back empty.
            if not house_elements:
                break
            for house_elem in house_elements:
                data = _parse_listing(house_elem)
                if data is not None:
                    all_list.append(data)

        # Save the district's rows as CSV through pandas.
        df = pd.DataFrame(all_list)
        df.to_csv(f"{key}.csv", index=False, encoding="utf_8_sig")
        print(f"{key}.csv保存完毕")
# Crawl every district listed in `region`; one <key>.csv is written per district.
spider('all')
第二部分:柱状图
[Python] 纯文本查看 复制代码 #对比不同区域支持vr看房的情况
# Bar chart: number of listings that support VR viewing, per district.
import os

import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.globals import CurrentConfig, NotebookType

# Select the notebook type and load the static JS assets from the online host.
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK
CurrentConfig.ONLINE_HOST = "https://assets.pyecharts.org/assets/"

# CSV file stem -> axis label. Pairing each label with its own file keeps the
# bars correctly labelled regardless of directory listing order, and unrelated
# CSV files in the working directory are ignored.
region_labels = {
    "baiyun": "白云",
    "conghua": "从化",
    "haizhu": "海珠",
    "huadou": "花都",
    "huangpu": "黄埔",
    "liwan": "荔湾",
    "nansha": "南沙",
    "panyu": "番禺",
    "tianhe": "天河",
    "yuexiu": "越秀",
    "zengcheng": "增城",
}
# Kept under its original name and order because a later cell reads `the_a`.
the_a = list(region_labels.values())

# District stem -> number of rows whose fourth column (the VR flag) equals 1.
dicts = {}
for stem in region_labels:
    path = f"{stem}.csv"
    if not os.path.isfile(path):
        continue  # this district has not been crawled
    try:
        # The files are written with a BOM (utf_8_sig), so read them the same way.
        df = pd.read_csv(path, encoding='utf-8-sig')
    except pd.errors.EmptyDataError:
        dicts[stem] = 0  # the district was crawled but had no listings
        continue
    dicts[stem] = int((df.iloc[:, 3] == 1).sum())

x_labels = [region_labels[stem] for stem in dicts]
value = list(dicts.values())

# Add the x and y axes and set the title.
bar = Bar()
bar.add_xaxis(x_labels)
bar.add_yaxis("各区支持VR", value)
bar.set_global_opts(title_opts=opts.TitleOpts(title="广州各区VR看房情况"))
bar.render_notebook()
第二部分:饼状图
[Python] 纯文本查看 复制代码 #每个区域所有房源房价的均值(元/㎡)
# Pie chart: mean unit price (yuan per square metre) of the listings in each district.
import os

import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Pie

# CSV file stem -> slice label. Each mean is paired with the label of the file
# it came from, so the result does not depend on directory listing order.
district_names = {
    "baiyun": "白云",
    "conghua": "从化",
    "haizhu": "海珠",
    "huadou": "花都",
    "huangpu": "黄埔",
    "liwan": "荔湾",
    "nansha": "南沙",
    "panyu": "番禺",
    "tianhe": "天河",
    "yuexiu": "越秀",
    "zengcheng": "增城",
}

avg_pairs = []
for stem, label in district_names.items():
    path = f"{stem}.csv"
    if not os.path.isfile(path):
        continue  # this district has not been crawled
    try:
        df = pd.read_csv(path, encoding='utf-8-sig')
    except pd.errors.EmptyDataError:
        continue  # the district was crawled but had no listings
    # The second column is the unit price. Non-numeric entries become NaN and
    # are dropped together with the crawler's '0' placeholder for a missing
    # price, so neither distorts the mean.
    prices = pd.to_numeric(df.iloc[:, 1], errors='coerce')
    prices = prices[prices > 0]
    if prices.empty:
        continue  # no usable price; skipping also avoids a division by zero
    avg_pairs.append((label, round(float(prices.mean()))))

# List of the per-district means, kept under its original name.
avg_count = [avg for _, avg in avg_pairs]

pie = Pie(init_opts=opts.InitOpts(width="600px", height="400px"))
pie.add("", data_pair=avg_pairs)
pie.set_global_opts(title_opts=opts.TitleOpts(title="广州各区域房价均值分布(元/㎡)"))
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))
pie.render_notebook()
第三部分:地区地图
[Python] 纯文本查看 复制代码 #统计广州各区房源的数量分布
import os

import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Map

# CSV file stem -> district name as it appears on the Guangzhou map. Each
# count is paired with the name of the file it came from, so the result does
# not depend on directory listing order and unrelated CSV files are ignored.
district_names = {
    "baiyun": "白云区",
    "conghua": "从化区",
    "haizhu": "海珠区",
    "huadou": "花都区",
    "huangpu": "黄埔区",
    "liwan": "荔湾区",
    "nansha": "南沙区",
    "panyu": "番禺区",
    "tianhe": "天河区",
    "yuexiu": "越秀区",
    "zengcheng": "增城区",
}

the_a = []
num_count = []
for stem, label in district_names.items():
    path = f"{stem}.csv"
    if not os.path.isfile(path):
        continue  # this district has not been crawled
    try:
        rows = pd.read_csv(path, encoding='utf-8-sig').shape[0]
    except pd.errors.EmptyDataError:
        rows = 0  # the district was crawled but had no listings
    the_a.append(label)
    num_count.append(rows)

# Keep the original upper bound of 200, but raise it when a district has more
# listings so that no district falls outside the legend range.
visual_max = max([200, *num_count])

# Show the distribution of listings on the district map.
maps = (
    Map()
    .add("", [list(z) for z in zip(the_a, num_count)], maptype='广州')
    .set_global_opts(
        visualmap_opts=opts.VisualMapOpts(max_=visual_max, is_piecewise=True),
        title_opts=opts.TitleOpts(title="广州各区房源数量分布"),
    )
)
maps.render_notebook()
首次用 Jupyter Notebook 按单元格分块编写代码,跟 PyCharm 的开发方式不同,但就做可视化而言还是很方便的
文章仅供交流,如有侵权请联系删帖 |
免费评分
-
查看全部评分
|