This is the first web crawler I've written while teaching myself, aimed at real-estate listings. The content I want to scrape prints out fine, but I can't get it saved to a file. The main problem is that I don't know how to write a dictionary into a text file, and I'm hoping the forum experts can help (a tentative approach is sketched after the code). Since it's my first attempt, the structure and ordering of the code are pretty messy; please go easy on me.
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os
Headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
if not os.path.exists('./链家房产信息'):
    os.makedirs('./链家房产信息')  # create a folder to hold the scraped Lianjia listing data

def get_pega_urls(url):
    respans = requests.get(url=url, headers=Headers)
    respans.encoding = respans.apparent_encoding
    soup = BeautifulSoup(respans.text, 'lxml')
    # each <a class="noresultRecommend"> on the list page links to one listing's detail page
    urls = soup.find_all('a', {'class': "noresultRecommend"})
    for i in urls:
        pega_urls = i['href']
        yemian = pega_info(pega_urls)
        print(yemian)
def pega_info(url):
    yemian = {}
    respan = requests.get(url, headers=Headers)
    respan.encoding = respan.apparent_encoding
    soup = BeautifulSoup(respan.text, 'lxml')
    # community (小区) name
    info = soup.select('div.communityName')
    xqmz = info[0].select('a')[0].get_text()
    yemian['小区名称: '] = xqmz
    # the basic-info block: each <li> under div.m-content is one attribute of the listing
    jbxx = soup.select('div.m-content')
    hx = jbxx[0].select('li')[0].get_text()
    yemian['房屋户型: '] = hx
    # total price: the two <span> tags under div.price hold the number and the unit
    price = soup.select('div.price')
    qian1 = price[0].select('span')[0].get_text()
    qian2 = price[0].select('span')[1].get_text()
    qian = qian1 + qian2
    yemian['价格: '] = qian
    lc = jbxx[0].select('li')[1].get_text()
    yemian['所在楼层: '] = lc
    mj = jbxx[0].select('li')[2].get_text()
    yemian['建筑面积: '] = mj
    jg = jbxx[0].select('li')[3].get_text()
    yemian['户型结构: '] = jg
    snmj = jbxx[0].select('li')[4].get_text()
    yemian['套内面积: '] = snmj
    lx = jbxx[0].select('li')[5].get_text()
    yemian['建筑类型: '] = lx
    cx = jbxx[0].select('li')[6].get_text()
    yemian['房屋朝向: '] = cx
    jg = jbxx[0].select('li')[7].get_text()
    yemian['建筑结构: '] = jg
    zxqk = jbxx[0].select('li')[8].get_text()
    yemian['装修情况: '] = zxqk
    th = jbxx[0].select('li')[9].get_text()
    yemian['梯户比例: '] = th
    gn = jbxx[0].select('li')[10].get_text()
    yemian['供暖方式: '] = gn
    bt = jbxx[0].select('li')[11].get_text()
    yemian['配备电梯: '] = bt
    return yemian
def spider():
    base_url = 'https://zz.lianjia.com/ershoufang/pg{}/'
    for i in range(1, 8):
        url = base_url.format(str(i))  # build the URL for list pages 1-7
        print('=*' * 30)
        print('开始爬取第' + str(i) + '页')  # announce which page is being crawled
        get_pega_urls(url)  # prints each listing's dict; nothing is written to disk yet

spider()
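Since pega_info already returns each listing as a dict (yemian), one way to get it onto disk is a small helper that appends the dict to a text file inside the 链家房产信息 folder. Below is a minimal sketch of that idea; the helper name save_yemian and the choice to name each file after the 小区名称 field are my own assumptions, not part of the original code.

import os
import re

def save_yemian(yemian, save_dir='./链家房产信息'):
    # hypothetical helper: append one listing dict to a UTF-8 text file
    # the file is named after the 小区名称 value, with characters Windows rejects replaced
    name = re.sub(r'[\\/:*?"<>|]', '_', yemian.get('小区名称: ', 'listing'))
    path = os.path.join(save_dir, name + '.txt')
    with open(path, 'a', encoding='utf-8') as f:
        for key, value in yemian.items():
            f.write(key + value + '\n')  # keys already end with ': ', so each line reads "字段: 值"
        f.write('=' * 30 + '\n')         # separator between listings that share a file

It could be called right next to the existing output, e.g. save_yemian(yemian) after print(yemian) inside get_pega_urls; if a machine-readable format is preferred, json.dump(yemian, f, ensure_ascii=False) into the same file would work just as well.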