Sharing a scraper: fetching new-home listings from the real-estate site anjuke
Last edited by 龙小 on 2020-3-21 08:20.
[Screenshot of the scraped output]
Now for the main course. The gist: fetch the for-sale listing info and save it to a file.
import urllib.request
from bs4 import BeautifulSoup
import time
import os

class spiderdown(object):
    """Scrapes new-home listings from anjuke and saves them to a daily file."""
    _headers = ''
    _url = ''
    _citylist = []

    def set_headers(self, headers):
        self._headers = headers

    def set_url(self, url, citylist):
        self._url = url
        self._citylist = citylist
    def down_load_page(self, url):
        try:
            # Room for improvement: pick up the browser version automatically
            # instead of hard-coding it (a sketch follows the class below).
            headers = self._headers or {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'}
            req = urllib.request.Request(url=url, headers=headers)
            return urllib.request.urlopen(req, timeout=30)
        except Exception:
            print('Failed to open URL: (%s)' % url)
            return None
    def parse_html(self, html):
        # The site marks prices with either 'price-txt' or 'price'; normalize first.
        response1 = html.read().decode('utf-8').replace('price-txt', 'price')
        soup = BeautifulSoup(response1, 'html.parser')
        listinfo = soup.find_all(class_='infos')
        listprice = soup.find_all(class_='favor-pos')
        infolist = []
        for number in range(min(len(listinfo), len(listprice))):
            info = listinfo[number]
            price = listprice[number]
            # Fields are joined with ';' -> name;location;status tags;group mark;price
            name = info.select_one('.items-name')
            houseinfo = (name.get_text() if name else 'default') + ';'
            location = info.select_one('.list-map')
            houseinfo += (location.get_text() if location else 'default') + ';'
            for status in info.select('.status-icon'):
                houseinfo += status.get_text() + ','
            houseinfo += ';'
            mark = info.select_one('.group-mark')
            houseinfo += (mark.get_text() if mark else 'default') + ';'
            span = price.select_one('span')
            houseinfo += span.get_text() if span else '-1'
            print('Fetched a city housing listing: %s' % houseinfo)
            infolist.append(houseinfo + '\n')
        return infolist
    # Output is saved to one file per day.
    def save_file(self, infolist):
        filename = 'house_info_' + time.strftime('%Y%m%d', time.localtime())
        try:
            os.makedirs('data', exist_ok=True)
            with open(os.path.join('data', filename), 'a', encoding='utf-8') as fp:
                fp.writelines(infolist)
        except Exception as e:
            print('Failed to save the file, please check the error: %s' % str(e))
    def run_spider(self):
        # 1. Build one listing URL per city (anjuke).
        url_set = set()
        for city in self._citylist:
            url = self._url + city + '/'
            print('Built a page URL: %s' % url)
            url_set.add(url)
        # 2. Fetch each URL and parse the page.
        for url in url_set:
            print('Crawling [%s]...' % url)
            html = self.down_load_page(url)
            if html is None:
                continue
            if html.getcode() == 200:
                info_list = self.parse_html(html)
                self.save_file(info_list)
                print('Batch done, sleeping for 5 seconds...')
                time.sleep(5)
            else:
                print('The page did not open correctly, return code: (%d)' % html.getcode())
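About the TODO in down_load_page: one lightweight way to avoid a single hard-coded browser version is to rotate through a small pool of User-Agent strings. A minimal sketch of that idea (the pool and the random_headers helper are illustrative, not part of the original code):

import random

# Hypothetical helper: pick a User-Agent at random so requests
# don't all carry the same hard-coded browser signature.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 '
    '(KHTML, like Gecko) Version/13.0 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
]

def random_headers():
    return {'User-Agent': random.choice(USER_AGENTS)}

# Usage with the class above:
# jiwuApp.set_headers(random_headers())

Since set_headers stores whatever you pass in, down_load_page will then use those headers instead of the built-in default.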
And here is the app that drives it:
from spiderdown import spiderdown

if __name__ == '__main__':
    try:
        jiwuApp = spiderdown()
        base_url = 'https://xm.fang.anjuke.com/loupan/'
        city_list = ['tongan', 'xiangan', 'jimei', 'haicang', 'zhangzhougang',
                     'siming', 'huli', 'jiaomei', 'quanzhou', 'xiamenzhoushi']
        jiwuApp.set_url(base_url, city_list)
        jiwuApp.run_spider()
    except Exception as e:
        print('Crawl failed, error: %s' % str(e))
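Each saved line is semicolon-separated: name;location;status tags;group mark;price, as built in parse_html. A minimal sketch for reading today's file back, assuming the data directory and house_info_YYYYMMDD naming used by save_file above:

import os
import time

# Read back today's output file written by save_file.
filename = os.path.join('data', 'house_info_' + time.strftime('%Y%m%d', time.localtime()))
with open(filename, encoding='utf-8') as fp:
    for line in fp:
        # Five fields per record, separated by ';'.
        name, location, status, group, price = line.rstrip('\n').split(';')
        print('%-20s %-30s price: %s' % (name, location, price))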
If you like it, give this old man a like. Haha.
getserver posted on 2020-3-20 22:46:
Doesn't this support multithreading?

The data volume is small, so multithreading isn't necessary. Python's threads also have a well-known weakness (they share one interpreter lock), so they don't always help; multiprocessing is the better choice. A rough sketch of that is below.
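A minimal sketch of that multiprocessing route, assuming the spiderdown class from the post (the crawl_city helper is illustrative); note that all workers append to the same daily file, so lines from different cities may interleave:

from multiprocessing import Pool
from spiderdown import spiderdown

BASE_URL = 'https://xm.fang.anjuke.com/loupan/'

def crawl_city(city):
    # Each worker process gets its own spider and crawls a single city.
    spider = spiderdown()
    spider.set_url(BASE_URL, [city])
    spider.run_spider()

if __name__ == '__main__':
    cities = ['tongan', 'xiangan', 'jimei', 'haicang']
    with Pool(processes=4) as pool:
        pool.map(crawl_city, cities)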
海洋 posted on 2020-5-17 13:38:
Where does the scraped data end up?

You can check save_file above: it writes one house_info_YYYYMMDD file per day under the data directory.

xiaoz165748 posted on 2020-3-20 22:39:
Hi OP, how do I use this?

Just install Python 3.7 and Beautiful Soup (pip install beautifulsoup4) and run it.

Thanks for sharing this scraper. Learned something, will give it a try.