from scrapy_splash import SplashRequest
from scrapy.spiders import Spider
from scrapy import Request
from lianjia.items import LianjiaItem
# Lua script executed by the Splash 'execute' endpoint: load the page,
# wait briefly for JavaScript to render, then return the final HTML.
# NOTE: this string is sent verbatim to Splash at runtime — do not reformat.
lua_script = """
function main(splash, args)
assert(splash:go(args.url))
assert(splash:wait(0.5))
return {
html = splash:html()
}
end
"""
class LianjiaSpider(Spider):
    """Spider for Lianjia (Yantai) second-hand housing listings.

    Pages are fetched through Splash (``endpoint='execute'`` with the
    module-level ``lua_script``) because the "next page" link is injected
    by JavaScript and invisible to a plain ``Request``.
    """

    name = "lianjia"
    # Entry point: Yantai second-hand housing listing page.
    url = 'https://yt.lianjia.com/ershoufang/'

    def start_requests(self):
        """Issue the initial Splash-rendered request for the first page."""
        yield SplashRequest(self.url,
                            callback=self.parse,
                            endpoint='execute',
                            args={
                                'lua_source': lua_script,
                                'images': 0,   # skip image loading for speed
                                'wait': 3,     # give JS time to render
                            },
                            cache_args=['lua_source'])

    def parse(self, response):
        """Yield one item per listing, then follow the next page if present.

        Fixes over the original:
        - A fresh ``LianjiaItem`` is created per listing; the original
          reused one mutable item across all yields, so every yielded item
          shared (and overwrote) the same state.
        - ``extract_first()`` returns ``None`` instead of raising
          ``IndexError``, replacing the bare ``except: pass`` that hid
          every error, and guarding the pager lookup that previously
          crashed ``parse`` when the page-box element was missing.
        """
        for one_selector in response.xpath("//ul[@class='sellListContent']/li"):
            name = one_selector.xpath(
                "div[@class='info clear']/div[@class='title']/a/text()"
            ).extract_first()
            price = one_selector.xpath(
                "div[@class='info clear']/div[@class='priceInfo']/div[1]/span/text()"
            ).extract_first()
            # Skip malformed <li> entries explicitly instead of swallowing
            # arbitrary exceptions.
            if name is None or price is None:
                continue
            item = LianjiaItem()
            item['name'] = name
            item['price'] = price
            yield item

        # Pagination: the last pager <a> reads "下一页" on every page except
        # the final one.
        pager_xpath = "//div[@class='page-box house-lst-page-box']/a[last()]"
        last_link_text = response.xpath(pager_xpath + "/text()").extract_first()
        if last_link_text == "下一页":
            href = response.xpath(pager_xpath + "/@href").extract_first()
            if href:
                yield SplashRequest("https://yt.lianjia.com" + href,
                                    callback=self.parse,
                                    endpoint='execute',
                                    args={
                                        'lua_source': lua_script,
                                        'images': 0,
                                        'wait': 3,
                                    },
                                    cache_args=['lua_source'])
链家二手房的“下一页”链接应该是通过 JS 动态加载的，如果使用普通的 Request 请求是获取不到下一页信息的，除非自己拼接链接并设置一个固定的页数上限。但这样只适合单一地区：换一个地区就容易出问题，比如烟台是 100 页，北京可能是 80 页，每换一个地区都要手动改一次页数，比较麻烦（虽然其实也不算太麻烦）。
后来发现可以通过 Splash 动态渲染页面，再请求 Splash 返回的页面来获取信息，这样就不怕拿不到下一页的链接了。