I used Python to crawl all of the books under Suning's book category, grabbing each book's name, image, seller, review count, and detail-page URL. The problem is that the price element is added to the DOM dynamically, so I don't know how to get it with Scrapy.
[Screenshot: the book category listing page]
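A quick way to see the problem: fetch the listing page in scrapy shell and query the price node. An empty result means the price markup is filled in by JavaScript after the page loads, so no XPath against the raw response can ever return it.

scrapy shell 'http://list.suning.com/0-502282-0.html'
>>> response.xpath("//div[@class='product-list clearfix']//ul/li[1]//div[@class='price-box']//text()").extract()
[]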
This is the item model:
import scrapy


class SnbookItem(scrapy.Item):
    # Fields for one category link
    classTitle = scrapy.Field()
    classUrl = scrapy.Field()


class SnbookDetail(scrapy.Item):
    # Fields for one book on a listing page
    goodsName = scrapy.Field()
    goodsImg = scrapy.Field()
    goodsPingjia = scrapy.Field()
    shopName = scrapy.Field()
    goodsUrl = scrapy.Field()
    goodsPrice = scrapy.Field()
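For reference (not part of the project yet), here is a minimal pipeline sketch that would persist these items as JSON lines; the class and file names are assumptions:

import json

class SnbookPipeline(object):
    # Minimal sketch: append every scraped item to a JSON-lines file.
    def open_spider(self, spider):
        self.file = open('books.jl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

It would be enabled with ITEM_PIPELINES = {'snbook.pipelines.SnbookPipeline': 300} in settings.py.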
This is the spider code:
import re

import scrapy

from snbook.items import SnbookItem, SnbookDetail


class SubookSpider(scrapy.Spider):
    name = 'subook'
    allowed_domains = ['suning.com']
    start_urls = ['http://list.suning.com/0-502282-0.html']

    # Collect every first-level book category link
    def parse(self, response):
        for link in response.xpath("//div[@class='all-class clearfix']//a"):
            item = SnbookItem()
            item['classUrl'] = 'http:' + link.xpath("./@href").extract_first()
            item['classTitle'] = link.xpath("./@title").extract_first()
            yield scrapy.Request(
                item['classUrl'],
                callback=self.parse_book_list,
                meta={'item': item}
            )

    # Collect every second-level category link
    def parse_book_list(self, response):
        for link in response.xpath("//div[@class='all-class clearfix']//a"):
            item = SnbookItem()
            item['classUrl'] = 'http:' + link.xpath("./@href").extract_first()
            item['classTitle'] = link.xpath("./@title").extract_first()
            yield scrapy.Request(
                item['classUrl'],
                callback=self.parse_book_detail,
                meta={'item': item}
            )

    # Scrape the books on one category listing page
    def parse_book_detail(self, response):
        # Iterate over every product <li>, not just the first match on the page
        for product in response.xpath("//div[@class='product-list clearfix']//ul/li"):
            detailItem = SnbookDetail()
            detailItem['goodsName'] = product.xpath(
                ".//div[@class='title-selling-point']//a//text()").extract_first()
            detailItem['goodsImg'] = 'http:' + product.xpath(
                ".//div[@class='res-img']//img/@src").extract_first('')
            detailItem['goodsPingjia'] = product.xpath(
                ".//div[@class='evaluate-old clearfix']//a//i//text()").extract_first()
            detailItem['shopName'] = product.xpath(
                ".//div[@class='store-stock']//a//text()").extract_first()
            detailItem['goodsUrl'] = 'http:' + product.xpath(
                ".//div[@class='res-img']//a/@href").extract_first('')
            # goodsPrice is not set here: the 'price-box' div is filled in by
            # JavaScript, so it is empty in this response (see the note below)
            yield detailItem

        # Total number of pages for this sub-category
        match = re.search(r'共(\d+)页', response.text)
        if match:
            pagecount = int(match.group(1))
            # Take the second pagination link's URL and rewrite its page suffix
            next_url = response.xpath(
                "//div[@class='search-page page-fruits clearfix']//a[2]/@href").extract_first()
            if next_url is not None:
                suffix = re.search(r'-\d+\.html$', next_url).group()
                for i in range(2, pagecount):
                    nextpage_url = 'http://list.suning.com' + next_url.replace(
                        suffix, '-{}.html'.format(i))
                    yield scrapy.Request(
                        nextpage_url,
                        callback=self.parse_book_detail
                    )
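On the dynamic price itself: Scrapy never runs JavaScript, so the price has to come from wherever the page's own script fetches it. There are two usual routes: (1) open the browser's Network tab, find the XHR that returns the price JSON, and chain a second Request to that endpoint from parse_book_detail; or (2) render the page with scrapy-splash (SplashRequest) or a Selenium downloader middleware so the price node exists before the XPath runs. Below is a sketch of route (1); the endpoint URL, the ID extraction, and the 'price' key are all placeholders to be read out of devtools, not Suning's actual API:

import json
import re

import scrapy

# Placeholder template -- copy the real XHR URL from the Network tab
# (Suning loads prices with a separate request; this path is made up).
PRICE_API = 'https://pas.suning.com/<fill-in-from-devtools>/{goods_id}.html'

class SubookSpider(scrapy.Spider):
    # ... existing spider code; in parse_book_detail, replace
    # "yield detailItem" with "yield self.request_price(detailItem)" ...

    def request_price(self, detailItem):
        # Assumes goodsUrl ends in the numeric product id (an assumption)
        goods_id = re.search(r'(\d+)\.html', detailItem['goodsUrl']).group(1)
        return scrapy.Request(
            PRICE_API.format(goods_id=goods_id),
            callback=self.parse_price,
            meta={'item': detailItem},
        )

    def parse_price(self, response):
        item = response.meta['item']
        data = json.loads(response.text)        # shape depends on the real endpoint
        item['goodsPrice'] = data.get('price')  # 'price' is a hypothetical key
        yield item

If reverse-engineering the endpoint is too fiddly, scrapy-splash's SplashRequest (with a small wait so the page script runs) makes the original price-box XPath usable, at the cost of a much slower crawl.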