import scrapy
from copy import deepcopy
class ZxSpider(scrapy.Spider):
    """Spider that walks zxcs.info from the navigation bar down to book pages.

    Flow: ``parse`` reads the category navigation and requests each
    sub-category listing; ``parse_book_list`` requests every book on a
    listing page and follows the pager; ``parse_book_detail`` adds the
    image and download links and yields the finished item.

    The item is a plain ``dict`` passed between callbacks through
    ``Request.meta["item"]``.  Keys written: ``b_cate``, ``b_href``,
    ``s_cate``, ``s_href``, ``book_name``, ``book_detail``, ``book_href``,
    ``book_img``, ``book_down_url``.  Any of the scraped values may be
    ``None`` when the corresponding XPath matches nothing.
    """

    name = 'zx'
    allowed_domains = ['zxcs.info']
    start_urls = ['http://zxcs.info/']

    @staticmethod
    def _absolute(response, href):
        """Resolve *href* against the response URL; pass ``None`` through."""
        if href is None:
            return None
        return response.urljoin(href)

    def parse(self, response):
        """Yield one listing request per sub-category entry in the nav bar.

        Each request carries its own copy of the category fields, because
        the same ``item`` dict is reused (and overwritten) for every
        sub-category of a top-level category.
        """
        # Top-level categories; the first <li> of the nav bar is skipped.
        for li in response.xpath('.//div[@id="nav"]/ul/li')[1:]:
            item = {}
            item["b_cate"] = li.xpath('./a//text()').extract_first()
            item["b_href"] = self._absolute(
                response, li.xpath('./a/@href').extract_first()
            )
            # Sub-categories of this top-level category.
            for li_2 in li.xpath('./ul/li'):
                item["s_cate"] = li_2.xpath('./a/text()').extract_first()
                item["s_href"] = self._absolute(
                    response, li_2.xpath('./a/@href').extract_first()
                )
                # Fall back to the top-level link when the sub-category
                # entry has no link of its own.
                target = item["s_href"]
                if target is None:
                    target = item["b_href"]
                if target is None:
                    # Neither link exists: nothing to request.
                    continue
                yield scrapy.Request(
                    target,
                    callback=self.parse_book_list,
                    meta={"item": deepcopy(item)},
                )

    def parse_book_list(self, response):
        """Yield a detail request per listed book, then follow the pager.

        Reads the category fields from ``response.meta["item"]`` and never
        mutates them, so paging requests carry category data only.
        """
        category = response.meta["item"]

        # Entries on the book list page.
        for dl in response.xpath('// div[@class="wrap"]/div[2]//dl'):
            item = deepcopy(category)
            item["book_name"] = dl.xpath('./dt/a/text()').extract_first()
            item["book_detail"] = dl.xpath('./dd/text()').extract_first()
            item["book_href"] = self._absolute(
                response, dl.xpath('./dt/a/@href').extract_first()
            )
            if item["book_href"] is None:
                # No detail link for this entry; a Request cannot be built.
                continue
            yield scrapy.Request(
                item["book_href"],
                callback=self.parse_book_detail,
                meta={"item": item},
            )

        # Pagination: follow every pager link instead of indexing a fixed
        # position in the pager.  Requests use the default dont_filter=False,
        # so Scrapy's duplicate filter drops pages already scheduled.
        for href in response.xpath('//div[@id="pagenavi"]/a/@href').extract():
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_book_list,
                meta={"item": deepcopy(category)},
            )

    def parse_book_detail(self, response):
        """Add ``book_img`` and ``book_down_url`` to the item and yield it.

        Both links are resolved against the page URL; a link that is not
        found is stored as ``None`` rather than raising.  The item is
        yielded so that item pipelines and feed exports receive it.
        """
        item = response.meta["item"]
        item["book_img"] = self._absolute(
            response,
            response.xpath('.//div[@id="content"]/div//a/@href').extract_first(),
        )
        item["book_down_url"] = self._absolute(
            response,
            response.xpath(
                '//*[@id="content"]/div[2]/div[2]/div[2]/p[1]/a/@href'
            ).extract_first(),
        )
        yield item