import scrapy
from urllib import parse
from ..items import BiqugeItem
class BjgSpider(scrapy.Spider):
    """Crawl xbiquge.la: catalogue page -> book index pages -> chapter pages.

    Yields one BiqugeItem per chapter with the book name, chapter title,
    and the chapter's text fragments.
    """
    name = 'bjg'
    allowed_domains = ['xbiquge.la']
    start_urls = ['https://www.xbiquge.la/xiaoshuodaquan/']

    def parse(self, response):
        """Extract every book's index-page URL from the catalogue and follow it."""
        pages = response.css(".novellist a::attr(href)").extract()
        for page in pages:
            # BUGFIX: the original stored the current URL on self.page, but
            # Scrapy dispatches requests asynchronously, so by the time a
            # callback ran self.page could already point at a different book.
            # Callbacks now use response.url instead of shared spider state.
            yield scrapy.Request(page, callback=self.parse_page)

    def parse_page(self, response):
        """Extract the book name and follow every chapter link on a book page."""
        # Second text node under #info is the book title on this site's markup.
        book_name = response.css("#info ::text")[1].extract()
        detail_urls = response.css("#list a::attr(href)").extract()
        for detail_page in detail_urls:
            # Resolve possibly-relative chapter hrefs against the page we are on.
            detail_url = parse.urljoin(response.url, detail_page)
            # BUGFIX: pass only the immutable book name. The original shared a
            # single mutable item via meta across all chapter requests, so
            # concurrent parse_detail calls overwrote each other's fields.
            yield scrapy.Request(
                detail_url,
                callback=self.parse_detail,
                meta={"book_name": book_name},
            )

    def parse_detail(self, response):
        """Build and yield one item for a single chapter page."""
        item = BiqugeItem()
        item["book_name"] = response.meta["book_name"]
        # Second text node under .bookname is the chapter title.
        item["book_charpter"] = response.css(".bookname ::text")[1].extract()
        content = response.css("#content *::text").extract()
        # Strip surrounding whitespace from every extracted text fragment.
        item["book_content"] = [x.strip() for x in content]
        yield item