First time successfully storing into MongoDB, plus some practice with CSS selectors along the way.
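The spider imports BiqugeItem from items.py, which isn't reproduced in this post; a minimal version, inferred from the three fields the spider fills below, would look like this:

import scrapy


class BiqugeItem(scrapy.Item):
    book_name = scrapy.Field()      # novel title
    book_charpter = scrapy.Field()  # chapter heading (spelling kept as used in the spider)
    book_content = scrapy.Field()   # cleaned chapter text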
import scrapy
from urllib import parse

from ..items import BiqugeItem


class BjgSpider(scrapy.Spider):
    name = 'bjg'
    allowed_domains = ['xbiquge.la']
    start_urls = ['https://www.xbiquge.la/xiaoshuodaquan/']

    def parse(self, response):
        # Collect the index-page URL of every novel.
        pages = response.css(".novellist a::attr(href)").extract()
        # XPath equivalent for grabbing all the novel names:
        # book_names = response.xpath('//div[@class="novellist"]//li/a/text()').extract()
        for page in pages:
            yield scrapy.Request(page, callback=self.parse_page)

    def parse_page(self, response):
        item = BiqugeItem()
        detail_urls = response.css("#list a::attr(href)").extract()
        item["book_name"] = response.css("#info ::text").extract()  # novel title
        for detail_page in detail_urls:
            # Build the absolute chapter URL from the current response's URL.
            # (Joining against response.url instead of an attribute set in
            # parse() avoids a race: requests run concurrently, so a shared
            # self.page could point at a different novel by the time this runs.)
            detail_page = parse.urljoin(response.url, detail_page)
            yield scrapy.Request(detail_page, callback=self.parse_detail, meta={"item": item})

    def parse_detail(self, response):
        # Copy the shared item so concurrent chapter callbacks don't clobber each other.
        item = response.meta["item"].copy()
        item["book_charpter"] = response.css(".bookname ::text").extract()  # chapter heading
        content = response.css("#content *::text").extract()  # chapter text as a list of fragments
        # Strip the whitespace and drop the empty strings: iterating over the
        # list once and rebuilding it as a new list is the cleanest fix.
        item["book_content"] = [x.strip() for x in content if x.strip()]
        yield item

A personal observation: the chapter URLs don't get scraped in order; the crawl jumps around. The CSS selector itself does return them in document order, though; it's Scrapy's concurrent request scheduling that makes the responses come back jumbled.
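If order matters downstream, one simple fix is to record each chapter's position when the request is created and sort on it later. A sketch under that assumption; the chapter_index key (and a matching field on BiqugeItem, or on the stored document) is hypothetical, not part of the original code:

        for index, detail_page in enumerate(detail_urls):
            yield scrapy.Request(
                parse.urljoin(response.url, detail_page),
                callback=self.parse_detail,
                # Carry the document-order position along with the item.
                meta={"item": item, "chapter_index": index},
            )

In parse_detail, read it back with response.meta["chapter_index"], store it on the item, and sort by it when querying MongoDB.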
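For completeness, since the title mentions MongoDB but the pipeline isn't shown here: a minimal pipeline sketch using pymongo. The settings keys (MONGO_URI, MONGO_DATABASE), database name, and collection name are assumptions, not taken from the original project:

import pymongo


class MongoPipeline:
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull connection details from settings.py (key names assumed).
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI", "mongodb://localhost:27017"),
            mongo_db=crawler.settings.get("MONGO_DATABASE", "biquge"),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # One MongoDB document per chapter.
        self.db["chapters"].insert_one(dict(item))
        return item

Enable it in settings.py via ITEM_PIPELINES, e.g. {"biquge.pipelines.MongoPipeline": 300}.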