import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class KsSpider(CrawlSpider):
    """Crawl the listing pages on sxcnw.net and scrape one item per book page.

    Crawling starts at the first "xuanhuan" list page. Two rules drive it:
    links inside the ``listcon`` list are fetched and handed to
    ``parse_item``; links inside the ``showpage`` block are followed so the
    same rules are applied again to the pages they lead to.
    """

    name = 'ks'
    allowed_domains = ['sxcnw.net']
    start_urls = ['http://www.sxcnw.net/xuanhuan/List_1.html']

    rules = (
        # Stop the XPath at the last element (here the <a> tag) and do NOT
        # continue down to the URL yourself (e.g. with /@href): the link
        # extractor pulls the URL out of that final tag automatically.
        # (This pitfall cost the original author a lot of time.)
        Rule(
            LinkExtractor(restrict_xpaths='//ul[@class="listcon"]/li/a'),
            callback='parse_item',
        ),
        # No callback here: these links are only followed. position()>1
        # skips the first <a> in the block.
        # NOTE(review): "showpage" is presumably the pagination bar -- confirm
        # against the site's markup.
        Rule(
            LinkExtractor(restrict_xpaths='//div[@class="showpage"]/a[position()>1]'),
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Build the scraped item for one page matched by the first rule.

        Args:
            response: The Scrapy response for the fetched page.

        Returns:
            A dict with three keys:
            ``book_name`` -- list of text nodes of the title ``<h1>``;
            ``content``   -- list of every text node under the
            ``about-txt`` div;
            ``book_down`` -- the first matching ``href`` under the
            ``dl-to-pc`` div, or ``None`` when nothing matches.
        """
        item = {}
        item['book_name'] = response.xpath('//div[@class="book-title clear"]/h1/text()').extract()
        item['content'] = response.xpath('//div[@class="about-txt"]//text()').extract()
        item['book_down'] = response.xpath('//div[@class="dl-to-pc"]/a/@href').extract_first()
        # Use the spider's logger instead of print() so the output honours
        # the configured log level and log destination.
        self.logger.debug('Scraped item: %s', item)
        return item
|