小白每日一爬 CrawlSpider练习 (发现正则不会写)
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class ZxSpider(CrawlSpider):
    """Crawl zzxrjc.com and scrape basic metadata from its book pages.

    Category listing pages (``/sort/<id>/``) are followed to discover more
    links, while book pages (``/txt/<id>/``) are passed to ``parse_item``.

    Author's note from the original post: scraping Zhixuan Cangshu
    (zzxrjc.com) this way took far less code than the previous post's
    spider -- Rules are really handy.
    """

    name = 'zx'
    allowed_domains = ['zzxrjc.com']
    start_urls = ['http://zzxrjc.com/']

    rules = (
        # Book detail pages: extract an item from each one.
        Rule(LinkExtractor(allow=r'(/txt/\d+/)'), callback='parse_item'),
        # Category listing pages: no callback, only keep following links.
        Rule(LinkExtractor(allow=r'(/sort/\d+/)'), follow=True),
    )

    def parse_item(self, response):
        """Build one item dict from a book detail page.

        Args:
            response: The downloaded page for a URL matched by the
                ``/txt/<id>/`` rule.

        Returns:
            dict: ``book_img`` (cover image ``src``), ``book_name`` (cover
            image ``alt`` text) and ``book_detail`` (the first text node
            found under the ``#info`` div). Each value is whatever
            ``extract_first()`` yields for its XPath.
        """
        cover_xpath = '//div[@class="novel_info_main"]/img'
        item = {
            'book_img': response.xpath(cover_xpath + '/@src').extract_first(),
            'book_name': response.xpath(cover_xpath + '/@alt').extract_first(),
            # Only the first text node is kept, not the full description.
            'book_detail': response.xpath(
                '//div[@id="info"]/div//text()'
            ).extract_first(),
        }
        # Echo the scraped item to stdout for quick inspection while crawling.
        print(item)
        return item
页:
[1]