I've just started learning Scrapy from a tutorial, but after several days of digging I still can't find where the mistake is: the images pipeline won't download any images. Could someone point me in the right direction?
settings.py
import os

BOT_NAME = 'book'

SPIDER_MODULES = ['book.spiders']
NEWSPIDER_MODULE = 'book.spiders'

ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
    'book.pipelines.Json': 10,
    'book.pipelines.MysqlTwistedPipeline': 20,
    'book.pipelines.BookPipeline': 300,
}

project_dir = os.path.dirname(os.path.abspath(__file__))
IMAGE_STORE = os.path.join(project_dir, 'images')
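For reference, the Scrapy documentation configures the built-in images pipeline with the setting names below; this is only a minimal sketch for comparison (the storage path is just an example, and the pipeline also needs Pillow installed):

# Reference sketch of the stock images pipeline configuration
# (setting names as documented by Scrapy; the directory is only an example).
import os

ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}

project_dir = os.path.dirname(os.path.abspath(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')  # directory the pipeline saves into
IMAGES_URLS_FIELD = 'image_urls'   # default: URLs are read from item['image_urls']
IMAGES_RESULT_FIELD = 'images'     # default: download results are written to item['images']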
items.py
import scrapy


class BookItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class Booktxt(scrapy.Item):
    book_name = scrapy.Field()
    book_author = scrapy.Field()
    book_state = scrapy.Field()
    book_synopsis = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
    book_image_path = scrapy.Field()
    book_url = scrapy.Field()
    book_url_id = scrapy.Field()
    chapter_name = scrapy.Field()
    chapter_content = scrapy.Field()
pipelines.py
import codecs
import json

from twisted.enterprise import adbapi
import MySQLdb


class BookPipeline(object):
    def process_item(self, item, spider):
        return item


class MysqlPipeline(object):
    def __init__(self):
        self.connection = MySQLdb.connect(host='127.0.0.1', user='root',
                                          passwd='', db='book', port=3306,
                                          charset='utf8', use_unicode=True)
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into 23txt_book(book_name, book_author, book_state, book_synopsis, image_urls,
                book_url, book_url_id, chapter_name, chapter_content)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE chapter_content = VALUES(chapter_content)
        """
        params = list()
        params.append(item['book_name'])
        params.append(item['book_author'])
        params.append(item['book_state'])
        book_synopsis = ','.join(item['book_synopsis'])
        params.append(book_synopsis)
        image_urls = ','.join(item['image_urls'])
        params.append(image_urls)
        params.append(item['book_url'])
        params.append(item['book_url_id'])
        params.append(item['chapter_name'])
        chapter_content = ','.join(item['chapter_content'])
        params.append(chapter_content)
        self.cursor.execute(insert_sql, tuple(params))
        self.connection.commit()
        return item


class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        from MySQLdb.cursors import DictCursor
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle insertion errors

    def handle_error(self, failure, item, spider):
        print(failure)

    def do_insert(self, cursor, item):
        insert_sql = """
            insert into 23txt_book(book_name, book_author, book_state, book_synopsis, image_urls,
                book_url, book_url_id, chapter_name, chapter_content)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE chapter_content = VALUES(chapter_content)
        """
        params = list()
        params.append(item['book_name'])
        params.append(item['book_author'])
        params.append(item['book_state'])
        book_synopsis = ','.join(item['book_synopsis'])
        params.append(book_synopsis)
        image_urls = ','.join(item['image_urls'])
        params.append(image_urls)
        params.append(item['book_url'])
        params.append(item['book_url_id'])
        params.append(item['chapter_name'])
        chapter_content = ','.join(item['chapter_content'])
        params.append(chapter_content)
        cursor.execute(insert_sql, tuple(params))


class Json(object):
    # custom export of items to a JSON file
    def __init__(self):
        self.file = codecs.open('article.json', 'a', encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def spider_closed(self, spider):
        self.file.close()
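The items.py above also defines a book_image_path field that nothing fills in. If the goal is to record where each cover image was saved, one common pattern (a sketch, not part of the original post; the class name BookImagesPipeline is made up here) is to subclass the stock ImagesPipeline, override item_completed, and register the subclass in ITEM_PIPELINES in place of scrapy.pipelines.images.ImagesPipeline:

from scrapy.pipelines.images import ImagesPipeline


class BookImagesPipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, info) tuples; for successful downloads
        # info['path'] is the stored file path relative to IMAGES_STORE.
        paths = [img['path'] for ok, img in results if ok]
        if paths:
            item['book_image_path'] = paths[0]
        return item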
spiders.py
from urllib import parse

import scrapy
from scrapy import Request

from book.items import Booktxt
from book.utils.common import get_md5


class A23txtSpider(scrapy.Spider):
    name = '23txt'
    allowed_domains = ['www.23txt.com']
    start_urls = ['https://www.23txt.com/files/article/html/74/74743/']

    def parse(self, response):
        # extract the book info and each chapter URL, then hand off to the chapter callback
        book_image_urls = response.xpath('//*[@id="fmimg"]/img/@src').extract_first("")
        antics_item = Booktxt()
        antics_item['image_urls'] = [book_image_urls]
        antics_item['book_name'] = response.css('#info h1::text').extract_first("")
        antics_item['book_author'] = response.xpath('//*[@id="info"]/p[1]/text()').extract_first("")
        antics_item['book_state'] = response.xpath('//*[@id="info"]/p[2]/text()[1]').extract_first("")
        antics_item['book_synopsis'] = response.xpath('//*[@id="intro"]/text()').extract_first("")
        book_urls = response.css('#list dd a::attr(href)').extract()[:1]  # only the first chapter link for now
        for book_url in book_urls:
            yield Request(url=parse.urljoin(response.url, book_url), meta={'antics_item': antics_item},
                          callback=self.book_content)

    def book_content(self, response):
        antics_item = response.meta['antics_item']
        antics_item['book_url'] = response.url
        antics_item['book_url_id'] = get_md5(response.url)
        antics_item['chapter_name'] = response.css('.bookname h1::text').extract_first("")
        antics_item['chapter_content'] = response.css('.box_con #content::text').extract()
        yield antics_item
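In case it helps reproduce the problem, the crawl can also be started programmatically with the project settings; this is just a convenience sketch equivalent to running scrapy crawl 23txt from the project root (the file name run_spider.py is hypothetical, not from the original post):

# run_spider.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('23txt')   # spider name from A23txtSpider.name
process.start()          # blocks until the crawl finishes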