feng0945 posted on 2021-7-20 08:55

Scrapy can't download images through its built-in pipeline.


I've just started learning Scrapy from a tutorial, but after several days of digging I still can't find what's wrong: the images won't download through the pipeline. Could anyone point me in the right direction?
settings.py
import os
BOT_NAME = 'book'
SPIDER_MODULES = ['book.spiders']
NEWSPIDER_MODULE = 'book.spiders'
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
    'book.pipelines.Json': 10,
    'book.pipelines.MysqlTwistedPipeline': 20,
    'book.pipelines.BookPipeline': 300,
}
project_dir = os.path.dirname(os.path.abspath(__file__))
IMAGE_STORE = os.path.join(project_dir, 'images')


items.py

import scrapy

class BookItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

class Booktxt(scrapy.Item):
    book_name = scrapy.Field()
    book_author = scrapy.Field()
    book_state = scrapy.Field()
    book_synopsis = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
    book_image_path = scrapy.Field()
    book_url = scrapy.Field()
    book_url_id = scrapy.Field()
    chapter_name = scrapy.Field()
    chapter_content = scrapy.Field()


pipelines.py

import codecs
import json
from twisted.enterprise import adbapi
import MySQLdb


class BookPipeline(object):
    def process_item(self, item, spider):
      return item


class MysqlPipeline(object):
    def __init__(self):
      self.connection = MySQLdb.connect(host='127.0.0.1', user='root',
                                          passwd='', db='book', port=3306,
                                          charset='utf8', use_unicode=True)
      self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
      insert_sql = """
            insert into 23txt_book(book_name,book_author,book_state,book_synopsis,image_urls,book_url,book_url_id,
            chapter_name,chapter_content) values (%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE chapter_content= VALUES (chapter_content)
      """
      params = list()
      params.append(item['book_name'])
      params.append(item['book_author'])
      params.append(item['book_state'])
      book_synopsis = ','.join(item['book_synopsis'])
      params.append(book_synopsis)
      image_urls = ','.join(item['image_urls'])
      params.append(image_urls)
      params.append(item['book_url'])
      params.append(item['book_url_id'])
      params.append(item['chapter_name'])
      chapter_content = ','.join(item['chapter_content'])
      params.append(chapter_content)
      self.cursor.execute(insert_sql, tuple(params))
      self.connection.commit()
      return item


class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
      self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
      from MySQLdb.cursors import DictCursor
      dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=DictCursor,
            use_unicode=True,
      )
      dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
      return cls(dbpool)

    def process_item(self, item, spider):
      query = self.dbpool.runInteraction(self.do_insert, item)
      query.addErrback(self.handle_error, item, spider)  # handle exceptions

    def handle_error(self, failure, item, spider):
      print(failure)

    def do_insert(self, cursor, item):
      insert_sql = """
            insert into 23txt_book(book_name,book_author,book_state,book_synopsis,image_urls,book_url,book_url_id,
            chapter_name,chapter_content) values (%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE chapter_content=VALUES(chapter_content)
      """
      params = list()
      params.append(item['book_name'])
      params.append(item['book_author'])
      params.append(item['book_state'])
      book_synopsis = ','.join(item['book_synopsis'])
      params.append(book_synopsis)
      image_urls = ','.join(item['image_urls'])
      params.append(image_urls)
      params.append(item['book_url'])
      params.append(item['book_url_id'])
      params.append(item['chapter_name'])
      chapter_content = ','.join(item['chapter_content'])
      params.append(chapter_content)
      cursor.execute(insert_sql, tuple(params))


class Json(object):
    # custom JSON file export
    def __init__(self):
      self.file = codecs.open('article.json', 'a', encoding="utf-8")

    def process_item(self, item, spider):
      lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
      self.file.write(lines)
      return item

    def spider_closed(self, spider):
      self.file.close()


spiders.py

from urllib import parse
import scrapy
from scrapy import Request
from book.items import Booktxt
from book.utils.common import get_md5

class A23txtSpider(scrapy.Spider):
    name = '23txt'
    allowed_domains = ['www.23txt.com']
    start_urls = ['https://www.23txt.com/files/article/html/74/74743/']

    def parse(self, response):  # extract each chapter URL (crawl strategy) and dispatch to the corresponding callback
      book_image_urls = response.xpath('//*[@id="fmimg"]/img/@src').extract_first("")
      antics_item = Booktxt()
      antics_item['image_urls'] = [book_image_urls]
      antics_item['book_name'] = response.css('#info h1::text').extract_first("")
      antics_item['book_author'] = response.xpath('//*[@id="info"]/p/text()').extract_first("")
      antics_item['book_state'] = response.xpath('//*[@id="info"]/p/text()').extract_first("")
      antics_item['book_synopsis'] = response.xpath('//*[@id="intro"]/text()').extract_first("")
      book_urls = response.css('#list dd a::attr(href)').extract()[:1]
      for book_url in book_urls:
            yield Request(url=parse.urljoin(response.url, book_url), meta={'antics_item': antics_item},
                        callback=self.book_content)
    def book_content(self, response):
      antics_item = response.meta['antics_item']
      antics_item['book_url'] = response.url
      antics_item['book_url_id'] = get_md5(response.url)
      antics_item['chapter_name'] = response.css('.bookname h1::text').extract_first("")
      antics_item['chapter_content'] = response.css('.box_con #content::text').extract()
      yield antics_item

狐白本白 posted on 2021-7-20 10:08


Your pipelines are way too complicated; a few lines of SQL have been stretched into dozens of lines of code.
In the pipeline you can simply open the MySQL connection, create a cursor, run the SQL statement, close the cursor, and close the connection:

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

from HelloScrapy.items import MovieItem, CommentItem
import pymysql

class HelloscrapyPipeline:
    def process_item(self, item, spider):
      # user, password and database are placeholders; fill in your own connection settings
      conn = pymysql.Connect(user=user, password=password, host="127.0.0.1", port=3306, database=database, charset="utf8mb4")
      cursor = conn.cursor()
      sql = 'insert into ' + item.title_db + '(' + ','.join(item.keys()) + ') VALUES (' + ','.join(["%s"] * len(item.keys())) + ');'
      args = tuple(item.values())  # values in the same order as the item.keys() used in the SQL
      if isinstance(item,MovieItem):
            cursor.execute(sql,
                           args=args)
      elif isinstance(item,CommentItem):
            cursor.execute(sql,
                           args=args)
      conn.commit()

      cursor.close()

      conn.close()
      return item

And that's it.


Saving images is also simple:

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from HelloScrapy.items import SkinItem  # assuming SkinItem lives in the same items module as above

class ImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # loop over every image URL and download it; if the field passed in is not a
        # list, there is no need to loop, just yield the Request directly
        # for image_url in item['skin_url']:
        if isinstance(item, SkinItem):
            print('*' * 100)
            # print(image_url)
            yield Request(item['skin_url'])

Then set the image storage location in settings:
# image storage location
IMAGES_STORE = 'F:\ImageSpider'

feng0945 posted on 2021-7-20 11:38


In pipelines.py:

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from book.items import Booktxt

class ImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
      if isinstance(item, Booktxt):
            print('*' * 100)
            yield Request(item['image_urls'])
      return item

In settings.py:
ITEM_PIPELINES = {
    'book.pipelines.ImagePipeline': 2,
    'book.pipelines.Json': 10,
    'book.pipelines.MysqlTwistedPipeline': 20,
    'book.pipelines.BookPipeline': 300,
}
IMAGES_STORE = 'F:\ImageSpider'

2021-07-20 11:28:57 INFO: Scrapy 2.5.0 started (bot: book)
2021-07-20 11:28:57 INFO: Versions: lxml 4.6.3.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.2.0, Python 3.9.5 (tags/v3.9.5:0a7dcbd, May 3 2021, 17:27:52), pyOpenSSL 20.0.1 (OpenSSL 1.1.1k 25 Mar 2021), cryptography 3.4.7, Platform Windows-10-10.0.19042-SP0
2021-07-20 11:28:57 DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2021-07-20 11:28:57 INFO: Overridden settings:
{'BOT_NAME': 'book',
'NEWSPIDER_MODULE': 'book.spiders',
'SPIDER_MODULES': ['book.spiders']}
2021-07-20 11:28:57 INFO: Telnet Password: 1cbb37b43cb63097
2021-07-20 11:28:57 INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2021-07-20 11:28:57 INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2021-07-20 11:28:57 INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2021-07-20 11:28:57 WARNING: Disabled ImagesPipeline: ImagesPipeline requires installing Pillow 4.0.0 or later
2021-07-20 11:28:57 WARNING: Disabled ImagePipeline: ImagesPipeline requires installing Pillow 4.0.0 or later
2021-07-20 11:28:57 INFO: Enabled item pipelines:
['book.pipelines.Json',
'book.pipelines.MysqlTwistedPipeline',
'book.pipelines.BookPipeline']
2021-07-20 11:28:57 INFO: Spider opened
2021-07-20 11:28:57 INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2021-07-20 11:28:57 INFO: Telnet console listening on 127.0.0.1:6023
2021-07-20 11:28:58 DEBUG: Crawled (200) <GET https://www.23txt.com/files/article/html/74/74743/> (referer: None)
2021-07-20 11:28:58 DEBUG: Crawled (200) <GET https://www.23txt.com/files/article/html/74/74743/165165.html> (referer: https://www.23txt.com/files/article/html/74/74743/)
2021-07-20 11:28:58 DEBUG: Scraped from <200 https://www.23txt.com/files/article/html/74/74743/165165.html>
None
2021-07-20 11:28:58 INFO: Closing spider (finished)
2021-07-20 11:28:58 INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 554,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 17476,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'elapsed_time_seconds': 1.092532,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 7, 20, 3, 28, 58, 757887),
'httpcompression/response_bytes': 95593,
'httpcompression/response_count': 2,
'item_scraped_count': 1,
'log_count/DEBUG': 3,
'log_count/INFO': 10,
'log_count/WARNING': 2,
'request_depth_max': 1,
'response_received_count': 2,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2021, 7, 20, 3, 28, 57, 665355)}
2021-07-20 11:28:58 INFO: Spider closed (finished)

Process finished with exit code 0

I changed the code accordingly, but when I single-step through it the debugger never enters the download function, and running it doesn't download any images either.
I've uploaded the source code to Baidu Netdisk; please take a look and point me in the right direction.
Link: https://pan.baidu.com/s/17cHiEp1nyweJv9uGGsXobw
Extraction code: wj3r

QingYi. posted on 2021-7-20 12:43

feng0945 posted on 2021-7-20 11:38
In pipelines.py
class ImagePipeline(ImagesPipeline):
    def get_media_ ...

If you run into a problem, check the official documentation first:
https://docs.scrapy.org/en/latest/topics/media-pipeline.html
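
For reference, a minimal setup along the lines of that page might look like the sketch below. This is only an illustration, not the poster's project: CoverSpider and ImageItem are made-up names, and the URL and XPath are simply copied from the spider earlier in the thread. Also note the warnings in the log above ("ImagesPipeline requires installing Pillow 4.0.0 or later"): both image pipelines were disabled, so Pillow has to be installed before any image pipeline can run at all.

# Minimal standalone sketch of the built-in ImagesPipeline (needs: pip install scrapy Pillow).
import scrapy
from scrapy.crawler import CrawlerProcess


class ImageItem(scrapy.Item):
    image_urls = scrapy.Field()   # ImagesPipeline reads the URLs to download from this list
    images = scrapy.Field()       # and writes the download results back into this field


class CoverSpider(scrapy.Spider):
    name = 'cover'
    start_urls = ['https://www.23txt.com/files/article/html/74/74743/']
    custom_settings = {
        'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},
        'IMAGES_STORE': 'images',   # note the S: the setting is IMAGES_STORE, not IMAGE_STORE
        'ROBOTSTXT_OBEY': False,
    }

    def parse(self, response):
        cover = response.xpath('//*[@id="fmimg"]/img/@src').get()
        item = ImageItem()
        # image_urls must be a list, even when there is only a single cover image
        item['image_urls'] = [response.urljoin(cover)] if cover else []
        yield item


if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(CoverSpider)
    process.start()   # blocks until the crawl finishes; images end up under IMAGES_STORE/full/

If this standalone script saves the cover image under images/full/, the remaining difference is somewhere in the project's own settings or pipelines.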

feng0945 posted on 2021-7-20 14:12

@QingYi. I've already read it, and I followed what the docs say, but I still can't figure out where the mistake is.