Scrapy can't download images through its built-in pipeline.
This post was last edited by feng0945 on 2021-7-20 09:47.
I just started learning Scrapy from a tutorial, but after several days of digging I still can't find the mistake: the pipeline won't download the images. Could someone please point me in the right direction?
settings.py
import os
BOT_NAME = 'book'
SPIDER_MODULES = ['book.spiders']
NEWSPIDER_MODULE = 'book.spiders'
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
    'book.pipelines.Json': 10,
    'book.pipelines.MysqlTwistedPipeline': 20,
    'book.pipelines.BookPipeline': 300,
}
project_dir = os.path.dirname(os.path.abspath(__file__))
IMAGE_STORE = os.path.join(project_dir, 'images')
items.py
import scrapy
class BookItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class Booktxt(scrapy.Item):
    book_name = scrapy.Field()
    book_author = scrapy.Field()
    book_state = scrapy.Field()
    book_synopsis = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
    book_image_path = scrapy.Field()
    book_url = scrapy.Field()
    book_url_id = scrapy.Field()
    chapter_name = scrapy.Field()
    chapter_content = scrapy.Field()
pipelines.py
import codecs
import json
from twisted.enterprise import adbapi
import MySQLdb
class BookPipeline(object):
    def process_item(self, item, spider):
        return item


class MysqlPipeline(object):
    def __init__(self):
        self.connection = MySQLdb.connect(host='127.0.0.1', user='root',
                                          passwd='', db='book', port=3306,
                                          charset='utf8', use_unicode=True)
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into 23txt_book(book_name,book_author,book_state,book_synopsis,image_urls,book_url,book_url_id,
            chapter_name,chapter_content) values (%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE chapter_content=VALUES(chapter_content)
        """
        params = list()
        params.append(item['book_name'])
        params.append(item['book_author'])
        params.append(item['book_state'])
        book_synopsis = ','.join(item['book_synopsis'])
        params.append(book_synopsis)
        image_urls = ','.join(item['image_urls'])
        params.append(image_urls)
        params.append(item['book_url'])
        params.append(item['book_url_id'])
        params.append(item['chapter_name'])
        chapter_content = ','.join(item['chapter_content'])
        params.append(chapter_content)
        self.cursor.execute(insert_sql, tuple(params))
        self.connection.commit()
        return item


class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        from MySQLdb.cursors import DictCursor
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions

    def handle_error(self, failure, item, spider):
        print(failure)

    def do_insert(self, cursor, item):
        insert_sql = """
            insert into 23txt_book(book_name,book_author,book_state,book_synopsis,image_urls,book_url,book_url_id,
            chapter_name,chapter_content) values (%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE chapter_content=VALUES(chapter_content)
        """
        params = list()
        params.append(item['book_name'])
        params.append(item['book_author'])
        params.append(item['book_state'])
        book_synopsis = ','.join(item['book_synopsis'])
        params.append(book_synopsis)
        image_urls = ','.join(item['image_urls'])
        params.append(image_urls)
        params.append(item['book_url'])
        params.append(item['book_url_id'])
        params.append(item['chapter_name'])
        chapter_content = ','.join(item['chapter_content'])
        params.append(chapter_content)
        cursor.execute(insert_sql, tuple(params))


class Json(object):
    # custom export of items to a JSON file
    def __init__(self):
        self.file = codecs.open('article.json', 'a', encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def spider_closed(self, spider):
        self.file.close()
spiders.py
from urllib import parse
import scrapy
from scrapy import Request
from book.items import Booktxt
from book.utils.common import get_md5
class A23txtSpider(scrapy.Spider):
    name = '23txt'
    allowed_domains = ['www.23txt.com']
    start_urls = ['https://www.23txt.com/files/article/html/74/74743/']

    def parse(self, response):  # extract each chapter URL (the crawl strategy) and dispatch it to the matching callback
        book_image_urls = response.xpath('//*[@id="fmimg"]/img/@src').extract_first("")
        antics_item = Booktxt()
        antics_item['image_urls'] = book_image_urls
        antics_item['book_name'] = response.css('#info h1::text').extract_first("")
        antics_item['book_author'] = response.xpath('//*[@id="info"]/p/text()').extract_first("")
        antics_item['book_state'] = response.xpath('//*[@id="info"]/p/text()').extract_first("")
        antics_item['book_synopsis'] = response.xpath('//*[@id="intro"]/text()').extract_first("")
        book_urls = response.css('#list dd a::attr(href)').extract()[:1]
        for book_url in book_urls:
            yield Request(url=parse.urljoin(response.url, book_url), meta={'antics_item': antics_item},
                          callback=self.book_content)

    def book_content(self, response):
        antics_item = response.meta['antics_item']
        antics_item['book_url'] = response.url
        antics_item['book_url_id'] = get_md5(response.url)
        antics_item['chapter_name'] = response.css('.bookname h1::text').extract_first("")
        antics_item['chapter_content'] = response.css('.box_con #content::text').extract()
        yield antics_item
This post was last edited by 狐白本白 on 2021-7-20 10:14.
Your pipelines are far too complicated; a few lines of SQL turned into dozens of lines of code.
In the pipeline you can simply open the MySQL connection, create a cursor, run the SQL, close the cursor, and close the connection.
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from HelloScrapy.items import MovieItem, CommentItem
import pymysql
class HelloscrapyPipeline:
    def process_item(self, item, spider):
        # user / password / database come from your own configuration
        conn = pymysql.Connect(user=user, password=password, host="127.0.0.1", port=3306, database=database, charset="utf8mb4")
        cursor = conn.cursor()
        # item.title_db is a custom attribute on the item class holding the target table name
        sql = 'insert into ' + item.title_db + '(' + ','.join(item.keys()) + ') VALUES (' + ','.join(["%s"] * len(item.keys())) + ');'
        args = tuple(item.values())
        if isinstance(item, MovieItem):
            cursor.execute(sql, args=args)
        elif isinstance(item, CommentItem):
            cursor.execute(sql, args=args)
        conn.commit()
        cursor.close()
        conn.close()
        return item
and that's it.
Saving images is also simple:

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

class ImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Loop over each image URL to download it; if the incoming field is not a
        # collection there is no need to loop, just yield the single Request.
        # for image_url in item['skin_url']:
        if isinstance(item, SkinItem):
            print('*' * 100)
            # print(image_url)
            yield Request(item['skin_url'])
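If you also want the saved file path written back onto the item, ImagesPipeline exposes an item_completed hook for exactly that (documented on the media-pipeline page). Here is a rough sketch only: the class name BookImagePipeline is made up, and the image_urls / book_image_path fields are taken from the Booktxt item in the original post.

from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

class BookImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # image_urls holds a single URL string here, as in the spider above
        yield Request(item['image_urls'])

    def item_completed(self, results, item, info):
        # results is a list of (success, file_info) tuples, one per request yielded
        # above; file_info['path'] is the saved path relative to IMAGES_STORE
        paths = [file_info['path'] for ok, file_info in results if ok]
        if not paths:
            raise DropItem("image download failed for this item")
        item['book_image_path'] = paths[0]
        return item

Register the subclass in ITEM_PIPELINES in place of the stock scrapy.pipelines.images.ImagesPipeline entry.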
Then set the image storage location in settings:
# image storage location
IMAGES_STORE = 'F:\ImageSpider'
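One small aside on that Windows path: backslashes start escape sequences in normal string literals, so a raw string is the safer way to write it, for example:

IMAGES_STORE = r'F:\ImageSpider'  # raw string, backslashes stay literal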
This post was last edited by feng0945 on 2021-7-20 11:42.
In pipelines.py:
class ImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        if isinstance(item, Booktxt):
            print('*' * 100)
            yield Request(item['image_urls'])
            return item
settings.py
ITEM_PIPELINES = {
    'book.pipelines.ImagePipeline': 2,
    'book.pipelines.Json': 10,
    'book.pipelines.MysqlTwistedPipeline': 20,
    'book.pipelines.BookPipeline': 300,
}
IMAGES_STORE = 'F:\ImageSpider'
2021-07-20 11:28:57 INFO: Scrapy 2.5.0 started (bot: book)
2021-07-20 11:28:57 INFO: Versions: lxml 4.6.3.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.2.0, Python 3.9.5 (tags/v3.9.5:0a7dcbd, May 3 2021, 17:27:52), pyOpenSSL 20.0.1 (OpenSSL 1.1.1k 25 Mar 2021), cryptography 3.4.7, Platform Windows-10-10.0.19042-SP0
2021-07-20 11:28:57 DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2021-07-20 11:28:57 INFO: Overridden settings:
{'BOT_NAME': 'book',
'NEWSPIDER_MODULE': 'book.spiders',
'SPIDER_MODULES': ['book.spiders']}
2021-07-20 11:28:57 INFO: Telnet Password: 1cbb37b43cb63097
2021-07-20 11:28:57 INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2021-07-20 11:28:57 INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2021-07-20 11:28:57 INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2021-07-20 11:28:57 WARNING: Disabled ImagesPipeline: ImagesPipeline requires installing Pillow 4.0.0 or later
2021-07-20 11:28:57 WARNING: Disabled ImagePipeline: ImagesPipeline requires installing Pillow 4.0.0 or later
2021-07-20 11:28:57 INFO: Enabled item pipelines:
['book.pipelines.Json',
'book.pipelines.MysqlTwistedPipeline',
'book.pipelines.BookPipeline']
2021-07-20 11:28:57 INFO: Spider opened
2021-07-20 11:28:57 INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2021-07-20 11:28:57 INFO: Telnet console listening on 127.0.0.1:6023
2021-07-20 11:28:58 DEBUG: Crawled (200) <GET https://www.23txt.com/files/article/html/74/74743/> (referer: None)
2021-07-20 11:28:58 DEBUG: Crawled (200) <GET https://www.23txt.com/files/article/html/74/74743/165165.html> (referer: https://www.23txt.com/files/article/html/74/74743/)
2021-07-20 11:28:58 DEBUG: Scraped from <200 https://www.23txt.com/files/article/html/74/74743/165165.html>
None
2021-07-20 11:28:58 INFO: Closing spider (finished)
2021-07-20 11:28:58 INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 554,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 17476,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'elapsed_time_seconds': 1.092532,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 7, 20, 3, 28, 58, 757887),
'httpcompression/response_bytes': 95593,
'httpcompression/response_count': 2,
'item_scraped_count': 1,
'log_count/DEBUG': 3,
'log_count/INFO': 10,
'log_count/WARNING': 2,
'request_depth_max': 1,
'response_received_count': 2,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2021, 7, 20, 3, 28, 57, 665355)}
2021-07-20 11:28:58 INFO: Spider closed (finished)
进程已结束,退出代码 0
I changed the code as suggested, but when single-stepping in the debugger it never enters the download function, and running the spider doesn't download any images either.
I've put the source code on Baidu Netdisk; could someone please take a look and point me in the right direction?
Link: https://pan.baidu.com/s/17cHiEp1nyweJv9uGGsXobw
Extraction code: wj3r
feng0945 posted on 2021-7-20 11:38:
In pipelines.py:
class ImagePipeline(ImagesPipeline):
def get_media_ ...

When you run into problems, read the official documentation first:
https://docs.scrapy.org/en/latest/topics/media-pipeline.html

@QingYi I have already read it and followed what the docs describe, but I still can't figure out where the mistake is.