Posted on 2018-8-7 11:40

Member ID application: zoudidong

1. Requested ID: zoudidong
2. Personal email: 18122928761@163.com
Technical article: Crawling tens of thousands of movies on Douban with the Scrapy framework + a proxy pool, storing the results in MongoDB
Environment: Windows + Python 2.7.*

First, let's take a look at the page to crawl:
https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0


Clearly, page_limit is the maximum number of movies shown, page_start is the offset, and tag is the movie category.

Then, by capturing the traffic, you can grab a pile of JSON data holding the movie information.
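A minimal sketch to inspect that JSON by hand (plain requests, same parameters as the URL above; the 'subjects' key reflects the response shape assumed here, so treat it as an assumption):

# -*- coding: utf-8 -*-
# Sketch: fetch one page of the JSON API directly to look at its shape.
# The tag/offset parameters are just the example values from the URL above.
import json
import requests

api = ('https://movie.douban.com/j/search_subjects'
       '?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0')
resp = requests.get(api)
data = json.loads(resp.text)
# Assumed payload shape: a dict whose "subjects" list carries title, rate, url, ...
for movie in data.get('subjects', [])[:3]:
    print movie['title'], movie['rate']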



With that, it gets easy; the basic idea is there: handle each category separately, open the page, read the JSON directly, and hand it to the pipeline for processing.
The spider code is roughly as follows:

# -*- coding: utf-8 -*-
import scrapy
from douban_movie.items import DoubanMovieItem
import time
import json

import sys

reload(sys)
sys.setdefaultencoding('utf-8')


class DoubanSpider(scrapy.Spider):
    name = 'douban'
    allowed_domains = ['movie.douban.com']
    start_urls = ['http://movie.douban.com/']

    page_count = 100000  # crawl page_count * 20 movie entries

    def start_requests(self):
        time.sleep(1)
        url = 'https://movie.douban.com/j/search_subjects?type=movie&tag={type}&sort=time&page_limit=20&page_start={count}'
        # movie category tags to crawl (example values)
        type_list = [u'热门', u'最新', u'经典', u'豆瓣高分']
        for t in type_list:
            self.movie_type = t
            for i in range(self.page_count):
                yield scrapy.Request(url.format(type=t, count=i * 20))

    def parse(self, response):
        # the API wraps the movie entries in a "subjects" list
        data_list = json.loads(response.text)['subjects']
        for data in data_list:
            items = DoubanMovieItem()
            items['title'] = data['title']
            items['rate'] = data['rate']
            # scrapy.Request(url=data['url'], callback=self.text_parse)
            # items['simple_text'] =
            items['type'] = self.movie_type
            yield items

#    def text_parse(self, response):
#        yield response.xpath('//div[@id="link-report"]/span/text()').extract_first().strip()




I only grab each movie's title, rating and category (I'm a bit lazy :))

The items file is as follows:
import scrapy


class DoubanMovieItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    type = scrapy.Field()
    title = scrapy.Field()
    rate = scrapy.Field()

The scraped data then goes into a pipeline; I save it straight into MongoDB.
import pymongo


class MongoPipeline(object):
    def __init__(self, mongoip, mongoport):
        self.mongoip = mongoip
        self.mongoport = mongoport

    def process_item(self, item, spider):
        self.db.movie_set.insert({u'电影名': item['title'],
                                  u'评分': item['rate'],
                                  u'电影类型': item['type']
                                  # u'内容简介': item['simple_text']
                                  })
        return item

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.get('MONGO_URL'),
                   crawler.settings.get('MONGO_PORT'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongoip, self.mongoport)
        self.db = self.client.douban
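
For the pipeline to actually run, settings.py has to register it and define the MONGO_URL / MONGO_PORT values that from_crawler reads. A minimal sketch (the module path and priority are assumptions based on the douban_movie project name):

# settings.py (sketch) -- enable the pipeline and supply the values it reads
ITEM_PIPELINES = {
    'douban_movie.pipelines.MongoPipeline': 300,   # assumed path and priority
}
MONGO_URL = '127.0.0.1'   # example host; point this at your MongoDB instance
MONGO_PORT = 27017        # default MongoDB port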


Is that all it takes?

Of course not... once you've crawled a few tens of thousands of movies, your host will get banned.
That's when we need to set up multiple proxies for access.

The downloader middleware code is as follows:
import socket

from scrapy import signals


class DoubanMovieDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self, pool_ip, pool_port):
        self.proxypool_ip = pool_ip
        self.proxypool_port = pool_port
        self.port = 23333

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls(pool_ip=crawler.settings.get('PROXY_SERVER_IP'),
                pool_port=crawler.settings.get('PROXY_SERVER_PORT'))
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain

        # Ask the proxy-pool UDP service for a fresh proxy, attach it to
        # the failed request, and retry it.
        self.sck.sendto('233', (self.proxypool_ip, self.proxypool_port))
        request.meta['proxy'] = self.sck.recv(1024)
        return request

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
        self.sck = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, 0)
        self.sck.bind(('127.0.0.1', self.port))
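
Likewise, the middleware has to be enabled in settings.py along with the PROXY_SERVER_IP / PROXY_SERVER_PORT values it reads. A minimal sketch (module path, priority and addresses are assumptions; the port matches the proxy-pool service shown below):

# settings.py (sketch) -- enable the middleware and point it at the proxy-pool UDP service
DOWNLOADER_MIDDLEWARES = {
    'douban_movie.middlewares.DoubanMovieDownloaderMiddleware': 543,   # assumed path/priority
}
PROXY_SERVER_IP = '127.0.0.1'   # example: proxy pool running on the same machine
PROXY_SERVER_PORT = 14382       # must match the port the proxy-pool script binds to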

My proxy pool gets proxy servers by scraping someone else's free-proxy site; of course, if you're rich you can just buy them :P
The proxy pool code is as follows:

# coding:UTF-8
import requests
# from multiprocessing import Pool,Process
import threading
import socket
from random import choice
from lxml import etree


proxy_list = []
port = 14382
ip = '127.0.0.1'
proxy_max = 2
max_page = 1000  # crawl roughly 1000 pages of proxy listings
proxies = {
    'http': 'http://{ip}:{port}',
    'https': 'http://{ip}:{port}'
}

headers = {
    'user-agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
}

url = 'https://www.kuaidaili.com/free/intr/%d/'
test_url = 'http://tool.chinaz.com'


def get_proxy_spider():
    page_num = 1
    response = requests.get(url=url % page_num, headers=headers)
    html = etree.HTML(response.text)
    Proxies_list = html.xpath('//tbody/tr')
    while len(proxy_list) <= proxy_max and page_num < max_page:
        thread_pool = []
        for p in Proxies_list:
            proxy_ip = p.xpath('td[@data-title="IP"]/text()')[0].strip()
            proxy_port = p.xpath('td[@data-title="PORT"]/text()')[0].strip()
            proxy = {'http': proxies['http'].format(ip=proxy_ip, port=proxy_port),
                     'https': proxies['https'].format(ip=proxy_ip, port=proxy_port)}
            thread_pool.append(threading.Thread(target=test_proxy, args=(proxy,)))
        # test this page's proxies concurrently, then clear the thread pool
        for t in thread_pool:
            t.start()
        for t in thread_pool:
            t.join()

        page_num += 1                               # get a new page
        next_response = requests.get(url=url % page_num, headers=headers)
        html = etree.HTML(next_response.text)
        Proxies_list = html.xpath('//tbody/tr')


def test_proxy(proxies):
    if len(proxy_list) >= proxy_max:
        return None
    try:
        requests.get(url=test_url, headers=headers, proxies=proxies, timeout=5.0)
        # print proxies, 'Can get url. Add to list...'
    except requests.RequestException:
        pass
    else:
        proxy_list.append(proxies['http'])


def start_service(proxylist):    # start a UDP service that hands out proxies on request
    try:
        sck = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, 0)
        sck.bind((ip, port))
        while True:
            data, src_addr = sck.recvfrom(4)
            if data == '233':
                sck.sendto(choice(proxylist), src_addr)
    except socket.error, e:
        print 'Server start failed...\n', e


if __name__ == '__main__':
    print 'Start thread to get...'
    get_proxy_spider()
    print 'Ok...'
    print proxy_list

    print 'Start UDP server....'
    print 'ip:%s,port:%s' % (ip, port)
    start_service(proxy_list)
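
Before hooking the pool into the middleware, you can query the UDP service by hand to make sure it answers. A minimal client sketch (host and port assume the defaults above):

# coding:UTF-8
# Sketch: ask the proxy-pool UDP service for one proxy, the same way the
# downloader middleware does. Host/port match the defaults above.
import socket

sck = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, 0)
sck.sendto('233', ('127.0.0.1', 14382))
proxy = sck.recv(1024)
print 'Got proxy:', proxy
sck.close()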






The scraped data looks like this (I'll only show a small part):
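
If you want to peek at what landed in MongoDB yourself, a quick sketch (database and collection names follow the pipeline above; connection values are examples):

# coding:UTF-8
# Sketch: print a few stored documents from the douban database / movie_set
# collection used by the pipeline above.
import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)   # example connection values
for doc in client.douban.movie_set.find().limit(5):
    print doc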


Thanks for reading my rough little technical article.

Hmily replied on 2018-8-7 16:14

Sorry, the application does not meet the requirements and is not approved. You can follow the forum's official WeChat account (吾爱破解论坛) and wait for the announcement of open registration.