#原创//多线程爬去某漫画网站的漫画

.Net_破解 · 发表于 2019-1-27 11:07

#coding:utf-8
import urllib.request as ub
import urllib.parse as parse
import http.cookiejar as cjar
import re
from lxml import etree
from queue import Queue
import threading
import os
import json

#获取数据的装饰器函数
def get_body(func):
def wrapper(*args, **kwargs):
      for i in range(0, 3):
         try:
            html = func(*args, **kwargs)
         except Exception as e:
            print("error:{},url:{}".format(str(e),args[1]))
            if i == 2:
                  i = 3
            continue
         else:
            return html
      if i == 3:
         return False
return wrapper
#获取章节中图片地址的线程类
class chaptersThread(threading.Thread):
def __init__(self,chaptersQ,ImagesUrlQ,threadName,spider):
      super(chaptersThread,self).__init__()
      self.chaptersQ = chaptersQ
      self.threadName = threadName
      self.ImagesUrlQ = ImagesUrlQ
      self.spider = spider
def run(self):
      print("{}:线程正在工作".format(self.threadName))
      while not CHAPTERS_EXIT:
         chapterTuple = self.chaptersQ.get(False)
         title = chapterTuple[0]
         url = chapterTuple[1]
         url = self.spider.index + url[1:]
         html = self.spider.get_data(url) #返回未处理的html
         if html:
            html = re.sub('\xa9|\xbb', '', html.decode(encoding="utf-8", errors="igonre"))
            imagesUrl=self.parseUrl(html)
            self.ImagesUrlQ.put({title:imagesUrl})
         else:
            print("获取失败{}".format(url))
      print("{}:{}完成工作".format(self.threadName,self.name))
def parseUrl(self,html):
      imagesUrl=[]
      compile = re.compile("chapterImages = (\[.*?\])")
      compile2 = re.compile('chapterPath = "(.*?)"')
      images = json.loads(compile.findall(html)[0])
      url = compile2.findall(html)[0]
      for image in images:
         imagesUrl.append("http://res.gufengmh.com/" + url + image)
      return  imagesUrl
#获取每一章节中的图片的线程类
class ImagesUrlThread(threading.Thread):
def __init__(self,ImagesUrlQ,threadName,spider):
      super(ImagesUrlThread,self).__init__()
      self.ImagesUrlQ = ImagesUrlQ
      self.threadName = threadName
      self.spider = spider
def run(self):
      print("{}:线程正在工作".format(self.threadName))
      while not IMAGESURL_EXIT:
         chapter = self.ImagesUrlQ.get(False)
         title = list(chapter.keys())[0]
         images = chapter[title]#list类型
         try:
            os.mkdir(os.path.join(os.getcwd(),"manhua/"+title))
         except Exception as e:
            print("error:{}".format(str(e)))
         for i in range(len(images)):
            url = images
            imagesIo = self.spider.get_data(url)
            if imagesIo:
                  save_path = "manhua/" + title + "/" + str(i) + ".jpg"
                  with open(os.path.join(os.getcwd(), save_path), "bw") as file:
                     try:
                        file.write(imagesIo)
                     except:
                        pass
            else:
                  global ErrorQ
                  ErrorQ.put({"title":title,"page":str(i),"url":url})
                  print("章节:{},第{}页,url:{},获取失败".format(title,str(i),url))
      print("获取完成{}".format(self.threadName))
#用来从新获取获取失败的章节
class ErrorUrlThrad(threading.Thread):
def __init__(self,ErrorQ,tName):
      super(ErrorUrlThrad,self).__init__()
      self.ErrorQ = ErrorQ
      self.spider = MSpider()
      self.threadName = tName
def run(self):
      print("{}:线程正在工作".format(self.threadName))
      while not ERRORU_EXIT:
         error_dict = self.ErrorQ.get(False)
         title = error_dict["title"]
         page =error_dict["page"]
         url = error_dict["url"]

         imageIo = self.spider.get_data(url)
         if imageIo:
            with open(os.path.join(os.getcwd(), "manhua/" + title + "/" + page + ".jpg"),"wb") as f:
                  f.write(imageIo)
         else:
            print("章节:{},第{}页,url:{},重新获取失败".format(title, page, url))

class MSpider(object):
def __init__(self,index=''):
      self.cookie = cjar.CookieJar()
      self.opener = ub.build_opener(ub.HTTPCookieProcessor(self.cookie))
      self.opener.addheaders = [("User-Agent"," Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36")]
      self.index = index #漫画首页
def get_cookie(self):
      try:
         self.opener.open(self.index,timeout=10)
      except Exception as e:
         print(str(e))
def search_api(self,name):
      if isinstance(name,str):
         data_dict = {
            'keywords': name
         }
      else:
         print("传入的name不是字符串")
         return 0
      data = parse.urlencode(data_dict)
      url = "http://www.gufengmh.com/search/?" + data
      response = self.opener.open(url)
      html= response.read().decode("gbk", "ignore")
      html_xpath = etree.HTML(html)
      try:
         index =  html_xpath.xpath('//*[@id="contList"]/li/a/@href')[0]#漫画首页
         print("漫画首页：{}".format(index))
      except:
         index = ''
      return index
#获取章节
def get_chapter(self,index):
      response = self.opener.open(index)
      html = response.read()
      html = re.sub('\xa9|\xbb','',html.decode(encoding="utf-8",errors="igonre"))#删除特殊字符
      html_xpath = etree.HTML(html)
      chapters_href = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/@href')
      chapters_title = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/span/text()')
      print(chapters_href)
      print(chapters_title)
      if len(chapters_title)==len(chapters_href):
         chapters = Queue()
         for i in range(len(chapters_title)):
            chapters.put((chapters_title,chapters_href))
      return chapters

#开始获取章节的函数
def get_oneChapter(self,chaptersQ):
      ImagesUrlQ=Queue()
      tNames = ["cps1","cps2","cps3","cps4","cps5","cps6","cps7","cps8","cps9","cps10"]
      cpts =[]#存储章节爬去线程
      for tName in tNames:
         cpt = chaptersThread(chaptersQ,ImagesUrlQ,tName,self)#创建进程
         cpt.start()
         cpts.append(cpt)
      while not chaptersQ.empty():
         pass

      global CHAPTERS_EXIT
      CHAPTERS_EXIT = True

      for cpt in cpts:
         cpt.join()
      print("章节获取完成")
      print(ImagesUrlQ.qsize())

      Imuts = []#image 获取线程
      t2Names = ["IMUs1","IMUs2","IMUs3","IMUs4","IMUs5","IMUs6","IMUs7","IMUs8","IMUs9","IMUs10"]
      for tName in t2Names:
         Imut = ImagesUrlThread(ImagesUrlQ, tName, self)  # 创建进程
         Imut.start()
         Imuts.append(Imut)
      while not ImagesUrlQ.empty():
         pass
      global IMAGESURL_EXIT
      IMAGESURL_EXIT = True
      for Imut in Imuts:
         Imut.join()
      print("全部获取完成")
@get_body
def get_data(self,*args,**kwargs):
      return self.opener.open(args[0],timeout=30).read()#args[0]=url变量
CHAPTERS_EXIT = False
IMAGESURL_EXIT=False
ERRORU_EXIT=False
error_num=0
ErrorQ =Queue()#获取错误的url的队列
def main():
manhuaName = input("请输入你想搜素的漫画名:")
try:
      os.mkdir("manhua")
except Exception as e:
      print(str(e))

index = "http://www.gufengmh.com/"
spider = MSpider(index)
spider.get_cookie()
index = spider.search_api(manhuaName)
if index:
      chapters = spider.get_chapter(index)
      spider.get_oneChapter(chapters)

      if not ErrorQ.empty():
         errorTnames = ["error1","error2","error3"]
         eThreads = []
         for tname in errorTnames:
            eThread = ErrorUrlThrad(ErrorQ,tname)
            eThread.start()
            eThreads.append(eThread)
         while not ErrorQ.empty():
            pass
         #等待线程结束
         for t in eThreads:
            t.join()
else:
      print("------------漫画不存在-----------")
      exit(1)
if __name__ == '__main__':
main()
温馨提示：代码为python3.7版本为兼容2.7 请见谅库不好引。。。。。。。。。
需要爬虫的可以找小弟嘻嘻。。。

Boxhunter · 发表于 2019-1-27 11:17

厉害，谢谢大佬

shuimuyi · 发表于 2019-1-27 11:24

正好在学爬虫仔细学习学习大佬的代码

hedy16 · 发表于 2019-1-27 11:47

这个不错，喜欢看漫画不一定知道这个怎么弄搞一串代码

帐号		自动登录	找回密码
密码			注册[Register]

[Python 转载] #原创//多线程爬去某漫画网站的漫画

免费评分