#原创//多线程爬取某漫画网站的漫画

.Net_破解 发表于 2019-1-27 11:07

#coding:utf-8
import functools
import http.cookiejar as cjar
import json
import os
import re
import threading
import time
import urllib.parse as parse
import urllib.request as ub
from queue import Empty, Queue

from lxml import etree

#获取数据的装饰器函数
def get_body(func):
    """Decorate a fetch function so that failures are retried.

    The wrapped callable is invoked up to three times. The first result
    obtained without an exception is returned as-is. Every exception is
    printed together with the URL, and ``False`` is returned once all three
    attempts have failed.

    Args:
        func: The callable to wrap. When it is called with at least two
            positional arguments, the second one is reported as the URL in
            error messages (this matches ``get_data(self, url)``).

    Returns:
        The wrapping function.
    """
    max_attempts = 3

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Look the URL up defensively so that error reporting itself cannot
        # raise IndexError when fewer than two positionals were passed.
        url = args[1] if len(args) > 1 else None
        for _ in range(max_attempts):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print("error:{},url:{}".format(str(e), url))
        return False

    return wrapper
#获取章节中图片地址的线程类
class chaptersThread(threading.Thread):
    """Worker thread that resolves chapter pages into lists of image URLs.

    Items taken from ``chaptersQ`` are ``(title, href)`` pairs. For each one
    the chapter page is downloaded through ``spider.get_data`` and a
    ``{title: [image_url, ...]}`` dict is put on ``ImagesUrlQ``.
    """

    # Patterns for the ``chapterImages = [...]`` and ``chapterPath = "..."``
    # assignments found in the chapter page source (presumably inline
    # JavaScript; confirm against the live site).
    _IMAGES_RE = re.compile(r"chapterImages = (\[.*?\])")
    _PATH_RE = re.compile(r'chapterPath = "(.*?)"')

    def __init__(self, chaptersQ, ImagesUrlQ, threadName, spider):
        """Store the input queue, the output queue, a display name and the spider."""
        super(chaptersThread, self).__init__()
        self.chaptersQ = chaptersQ
        self.threadName = threadName
        self.ImagesUrlQ = ImagesUrlQ
        self.spider = spider

    def run(self):
        """Consume chapters until the queue is empty or CHAPTERS_EXIT is set."""
        print("{}:线程正在工作".format(self.threadName))
        while not CHAPTERS_EXIT:
            try:
                chapterTuple = self.chaptersQ.get(False)
            except Empty:
                # Nothing left to do; leave the loop instead of letting the
                # exception kill the thread.
                break
            title = chapterTuple[0]
            # Drop the first character of the href (presumably a leading
            # "/") before appending it to the site root.
            url = self.spider.index + chapterTuple[1][1:]
            html = self.spider.get_data(url)  # raw bytes, or a falsy value on failure
            if not html:
                print("获取失败{}".format(url))
                continue
            html = re.sub('\xa9|\xbb', '', html.decode(encoding="utf-8", errors="ignore"))
            try:
                imagesUrl = self.parseUrl(html)
            except (IndexError, ValueError) as e:
                # The page did not contain the expected assignments, or the
                # image list was not valid JSON; skip this chapter.
                print("error:{},url:{}".format(str(e), url))
                continue
            self.ImagesUrlQ.put({title: imagesUrl})
        print("{}:{}完成工作".format(self.threadName, self.name))

    def parseUrl(self, html):
        """Return the absolute image URLs described by a chapter page.

        Args:
            html: Decoded page source.

        Returns:
            A list of URLs built as the resource host, then the chapter
            path, then each image file name.

        Raises:
            IndexError: If either assignment is missing from ``html``.
            ValueError: If the image list is not valid JSON.
        """
        images = json.loads(self._IMAGES_RE.findall(html)[0])
        path = self._PATH_RE.findall(html)[0]
        return ["http://res.gufengmh.com/" + path + image for image in images]
#获取每一章节中的图片的线程类
class ImagesUrlThread(threading.Thread):
    """Worker thread that downloads every image of a chapter to disk.

    Items taken from ``ImagesUrlQ`` are ``{title: [image_url, ...]}`` dicts.
    Images are written to ``manhua/<title>/<page index>.jpg`` under the
    current working directory. Pages that cannot be downloaded are recorded
    on the module-level ``ErrorQ``.
    """

    def __init__(self, ImagesUrlQ, threadName, spider):
        """Store the input queue, a display name and the spider used to fetch."""
        super(ImagesUrlThread, self).__init__()
        self.ImagesUrlQ = ImagesUrlQ
        self.threadName = threadName
        self.spider = spider

    def run(self):
        """Consume chapters until the queue is empty or IMAGESURL_EXIT is set."""
        print("{}:线程正在工作".format(self.threadName))
        while not IMAGESURL_EXIT:
            try:
                chapter = self.ImagesUrlQ.get(False)
            except Empty:
                # Nothing left to do; leave the loop instead of letting the
                # exception kill the thread.
                break
            # Only the first key is used, as in the original code.
            title = next(iter(chapter))
            images = chapter[title]  # list of image URLs
            chapter_dir = os.path.join(os.getcwd(), "manhua", title)
            try:
                os.makedirs(chapter_dir, exist_ok=True)
            except OSError as e:
                # Without the directory no page can be saved; skip the chapter.
                print("error:{}".format(str(e)))
                continue
            for i, url in enumerate(images):
                imagesIo = self.spider.get_data(url)
                if imagesIo:
                    save_path = os.path.join(chapter_dir, str(i) + ".jpg")
                    try:
                        with open(save_path, "wb") as file:
                            file.write(imagesIo)
                    except OSError as e:
                        print("error:{}".format(str(e)))
                else:
                    # Remember the failed page so it can be retried later.
                    ErrorQ.put({"title": title, "page": str(i), "url": url})
                    print("章节:{},第{}页,url:{},获取失败".format(title, str(i), url))
        print("获取完成{}".format(self.threadName))
#用来从新获取获取失败的章节
class ErrorUrlThrad(threading.Thread):
    """Worker thread that retries image downloads recorded as failed.

    Items taken from ``ErrorQ`` are dicts with the keys ``"title"``,
    ``"page"`` and ``"url"``. A successful retry is written to
    ``manhua/<title>/<page>.jpg`` under the current working directory.
    """

    def __init__(self, ErrorQ, tName, spider=None):
        """Store the queue and display name.

        Args:
            ErrorQ: Queue of failed-download records.
            tName: Display name used in log output.
            spider: Object providing ``get_data(url)``. A fresh ``MSpider``
                is created when omitted.
        """
        super(ErrorUrlThrad, self).__init__()
        self.ErrorQ = ErrorQ
        self.spider = MSpider() if spider is None else spider
        self.threadName = tName

    def run(self):
        """Retry queued downloads until the queue is empty or ERRORU_EXIT is set."""
        print("{}:线程正在工作".format(self.threadName))
        while not ERRORU_EXIT:
            try:
                error_dict = self.ErrorQ.get(False)
            except Empty:
                # Nothing left to retry; leave the loop instead of letting
                # the exception kill the thread.
                break
            title = error_dict["title"]
            page = error_dict["page"]
            url = error_dict["url"]

            imageIo = self.spider.get_data(url)
            if imageIo:
                target = os.path.join(os.getcwd(), "manhua", title, page + ".jpg")
                try:
                    with open(target, "wb") as f:
                        f.write(imageIo)
                except OSError as e:
                    print("error:{}".format(str(e)))
            else:
                print("章节:{},第{}页,url:{},重新获取失败".format(title, page, url))

class MSpider(object):
    """HTTP client and crawl driver for the comic site.

    Holds a cookie-aware ``urllib`` opener with a browser User-Agent and the
    site root URL (``index``) that chapter hrefs are appended to.
    """

    def __init__(self, index=''):
        """Build the opener.

        Args:
            index: Site root URL, kept on ``self.index``.
        """
        self.cookie = cjar.CookieJar()
        self.opener = ub.build_opener(ub.HTTPCookieProcessor(self.cookie))
        self.opener.addheaders = [("User-Agent"," Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36")]
        self.index = index  # site root URL

    def get_cookie(self):
        """Request ``self.index`` once so the cookie jar is populated.

        Any error is printed and otherwise ignored.
        """
        try:
            self.opener.open(self.index, timeout=10).close()
        except Exception as e:
            print(str(e))

    def search_api(self, name):
        """Search the site for a comic and return the first result's URL.

        Args:
            name: Comic name to search for.

        Returns:
            The href of the first search hit, ``''`` when there is no hit,
            or ``0`` when ``name`` is not a string.

        Raises:
            Network errors raised while opening the search URL are not
            caught here.
        """
        if not isinstance(name, str):
            print("传入的name不是字符串")
            return 0
        data = parse.urlencode({'keywords': name})
        url = "http://www.gufengmh.com/search/?" + data
        with self.opener.open(url, timeout=30) as response:
            html = response.read().decode("gbk", "ignore")
        html_xpath = etree.HTML(html)
        if html_xpath is None:
            return ''
        hrefs = html_xpath.xpath('//*[@id="contList"]/li/a/@href')
        if not hrefs:
            return ''
        index = hrefs[0]  # landing page of the comic
        print("漫画首页：{}".format(index))
        return index

    def get_chapter(self, index):
        """Collect the chapters listed on a comic's landing page.

        Args:
            index: URL of the comic's landing page.

        Returns:
            A ``Queue`` of ``(title, href)`` tuples, one per chapter. The
            queue is empty when the numbers of titles and links found on
            the page do not match.
        """
        with self.opener.open(index, timeout=30) as response:
            html = response.read()
        # Strip the special characters U+00A9 and U+00BB before parsing.
        html = re.sub('\xa9|\xbb', '', html.decode(encoding="utf-8", errors="ignore"))
        html_xpath = etree.HTML(html)
        chapters_href = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/@href')
        chapters_title = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/span/text()')
        print(chapters_href)
        print(chapters_title)
        chapters = Queue()
        if len(chapters_title) == len(chapters_href):
            for title, href in zip(chapters_title, chapters_href):
                chapters.put((title, href))
        return chapters

    def get_oneChapter(self, chaptersQ):
        """Download every chapter in ``chaptersQ`` using two thread pools.

        Ten ``chaptersThread`` workers first resolve the chapters into image
        URL lists. After they finish, ten ``ImagesUrlThread`` workers
        download the images.

        Args:
            chaptersQ: Queue of ``(title, href)`` tuples.
        """
        global CHAPTERS_EXIT, IMAGESURL_EXIT
        # Clear the stop flags so the method also works on a repeated call.
        CHAPTERS_EXIT = False
        IMAGESURL_EXIT = False

        ImagesUrlQ = Queue()
        cpts = []  # chapter-resolving threads
        for tName in ["cps{}".format(n) for n in range(1, 11)]:
            cpt = chaptersThread(chaptersQ, ImagesUrlQ, tName, self)
            cpt.start()
            cpts.append(cpt)
        # Sleep between checks instead of spinning on the queue.
        while not chaptersQ.empty():
            time.sleep(0.1)
        CHAPTERS_EXIT = True
        for cpt in cpts:
            cpt.join()
        print("章节获取完成")
        print(ImagesUrlQ.qsize())

        Imuts = []  # image-downloading threads
        for tName in ["IMUs{}".format(n) for n in range(1, 11)]:
            Imut = ImagesUrlThread(ImagesUrlQ, tName, self)
            Imut.start()
            Imuts.append(Imut)
        while not ImagesUrlQ.empty():
            time.sleep(0.1)
        IMAGESURL_EXIT = True
        for Imut in Imuts:
            Imut.join()
        print("全部获取完成")

    @get_body
    def get_data(self, *args, **kwargs):
        """Return the raw response body of the URL given as ``args[0]``.

        Wrapped by ``get_body``, so a failing request is retried by that
        decorator.
        """
        with self.opener.open(args[0], timeout=30) as response:
            return response.read()
# Module-level state shared between the worker threads and the driver code.
# The three *_EXIT flags are polled by the workers' ``while not ...`` loops.
CHAPTERS_EXIT = False  # polled by chaptersThread.run
IMAGESURL_EXIT=False  # polled by ImagesUrlThread.run
ERRORU_EXIT=False  # polled by ErrorUrlThrad.run
error_num=0  # NOTE(review): not referenced by any code visible in this file
ErrorQ =Queue()#queue of image downloads that failed (dicts with "title", "page", "url")
def main():
    """Prompt for a comic name, then download all of its chapters.

    Images are stored under ``manhua/`` in the current working directory.
    Downloads recorded on ``ErrorQ`` are retried once by three
    ``ErrorUrlThrad`` workers.

    Raises:
        SystemExit: With status 1 when the search yields no comic.
    """
    manhuaName = input("请输入你想搜素的漫画名:")
    try:
        # exist_ok avoids reporting an error on every rerun.
        os.makedirs("manhua", exist_ok=True)
    except OSError as e:
        print(str(e))

    index = "http://www.gufengmh.com/"
    spider = MSpider(index)
    spider.get_cookie()
    index = spider.search_api(manhuaName)
    if not index:
        print("------------漫画不存在-----------")
        # Raise SystemExit directly: the ``exit`` helper is only injected
        # by the ``site`` module and is not guaranteed to exist.
        raise SystemExit(1)

    chapters = spider.get_chapter(index)
    spider.get_oneChapter(chapters)

    if not ErrorQ.empty():
        eThreads = []
        for tname in ["error1", "error2", "error3"]:
            eThread = ErrorUrlThrad(ErrorQ, tname)
            eThread.start()
            eThreads.append(eThread)
        # The retry workers stop once the queue is empty, so joining them is
        # sufficient; no busy-wait on the queue is needed.
        for t in eThreads:
            t.join()
# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()
温馨提示：代码为 Python 3.7 版本，未兼容 2.7，请见谅，库不好引……
需要爬虫的可以找小弟嘻嘻。。。

Boxhunter 发表于 2019-1-27 11:17

厉害，谢谢大佬

shuimuyi 发表于 2019-1-27 11:24

正好在学爬虫仔细学习学习大佬的代码

hedy16 发表于 2019-1-27 11:47

这个不错，喜欢看漫画不一定知道这个怎么弄搞一串代码

页: [1]

吾爱破解 - 52pojie.cn's Archiver

#原创//多线程爬取某漫画网站的漫画