# coding:utf-8
import urllib.request as ub
import urllib.parse as parse
import http.cookiejar as cjar
import re
from lxml import etree
from queue import Queue, Empty
import threading
import os
import json
# Decorator that retries the wrapped fetch up to three times before giving up
def get_body(func):
    def wrapper(*args, **kwargs):
        for i in range(0, 3):
            try:
                html = func(*args, **kwargs)
            except Exception as e:
                print("error:{},url:{}".format(str(e), args[1]))
                if i == 2:
                    i = 3
                continue
            else:
                return html
        if i == 3:
            return False
    return wrapper


# Thread class that extracts the image URLs from each chapter page
class chaptersThread(threading.Thread):
    def __init__(self, chaptersQ, ImagesUrlQ, threadName, spider):
        super(chaptersThread, self).__init__()
        self.chaptersQ = chaptersQ
        self.threadName = threadName
        self.ImagesUrlQ = ImagesUrlQ
        self.spider = spider

    def run(self):
        print("{}: thread is working".format(self.threadName))
        while not CHAPTERS_EXIT:
            try:
                chapterTuple = self.chaptersQ.get(False)
            except Empty:
                continue
            title = chapterTuple[0]
            url = chapterTuple[1]
            url = self.spider.index + url[1:]
            html = self.spider.get_data(url)  # raw, undecoded html
            if html:
                html = re.sub('\xa9|\xbb', '', html.decode(encoding="utf-8", errors="ignore"))
                imagesUrl = self.parseUrl(html)
                self.ImagesUrlQ.put({title: imagesUrl})
            else:
                print("fetch failed: {}".format(url))
        print("{}:{} finished".format(self.threadName, self.name))

    def parseUrl(self, html):
        imagesUrl = []
        compile = re.compile(r"chapterImages = (\[.*?\])")
        compile2 = re.compile(r'chapterPath = "(.*?)"')
        images = json.loads(compile.findall(html)[0])
        url = compile2.findall(html)[0]
        for image in images:
            imagesUrl.append("http://res.gufengmh.com/" + url + image)
        return imagesUrl


# Thread class that downloads the page images of each chapter
class ImagesUrlThread(threading.Thread):
    def __init__(self, ImagesUrlQ, threadName, spider):
        super(ImagesUrlThread, self).__init__()
        self.ImagesUrlQ = ImagesUrlQ
        self.threadName = threadName
        self.spider = spider

    def run(self):
        print("{}: thread is working".format(self.threadName))
        while not IMAGESURL_EXIT:
            try:
                chapter = self.ImagesUrlQ.get(False)
            except Empty:
                continue
            title = list(chapter.keys())[0]
            images = chapter[title]  # list of image URLs for this chapter
            try:
                os.mkdir(os.path.join(os.getcwd(), "manhua/" + title))
            except Exception as e:
                print("error:{}".format(str(e)))
            for i in range(len(images)):
                url = images[i]
                imagesIo = self.spider.get_data(url)
                if imagesIo:
                    save_path = "manhua/" + title + "/" + str(i) + ".jpg"
                    with open(os.path.join(os.getcwd(), save_path), "bw") as file:
                        try:
                            file.write(imagesIo)
                        except:
                            pass
                else:
                    global ErrorQ
                    ErrorQ.put({"title": title, "page": str(i), "url": url})
                    print("chapter: {}, page {}, url: {}: download failed".format(title, str(i), url))
print("获取完成{}".format(self.threadName)) #用来从新获取 获取失败的章节 class ErrorUrlThrad(threading.Thread): def __init__(self,ErrorQ,tName): super(ErrorUrlThrad,self).__init__()
self.ErrorQ = ErrorQ self.spider = MSpider()
self.threadName = tName def run(self): print("{}:线程正在工作".format(self.threadName)) while not ERRORU_EXIT: error_dict = self.ErrorQ.get(False)
title = error_dict["title"]
page =error_dict["page"]
url = error_dict["url"]
imageIo = self.spider.get_data(url) if imageIo: with open(os.path.join(os.getcwd(), "manhua/" + title + "/" + page + ".jpg"),"wb") as f: f.write(imageIo) else: print("章节:{},第{}页,url:{},重新获取失败".format(title, page, url))
class MSpider(object):
    def __init__(self, index=''):
        self.cookie = cjar.CookieJar()
        self.opener = ub.build_opener(ub.HTTPCookieProcessor(self.cookie))
        self.opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36")]
        self.index = index  # comic site home page

    def get_cookie(self):
        try:
            self.opener.open(self.index, timeout=10)
        except Exception as e:
            print(str(e))

    # Query the site's search page and return the comic's index page URL
    def search_api(self, name):
        if isinstance(name, str):
            data_dict = {
                'keywords': name
            }
        else:
            print("name is not a string")
            return 0
        data = parse.urlencode(data_dict)
        url = "http://www.gufengmh.com/search/?" + data
        response = self.opener.open(url)
        html = response.read().decode("gbk", "ignore")
        html_xpath = etree.HTML(html)
        try:
            index = html_xpath.xpath('//*[@id="contList"]/li/a/@href')[0]  # comic index page
            print("comic index page: {}".format(index))
        except:
            index = ''
        return index

    # Collect every chapter title and link from the comic's index page
    def get_chapter(self, index):
        response = self.opener.open(index)
        html = response.read()
        html = re.sub('\xa9|\xbb', '', html.decode(encoding="utf-8", errors="ignore"))  # strip special characters
        html_xpath = etree.HTML(html)
        chapters_href = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/@href')
        chapters_title = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/span/text()')
        print(chapters_href)
        print(chapters_title)
        if len(chapters_title) == len(chapters_href):
            chapters = Queue()
            for i in range(len(chapters_title)):
                chapters.put((chapters_title[i], chapters_href[i]))
            return chapters
    # Kick off the chapter threads first, then the image-download threads
    def get_oneChapter(self, chaptersQ):
        ImagesUrlQ = Queue()
        tNames = ["cps1", "cps2", "cps3", "cps4", "cps5", "cps6", "cps7", "cps8", "cps9", "cps10"]
        cpts = []  # chapter-crawling threads
        for tName in tNames:
            cpt = chaptersThread(chaptersQ, ImagesUrlQ, tName, self)  # create thread
            cpt.start()
            cpts.append(cpt)
        while not chaptersQ.empty():
            pass
        global CHAPTERS_EXIT
        CHAPTERS_EXIT = True
        for cpt in cpts:
            cpt.join()
        print("chapters fetched")
        print(ImagesUrlQ.qsize())
        Imuts = []  # image-download threads
        t2Names = ["IMUs1", "IMUs2", "IMUs3", "IMUs4", "IMUs5", "IMUs6", "IMUs7", "IMUs8", "IMUs9", "IMUs10"]
        for tName in t2Names:
            Imut = ImagesUrlThread(ImagesUrlQ, tName, self)  # create thread
            Imut.start()
            Imuts.append(Imut)
        while not ImagesUrlQ.empty():
            pass
        global IMAGESURL_EXIT
        IMAGESURL_EXIT = True
        for Imut in Imuts:
            Imut.join()
        print("all downloads finished")

    @get_body
    def get_data(self, *args, **kwargs):
        return self.opener.open(args[0], timeout=30).read()  # args[0] is the url


CHAPTERS_EXIT = False
IMAGESURL_EXIT = False
ERRORU_EXIT = False
error_num = 0
ErrorQ = Queue()  # queue of downloads that failed


def main():
    manhuaName = input("Enter the name of the comic you want to search for: ")
    try:
        os.mkdir("manhua")
    except Exception as e:
        print(str(e))
    index = "http://www.gufengmh.com/"
    spider = MSpider(index)
    spider.get_cookie()
    index = spider.search_api(manhuaName)
    if index:
        chapters = spider.get_chapter(index)
        spider.get_oneChapter(chapters)
        if not ErrorQ.empty():
            errorTnames = ["error1", "error2", "error3"]
            eThreads = []
            for tname in errorTnames:
                eThread = ErrorUrlThrad(ErrorQ, tname)
                eThread.start()
                eThreads.append(eThread)
            while not ErrorQ.empty():
                pass
            global ERRORU_EXIT
            ERRORU_EXIT = True  # let the retry threads exit their loop
            # wait for the retry threads to finish
            for t in eThreads:
                t.join()
    else:
        print("------------ comic not found -----------")
        exit(1)


if __name__ == '__main__':
    main()
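For anyone who would rather drive the spider from their own script instead of through main(), a minimal sketch might look like the following; the filename gufeng_spider.py and the search keyword are placeholders, not part of the original code.

import os
from gufeng_spider import MSpider   # hypothetical filename for the script above

os.makedirs("manhua", exist_ok=True)            # the download directory that main() normally creates
spider = MSpider("http://www.gufengmh.com/")
spider.get_cookie()
index = spider.search_api("海贼王")              # returns the comic's index page URL, or '' if nothing matches
if index:
    chapters = spider.get_chapter(index)        # Queue of (title, href) tuples
    spider.get_oneChapter(chapters)             # saves pages as manhua/<chapter title>/<page>.jpg

Note that the retry logic for failed pages only runs inside main(), so this sketch skips it.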
Friendly reminder: the code is written for Python 3.7. Apologies to Python 2.7 users; the library imports are hard to carry over......
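If you really do need to run something similar on Python 2.7, the main incompatibility is the renamed standard-library modules. A rough import shim is sketched below; it only covers the imports, not other 2/3 differences such as input() versus raw_input() or bytes handling.

try:  # Python 3 module names
    import urllib.request as ub
    import urllib.parse as parse
    import http.cookiejar as cjar
    from queue import Queue, Empty
except ImportError:  # Python 2 fallbacks
    import urllib2 as ub
    import urllib as parse
    import cookielib as cjar
    from Queue import Queue, Empty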
If you need a crawler written, feel free to reach out to me, hehe...