# Original // Multithreaded scraper for comics from a certain comic site
# coding: utf-8
import urllib.request as ub
import urllib.parse as parse
import http.cookiejar as cjar
import re
from lxml import etree
from queue import Queue, Empty
import threading
import os
import json
# Decorator that wraps data-fetching calls with retry logic
def get_body(func):
    def wrapper(*args, **kwargs):
        # Try up to three times; give up and return False after the last failure.
        for i in range(0, 3):
            try:
                html = func(*args, **kwargs)
            except Exception as e:
                print("error:{},url:{}".format(str(e), args[1]))
                continue
            else:
                return html
        return False
    return wrapper
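# Note on the decorator above: get_body wraps instance methods, so inside
# wrapper args[0] is the spider instance and args[1] is the url being fetched.
# Callers can therefore test the result with a plain `if html:` check,
# because wrapper returns False once all three attempts fail.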
# Worker thread that extracts the image URLs for each chapter
class chaptersThread(threading.Thread):
    def __init__(self, chaptersQ, ImagesUrlQ, threadName, spider):
        super(chaptersThread, self).__init__()
        self.chaptersQ = chaptersQ
        self.threadName = threadName
        self.ImagesUrlQ = ImagesUrlQ
        self.spider = spider

    def run(self):
        print("{}: thread started".format(self.threadName))
        while not CHAPTERS_EXIT:
            try:
                chapterTuple = self.chaptersQ.get(False)
            except Empty:
                continue
            title = chapterTuple[0]
            url = chapterTuple[1]
            url = self.spider.index + url[1:]
            html = self.spider.get_data(url)  # raw, undecoded html
            if html:
                html = re.sub('\xa9|\xbb', '', html.decode(encoding="utf-8", errors="ignore"))
                imagesUrl = self.parseUrl(html)
                self.ImagesUrlQ.put({title: imagesUrl})
            else:
                print("fetch failed: {}".format(url))
        print("{}:{} finished".format(self.threadName, self.name))
    def parseUrl(self, html):
        imagesUrl = []
        images_re = re.compile(r"chapterImages = (\[.*?\])")
        path_re = re.compile(r'chapterPath = "(.*?)"')
        images = json.loads(images_re.findall(html)[0])
        url = path_re.findall(html)[0]
        for image in images:
            imagesUrl.append("http://res.gufengmh.com/" + url + image)
        return imagesUrl
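    # The two regexes in parseUrl assume the chapter page embeds JavaScript
    # of roughly this shape (illustrative snippet, not copied from the site):
    #   var chapterImages = ["0001.jpg", "0002.jpg"];
    #   var chapterPath = "images/comic/1/123/";
    # json.loads turns the bracketed list into a Python list of file names,
    # which are then joined onto the image host and chapterPath.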
# Worker thread that downloads every image in a chapter
class ImagesUrlThread(threading.Thread):
    def __init__(self, ImagesUrlQ, threadName, spider):
        super(ImagesUrlThread, self).__init__()
        self.ImagesUrlQ = ImagesUrlQ
        self.threadName = threadName
        self.spider = spider

    def run(self):
        print("{}: thread started".format(self.threadName))
        while not IMAGESURL_EXIT:
            try:
                chapter = self.ImagesUrlQ.get(False)
            except Empty:
                continue
            title = list(chapter.keys())[0]
            images = chapter[title]  # list of image URLs for this chapter
            try:
                os.mkdir(os.path.join(os.getcwd(), "manhua/" + title))
            except Exception as e:
                print("error:{}".format(str(e)))
            for i in range(len(images)):
                url = images[i]
                imagesIo = self.spider.get_data(url)
                if imagesIo:
                    save_path = "manhua/" + title + "/" + str(i) + ".jpg"
                    with open(os.path.join(os.getcwd(), save_path), "bw") as file:
                        try:
                            file.write(imagesIo)
                        except:
                            pass
                else:
                    ErrorQ.put({"title": title, "page": str(i), "url": url})
                    print("chapter:{}, page:{}, url:{} fetch failed".format(title, str(i), url))
        print("{} finished".format(self.threadName))
# Thread that retries pages whose first download failed
class ErrorUrlThread(threading.Thread):
    def __init__(self, ErrorQ, tName):
        super(ErrorUrlThread, self).__init__()
        self.ErrorQ = ErrorQ
        self.spider = MSpider()
        self.threadName = tName

    def run(self):
        print("{}: thread started".format(self.threadName))
        while not ERRORU_EXIT:
            try:
                error_dict = self.ErrorQ.get(False)
            except Empty:
                continue
            title = error_dict["title"]
            page = error_dict["page"]
            url = error_dict["url"]
            imageIo = self.spider.get_data(url)
            if imageIo:
                with open(os.path.join(os.getcwd(), "manhua/" + title + "/" + page + ".jpg"), "wb") as f:
                    f.write(imageIo)
            else:
                print("chapter:{}, page:{}, url:{} retry failed".format(title, page, url))
class MSpider(object):
    def __init__(self, index=''):
        self.cookie = cjar.CookieJar()
        self.opener = ub.build_opener(ub.HTTPCookieProcessor(self.cookie))
        self.opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36")]
        self.index = index  # comic homepage

    def get_cookie(self):
        try:
            self.opener.open(self.index, timeout=10)
        except Exception as e:
            print(str(e))
    def search_api(self, name):
        if isinstance(name, str):
            data_dict = {
                'keywords': name
            }
        else:
            print("name must be a string")
            return 0
        data = parse.urlencode(data_dict)
        url = "http://www.gufengmh.com/search/?" + data
        response = self.opener.open(url)
        html = response.read().decode("gbk", "ignore")
        html_xpath = etree.HTML(html)
        try:
            index = html_xpath.xpath('//*[@id="contList"]/li/a/@href')[0]  # comic homepage
            print("comic homepage: {}".format(index))
        except:
            index = ''
        return index
    # Fetch the chapter list and queue up (title, href) pairs
    def get_chapter(self, index):
        response = self.opener.open(index)
        html = response.read()
        html = re.sub('\xa9|\xbb', '', html.decode(encoding="utf-8", errors="ignore"))  # strip special characters
        html_xpath = etree.HTML(html)
        chapters_href = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/@href')
        chapters_title = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/span/text()')
        print(chapters_href)
        print(chapters_title)
        chapters = Queue()
        if len(chapters_title) == len(chapters_href):
            for i in range(len(chapters_title)):
                chapters.put((chapters_title[i], chapters_href[i]))
        return chapters
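    # get_chapter hands back a Queue of (title, href) tuples; the
    # chaptersThread workers pop these and resolve each relative href
    # against the site root before fetching.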
    # Drive the chapter-parsing threads, then the image-download threads
    def get_oneChapter(self, chaptersQ):
        ImagesUrlQ = Queue()
        tNames = ["cps1", "cps2", "cps3", "cps4", "cps5", "cps6", "cps7", "cps8", "cps9", "cps10"]
        cpts = []  # chapter-parsing worker threads
        for tName in tNames:
            cpt = chaptersThread(chaptersQ, ImagesUrlQ, tName, self)  # create worker thread
            cpt.start()
            cpts.append(cpt)
        while not chaptersQ.empty():
            pass
        global CHAPTERS_EXIT
        CHAPTERS_EXIT = True
        for cpt in cpts:
            cpt.join()
        print("chapter list finished")
        print(ImagesUrlQ.qsize())
        Imuts = []  # image-download worker threads
        t2Names = ["IMUs1", "IMUs2", "IMUs3", "IMUs4", "IMUs5", "IMUs6", "IMUs7", "IMUs8", "IMUs9", "IMUs10"]
        for tName in t2Names:
            Imut = ImagesUrlThread(ImagesUrlQ, tName, self)  # create worker thread
            Imut.start()
            Imuts.append(Imut)
        while not ImagesUrlQ.empty():
            pass
        global IMAGESURL_EXIT
        IMAGESURL_EXIT = True
        for Imut in Imuts:
            Imut.join()
        print("all downloads finished")

    @get_body
    def get_data(self, *args, **kwargs):
        return self.opener.open(args[0], timeout=30).read()  # args[0] is the url
CHAPTERS_EXIT = False
IMAGESURL_EXIT = False
ERRORU_EXIT = False
error_num = 0
ErrorQ = Queue()  # queue of image downloads that failed and need a retry
def main():
    manhuaName = input("Enter the name of the comic to search for: ")
    try:
        os.mkdir("manhua")
    except Exception as e:
        print(str(e))
    index = "http://www.gufengmh.com/"
    spider = MSpider(index)
    spider.get_cookie()
    index = spider.search_api(manhuaName)
    if index:
        chapters = spider.get_chapter(index)
        spider.get_oneChapter(chapters)
        if not ErrorQ.empty():
            errorTnames = ["error1", "error2", "error3"]
            eThreads = []
            for tname in errorTnames:
                eThread = ErrorUrlThread(ErrorQ, tname)
                eThread.start()
                eThreads.append(eThread)
            while not ErrorQ.empty():
                pass
            global ERRORU_EXIT
            ERRORU_EXIT = True
            # wait for the retry threads to finish
            for t in eThreads:
                t.join()
    else:
        print("------------ comic not found -----------")
        exit(1)

if __name__ == '__main__':
    main()
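A side note on the shutdown pattern: the script busy-waits on Queue.empty() and then flips a global exit flag, which can race with workers that grabbed the last item moments earlier. A sentinel-based loop avoids both the busy-wait and the globals. The sketch below is a minimal standalone illustration, not part of the scraper; worker, N_WORKERS, and the job payloads are made-up names for the example.

import threading
from queue import Queue

SENTINEL = None  # pushed once per worker to signal "no more work"

def worker(q):
    while True:
        item = q.get()        # blocks instead of spinning on q.empty()
        if item is SENTINEL:
            break             # clean shutdown, no global exit flag needed
        print("processing", item)

q = Queue()
for job in range(5):
    q.put(job)

N_WORKERS = 3
threads = [threading.Thread(target=worker, args=(q,)) for _ in range(N_WORKERS)]
for t in threads:
    t.start()
for _ in threads:
    q.put(SENTINEL)           # one sentinel per worker
for t in threads:
    t.join()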
Note: the code targets Python 3.7. Apologies that it is not compatible with 2.7 — the libraries don't import cleanly there.