A beginner's multithreaded crawler, using a comic site as an example. The script drives three groups of worker threads connected by queues: one parses each chapter page for image URLs, one downloads the images, and one retries pages that failed.
# coding:utf-8
import urllib.request as ub
import urllib.parse as parse
import http.cookiejar as cjar
import re
from lxml import etree
from queue import Queue
import threading
import os
import json
import sys
# Decorator that fetches data with retries: tries the wrapped request up to 3 times,
# returns 1 on a 404, False after repeated failures, and otherwise the response body
def get_body(func):
def wrapper(*args, **kwargs):
for i in range(0, 3):
try:
html = func(*args, **kwargs)
except Exception as e:
if str(e).find('404')>=0:
print("error:{},url:{}".format(str(e), args[1]))
return 1
print("error:{},url:{}".format(str(e),args[1]))
if i == 2:
i = 3
continue
else:
return html
if i == 3:
return False
return wrapper
# Worker thread that extracts the image URLs from each chapter page
class chaptersThread(threading.Thread):
def __init__(self,chaptersQ,ImagesUrlQ,threadName,spider):
super(chaptersThread,self).__init__()
self.chaptersQ = chaptersQ
self.threadName = threadName
self.ImagesUrlQ = ImagesUrlQ
self.spider = spider
def run(self):
print("{}:线程正在工作".format(self.threadName))
global CHAPTERS_EXIT
while not CHAPTERS_EXIT:
try:
                chapterTuple = self.chaptersQ.get(False)  # non-blocking get from the chapter queue
except Exception as e:
break
title = chapterTuple[0]
url = chapterTuple[1]
url = self.spider.index + url[1:]
            html = self.spider.get_data(url)  # returns the raw HTML bytes
if html:
html = re.sub('\xa9|\xbb', '', html.decode(encoding="utf-8"))
imagesUrl=self.parseUrl(html)
self.ImagesUrlQ.put({title:imagesUrl})
else:
print("获取失败{}".format(url))
print("{}:{}完成工作".format(self.threadName,self.name))
def parseUrl(self,html):
imagesUrl=[]
        images_re = re.compile(r"chapterImages = (\[.*?\])")
        path_re = re.compile(r'chapterPath = "(.*?)"')
        images = json.loads(images_re.findall(html)[0])
        im_path = path_re.findall(html)[0]
im_url = "http://res.gufengmh8.com/"
for image in images:
imagesUrl.append(im_url + im_path + image)
        return imagesUrl
# Worker thread that downloads the images of each chapter
class ImagesUrlThread(threading.Thread):
def __init__(self,ImagesUrlQ,threadName,spider):
super(ImagesUrlThread,self).__init__()
self.ImagesUrlQ = ImagesUrlQ
self.threadName = threadName
self.spider = spider
def run(self):
print("{}:线程正在工作".format(self.threadName))
global IMAGESURL_EXIT
while not IMAGESURL_EXIT:
try:
images_chapter = self.ImagesUrlQ.get(False)
except:
break
title = list(images_chapter.keys())[0]
            images = images_chapter[title]  # list of image URLs for this chapter
try:
os.mkdir(os.path.join(self.spider.dir_path+title))
except Exception as e:
pass
#print("error:{}".format(str(e)))
for i in range(len(images)):
                url = images[i]
imagesIo = self.spider.get_data(url)
                if imagesIo and imagesIo != 1:  # real image bytes were returned
save_path = self.spider.dir_path + title + "/" + str(i) + ".jpg"
with open(save_path, "bw") as file:
try:
file.write(imagesIo)
except:
pass
                elif imagesIo == 1:
                    # get_data returned 1, meaning the page 404'd; skip it
                    pass
else:
global ErrorQ
ErrorQ.put({"title":title,"page":str(i),"url":url})
print("章节:{},第{}页,url:{},获取失败".format(title,str(i),url))
print("获取完成{}".format(self.threadName))
# Thread used to retry the pages that failed to download
class ErrorUrlThrad(threading.Thread):
def __init__(self,ErrorQ,tName,spider):
super(ErrorUrlThrad,self).__init__()
self.ErrorQ = ErrorQ
self.spider = spider
self.threadName = tName
def run(self):
print("{}:线程正在工作".format(self.threadName))
global ERRORU_EXIT
while not ERRORU_EXIT:
try:
error_dict = self.ErrorQ.get(False)
except Exception as e:
break
title = error_dict["title"]
page =error_dict["page"]
url = error_dict["url"]
imageIo = self.spider.get_data(url)
if imageIo:
with open(os.path.join(self.spider.dir_path+title + "/" + page + ".jpg"),"wb") as f:
f.write(imageIo)
else:
print("章节:{},第{}页,url:{},重新获取失败".format(title, page, url))
class MSpider(object):
def __init__(self,index='',CartoonName=None,dir_path=''):
self.cookie = cjar.CookieJar()
self.opener = ub.build_opener(ub.HTTPCookieProcessor(self.cookie))
self.opener.addheaders =[("User-Agent","Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Mobile Safari/537.36")]
        self.index = index  # home page of the comic site
self.CartoonName = CartoonName
self.dir_path = dir_path
def get_cookie(self):
try:
self.opener.open(self.index,timeout=10)
except Exception as e:
print(str(e))
def search_api(self):
if not isinstance(self.CartoonName,str):
self.CartoonName = str(self.CartoonName)
data_dict = {
'keywords': self.CartoonName
}
data = parse.urlencode(data_dict)
url = self.index+"search/?" + data
html = self.get_data(url)
html = re.sub('\xa9|\xbb', '', html.decode(encoding="utf-8"))
html_xpath = etree.HTML(html)
try:
            cartoonList = html_xpath.xpath('//*[@id="update_list"]/div/div/div/a/@href')  # links to each comic's page
update = html_xpath.xpath('//*[@id="update_list"]/div/div/div/p/span/text()')
for index,date in zip(cartoonList,update):
print("更新日期:{},漫画链接:{}".format(date,index))
index = int(input('请根据时间选择你要看的漫画?请输入阿拉伯数字进行选择。'))
if index<=0:
index = 1
except Exception as e:
print("error:{}".format(str(e)))
return ""
        return cartoonList[index-1]
    # Fetch the chapter list and ask the user for a chapter range
def get_chapter(self,index):
html = self.get_data(index)
        html = re.sub('\xa9|\xbb','',html.decode(encoding="utf-8"))  # strip special characters
html_xpath = etree.HTML(html)
chapters_href = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/@href')
chapters_title = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/span/text()')
chapters_len = len(chapters_title)
print("""最近更新10章更新:
{}
{}""".format(chapters_title-10:chapters_len-5],chapters_title-5:chapters_len]))
print('因为其中包含特殊章节,并不是每个章节链接和每一话动漫都对应.\n'
'请自行斟酌要爬去的章节范围.\n\n\n'
'您搜素漫画一共{}章节,'\
.format(chapters_len))
while True:
try:
start_page = int(input("请输入起始章节:"))
end_page = int(input("请输入结束章节:"))
if end_page>chapters_len:
print("章节超出搜素范围,请重新输入")
continue
elif start_page>end_page:
print('起始章节大于结束章节,请重新输入')
continue
elif start_page<1:
print("起始章节存在错误")
continue
break
except Exception as e:
print('您输入的章节数目格式存在错误请重新出入,Error:{}'.format(str(e)))
if chapters_len==len(chapters_href):
chapters = Queue()
for i in range(start_page-1,end_page):
                chapters.put((chapters_title[i], chapters_href[i]))
return chapters
    # Launch the worker threads: first chapter parsing, then image downloading
def get_oneChapter(self,chaptersQ):
ImagesUrlQ=Queue()
tNames = ["cps1","cps2","cps3","cps4","cps5","cps6","cps7","cps8","cps9","cps10"]
        cpts = []  # threads that crawl the chapter pages
for tName in tNames:
            cpt = chaptersThread(chaptersQ,ImagesUrlQ,tName,self)  # create a chapter worker thread
cpt.start()
cpts.append(cpt)
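        # Busy-wait until the chapter queue is drained, then raise the exit flag so the
        # parsing threads stop (they also break out once get() raises on the empty queue).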
while not chaptersQ.empty():
pass
global CHAPTERS_EXIT
CHAPTERS_EXIT = True
for cpt in cpts:
cpt.join()
print("章节获取完成,一共获取了{}章漫画".format(ImagesUrlQ.qsize()))
if ImagesUrlQ.empty():
print("ImagesUrlQ is empty ,漫画被下架!")
exit(1)
        Imuts = []  # image-downloading threads
t2Names = ["IMUs1","IMUs2","IMUs3","IMUs4","IMUs5","IMUs6","IMUs7","IMUs8","IMUs9","IMUs10",
"IMUs11", "IMUs12", "IMUs13", "IMUs14", "IMUs15", "IMUs16", "IMUs17", "IMUs18", "IMUs19", "IMUs20"]
for tName in t2Names:
            Imut = ImagesUrlThread(ImagesUrlQ, tName, self)  # create an image worker thread
Imut.start()
Imuts.append(Imut)
while not ImagesUrlQ.empty():
pass
global IMAGESURL_EXIT
IMAGESURL_EXIT = True
for Imut in Imuts:
Imut.join()
print("全部获取完成")
@get_body
def get_data(self,*args,**kwargs):
        return self.opener.open(args[0],timeout=30).read()  # args[0] is the target URL
CHAPTERS_EXIT = False
IMAGESURL_EXIT=False
ERRORU_EXIT=False
error_num=0
ErrorQ = Queue()  # queue of downloads that failed and need retrying
def main():
CartoonName = input("请输入你想搜素的漫画名:")
if sys.platform.startswith('win'):
dir_path="manhua/"+CartoonName+"/"
else:
dir_path ="/storage/emulated/0/manhua/"+CartoonName+"/"
try:
os.mkdir(dir_path)
except Exception as e:
#print(str(e))
pass
index = "http://m.gufengmh8.com/"
spider = MSpider(index,CartoonName,dir_path)
spider.get_cookie()
index = spider.search_api()
if index:
chapters = spider.get_chapter(index)
spider.get_oneChapter(chapters)
if not ErrorQ.empty():
errorTnames = ["error1","error2","error3"]
eThreads = []
for tname in errorTnames:
eThread = ErrorUrlThrad(ErrorQ,tname,spider)
eThread.start()
eThreads.append(eThread)
while not ErrorQ.empty():
pass
            # wait for the retry threads to finish
for t in eThreads:
t.join()
else:
print("------------漫画不存在-----------")
exit(1)
if __name__ == '__main__':
main()
Runtime environment: Android and Windows. Written for Python 3.5; it is not compatible with Python 2.7.
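Since the script targets Python 3 only, a small interpreter guard at the top of the file makes the incompatibility explicit rather than failing later with import errors. This is a minimal sketch and not part of the original script:

import sys

# Abort early with a clear message when the interpreter is too old (e.g. Python 2.x).
if sys.version_info < (3, 5):
    sys.exit("This crawler requires Python 3.5 or newer.")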
hustlzp posted on 2019-3-5 20:30
Thanks, boss — I don't know how to use this feature yet. Nicely written!
A true beginner here to learn. Is there a finished build, boss?
莫問道 posted on 2019-5-1 15:27
Is there a finished build, boss?
There is no graphical interface; you need to download Python 3 and the lxml library yourself, then run the script.
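For readers who want to confirm the lxml dependency before running the script, a quick check like the one below works; this is only an illustrative sketch, not something from the original thread:

# Verify that lxml is importable; print the usual install hint if it is not.
try:
    from lxml import etree  # noqa: F401
    print("lxml is available; you can run the crawler.")
except ImportError:
    print("lxml is missing; install it with: pip install lxml")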