小白的多线程爬虫。以某漫画为例

.Net_破解 发表于 2019-3-5 19:52

#coding:utf-8
import urllib.request as ub
import urllib.parse as parse
import http.cookiejar as cjar
import re
from lxml import etree
from queue import Queue
import threading
import os
import json
import sys

#获取数据的装饰器函数
def get_body(func):
    """Decorate a fetch function so that it is retried and never raises.

    The wrapped callable is attempted up to three times.

    The wrapper returns:
        * whatever ``func`` returns on the first successful attempt;
        * ``1`` as soon as an attempt fails with an error that mentions 404
          (the resource does not exist, so retrying is pointless);
        * ``False`` when all three attempts fail for any other reason.

    Note that the ``1`` sentinel is truthy, so callers have to test for it
    explicitly instead of relying on ``if result:``.
    """
    import functools  # local import: only this decorator needs it

    max_attempts = 3

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # The URL is normally the first argument after ``self``.  Resolve it
        # defensively so that building the log message can never raise.
        if len(args) > 1:
            url = args[1]
        elif args:
            url = args[0]
        else:
            url = kwargs.get("url", "")

        for _ in range(max_attempts):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                # Deliberately broad: this is the best-effort boundary that
                # converts any network failure into a sentinel value.
                print("error:{},url:{}".format(str(e), url))
                if getattr(e, "code", None) == 404 or "404" in str(e):
                    return 1
        return False

    return wrapper
#获取章节中图片地址的线程类
class chaptersThread(threading.Thread):
    """Worker thread that turns queued chapters into lists of image URLs.

    Every item taken from ``chaptersQ`` is a ``(title, href)`` tuple.  For
    each chapter page that can be fetched and parsed, a
    ``{title: [image_url, ...]}`` dict is put on ``ImagesUrlQ``.
    """

    # JavaScript variables embedded in a chapter page.
    _IMAGES_RE = re.compile(r"chapterImages = (\[.*?\])")
    _PATH_RE = re.compile(r'chapterPath = "(.*?)"')
    _IMAGE_HOST = "http://res.gufengmh8.com/"

    def __init__(self, chaptersQ, ImagesUrlQ, threadName, spider):
        super(chaptersThread, self).__init__()
        self.chaptersQ = chaptersQ
        self.threadName = threadName
        self.ImagesUrlQ = ImagesUrlQ
        self.spider = spider

    def run(self):
        """Drain ``chaptersQ`` until it is empty or the stop flag is set."""
        from queue import Empty  # local import: the file only imports Queue

        print("{}:线程正在工作".format(self.threadName))
        # CHAPTERS_EXIT is a module-level flag; it is only read here.
        while not CHAPTERS_EXIT:
            try:
                chapterTuple = self.chaptersQ.get(False)
            except Empty:
                break
            title = chapterTuple[0]
            # The href starts with "/" and spider.index ends with "/", so the
            # leading slash is dropped before concatenating.
            url = self.spider.index + chapterTuple[1][1:]
            html = self.spider.get_data(url)  # raw, undecoded page
            # get_data yields bytes on success, 1 for a 404 and False after
            # its retries are exhausted; only bytes can be decoded.
            if not isinstance(html, bytes) or not html:
                print("获取失败{}".format(url))
                continue
            # errors="replace" keeps one badly encoded page from killing
            # the whole worker thread.
            text = re.sub('\xa9|\xbb', '',
                          html.decode(encoding="utf-8", errors="replace"))
            imagesUrl = self.parseUrl(text)
            if imagesUrl:
                self.ImagesUrlQ.put({title: imagesUrl})
            else:
                print("获取失败{}".format(url))
        print("{}:{}完成工作".format(self.threadName, self.name))

    def parseUrl(self, html):
        """Extract the absolute image URLs from a decoded chapter page.

        Args:
            html: The chapter page as text.

        Returns:
            A list of image URLs, or an empty list when the page does not
            contain the ``chapterImages`` / ``chapterPath`` variables or the
            image array is not valid JSON.
        """
        images_match = self._IMAGES_RE.search(html)
        path_match = self._PATH_RE.search(html)
        if images_match is None or path_match is None:
            return []
        try:
            images = json.loads(images_match.group(1))
        except ValueError:
            return []
        base = self._IMAGE_HOST + path_match.group(1)
        return [base + image for image in images]
#获取每一章节中的图片的线程类
class ImagesUrlThread(threading.Thread):
    """Worker thread that downloads every image of the queued chapters.

    Items taken from ``ImagesUrlQ`` are ``{title: [image_url, ...]}`` dicts.
    Each image is saved as ``<spider.dir_path><title>/<page index>.jpg``.
    Pages that could not be downloaded are recorded on the module-level
    ``ErrorQ`` as ``{"title": ..., "page": ..., "url": ...}``.
    """

    def __init__(self, ImagesUrlQ, threadName, spider):
        super(ImagesUrlThread, self).__init__()
        self.ImagesUrlQ = ImagesUrlQ
        self.threadName = threadName
        self.spider = spider

    def run(self):
        """Drain ``ImagesUrlQ`` until it is empty or the stop flag is set."""
        from queue import Empty  # local import: the file only imports Queue

        print("{}:线程正在工作".format(self.threadName))
        # IMAGESURL_EXIT is a module-level flag; it is only read here.
        while not IMAGESURL_EXIT:
            try:
                images_chapter = self.ImagesUrlQ.get(False)
            except Empty:
                break
            for title, images in images_chapter.items():
                self._download_chapter(title, images)
        print("获取完成{}".format(self.threadName))

    def _download_chapter(self, title, images):
        """Download all pages of one chapter into its own directory."""
        chapter_dir = self.spider.dir_path + title
        try:
            # makedirs also creates missing parents and tolerates reruns.
            os.makedirs(chapter_dir, exist_ok=True)
        except OSError as e:
            print("error:{}".format(str(e)))
            return

        for i, url in enumerate(images):
            imagesIo = self.spider.get_data(url)
            if isinstance(imagesIo, bytes) and imagesIo:
                save_path = chapter_dir + "/" + str(i) + ".jpg"
                try:
                    with open(save_path, "wb") as file:
                        file.write(imagesIo)
                except OSError as e:
                    print("error:{}".format(str(e)))
            elif imagesIo == 1:
                # 404 sentinel from get_data: the page does not exist, so a
                # retry would be pointless.
                continue
            else:
                ErrorQ.put({"title": title, "page": str(i), "url": url})
                print("章节:{},第{}页,url:{},获取失败".format(title, str(i), url))
#用来从新获取获取失败的章节
class ErrorUrlThrad(threading.Thread):
    """Worker thread that retries pages whose first download failed.

    Items taken from ``ErrorQ`` are dicts with the keys ``"title"``,
    ``"page"`` and ``"url"``.  A page that is fetched successfully is saved
    as ``<spider.dir_path><title>/<page>.jpg``.
    """

    def __init__(self, ErrorQ, tName, spider):
        super(ErrorUrlThrad, self).__init__()
        self.ErrorQ = ErrorQ
        self.spider = spider
        self.threadName = tName

    def run(self):
        """Drain ``ErrorQ`` until it is empty or the stop flag is set."""
        from queue import Empty  # local import: the file only imports Queue

        print("{}:线程正在工作".format(self.threadName))
        # ERRORU_EXIT is a module-level flag; it is only read here.
        while not ERRORU_EXIT:
            try:
                error_dict = self.ErrorQ.get(False)
            except Empty:
                break

            title = error_dict["title"]
            page = error_dict["page"]
            url = error_dict["url"]

            imageIo = self.spider.get_data(url)
            # get_data may return the truthy 404 sentinel 1, so the payload
            # type is checked instead of its truthiness.
            if isinstance(imageIo, bytes) and imageIo:
                save_path = self.spider.dir_path + title + "/" + page + ".jpg"
                try:
                    with open(save_path, "wb") as f:
                        f.write(imageIo)
                except OSError as e:
                    # One unwritable file must not stop the remaining retries.
                    print("error:{}".format(str(e)))
            else:
                print("章节:{},第{}页,url:{},重新获取失败".format(title, page, url))

class MSpider(object):
    """Crawler for a single comic: search, chapter listing and download.

    Attributes are set by ``__init__``: ``index`` is the site root URL
    (expected to end with "/"), ``CartoonName`` the search keyword and
    ``dir_path`` the directory prefix (ending with "/") images are saved under.
    """

    def __init__(self, index='', CartoonName=None, dir_path=''):
        self.cookie = cjar.CookieJar()
        self.opener = ub.build_opener(ub.HTTPCookieProcessor(self.cookie))
        self.opener.addheaders = [("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Mobile Safari/537.36")]
        self.index = index  # site home page
        self.CartoonName = CartoonName
        self.dir_path = dir_path

    def get_cookie(self):
        """Visit the home page once so that the cookie jar gets populated."""
        try:
            with self.opener.open(self.index, timeout=10):
                pass
        except Exception as e:
            # Best effort: later requests are still attempted without cookies.
            print(str(e))

    def _fetch_text(self, url):
        """Return the page at ``url`` as text, or ``None`` if it is unavailable."""
        raw = self.get_data(url)
        # get_data yields bytes on success, 1 for a 404 and False on failure.
        if not isinstance(raw, bytes) or not raw:
            return None
        return re.sub('\xa9|\xbb', '',
                      raw.decode(encoding="utf-8", errors="replace"))

    def search_api(self):
        """Search for ``CartoonName`` and let the user pick one result.

        Returns:
            The href of the chosen comic, or ``""`` when the search page
            cannot be fetched or parsed, nothing was found, or the input is
            not a number.
        """
        if not isinstance(self.CartoonName, str):
            self.CartoonName = str(self.CartoonName)

        data = parse.urlencode({'keywords': self.CartoonName})
        url = self.index + "search/?" + data

        html = self._fetch_text(url)
        if html is None:
            # get_data has already printed the underlying error.
            return ""
        try:
            html_xpath = etree.HTML(html)
            cartoonList = html_xpath.xpath('//*[@id="update_list"]/div/div/div/a/@href')  # comic home pages
            update = html_xpath.xpath('//*[@id="update_list"]/div/div/div/p/span/text()')
            if not cartoonList:
                return ""
            for index, date in zip(cartoonList, update):
                print("更新日期:{},漫画链接:{}".format(date, index))
            index = int(input('请根据时间选择你要看的漫画？请输入阿拉伯数字进行选择。'))
        except Exception as e:
            print("error:{}".format(str(e)))
            return ""
        # Clamp the 1-based choice into the valid range.
        index = min(max(index, 1), len(cartoonList))
        return cartoonList[index - 1]

    # Fetch the chapter list
    def get_chapter(self, index):
        """Ask the user for a chapter range and queue those chapters.

        Args:
            index: URL of the comic's home page.

        Returns:
            A ``Queue`` of ``(title, href)`` tuples.  The queue is empty when
            the page cannot be fetched or its chapter list is unusable.
        """
        chapters = Queue()
        html = self._fetch_text(index)
        if html is None:
            return chapters
        html_xpath = etree.HTML(html)
        chapters_href = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/@href')
        chapters_title = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/span/text()')
        chapters_len = len(chapters_title)

        # Without chapters the range prompt below could never be satisfied,
        # and mismatched lists cannot be paired up reliably.
        if chapters_len == 0 or chapters_len != len(chapters_href):
            print("章节列表为空或章节标题与链接数量不一致,无法获取章节")
            return chapters

        # Negative slices stay correct when there are fewer than ten chapters.
        print("最近更新10章更新:\n   {}\n   {}".format(
            chapters_title[-10:-5], chapters_title[-5:]))

        print('因为其中包含特殊章节,并不是每个章节链接和每一话动漫都对应.\n'
              '请自行斟酌要爬去的章节范围.\n\n\n'
              '您搜素漫画一共{}章节,'
              .format(chapters_len))
        while True:
            try:
                start_page = int(input("请输入起始章节："))
                end_page = int(input("请输入结束章节："))
            except ValueError as e:
                print('您输入的章节数目格式存在错误请重新出入,Error:{}'.format(str(e)))
                continue
            if end_page > chapters_len:
                print("章节超出搜素范围,请重新输入")
            elif start_page > end_page:
                print('起始章节大于结束章节,请重新输入')
            elif start_page < 1:
                print("起始章节存在错误")
            else:
                break

        for i in range(start_page - 1, end_page):
            chapters.put((chapters_title[i], chapters_href[i]))
        return chapters

    # Download the queued chapters
    def get_oneChapter(self, chaptersQ):
        """Resolve image URLs for all queued chapters, then download them.

        Exits the process with status 1 when no chapter yielded any image.
        """
        global CHAPTERS_EXIT, IMAGESURL_EXIT
        ImagesUrlQ = Queue()

        CHAPTERS_EXIT = False
        cpts = []  # chapter-parsing threads
        for n in range(1, 11):
            cpt = chaptersThread(chaptersQ, ImagesUrlQ, "cps{}".format(n), self)
            cpt.start()
            cpts.append(cpt)
        # Workers leave their loop once the queue is drained, so joining them
        # is sufficient; no CPU-burning polling of the queue is needed.
        for cpt in cpts:
            cpt.join()
        CHAPTERS_EXIT = True

        print("章节获取完成,一共获取了{}章漫画".format(ImagesUrlQ.qsize()))
        if ImagesUrlQ.empty():
            print("ImagesUrlQ is empty ,漫画被下架!")
            sys.exit(1)

        IMAGESURL_EXIT = False
        Imuts = []  # image-download threads
        for n in range(1, 21):
            Imut = ImagesUrlThread(ImagesUrlQ, "IMUs{}".format(n), self)
            Imut.start()
            Imuts.append(Imut)
        for Imut in Imuts:
            Imut.join()
        IMAGESURL_EXIT = True
        print("全部获取完成")

    @get_body
    def get_data(self, *args, **kwargs):
        """Return the raw bytes at the URL given as first positional argument.

        Through ``get_body`` the call is retried and yields ``1`` on a 404 or
        ``False`` after repeated failures instead of raising.
        """
        # The context manager closes the response once it has been read.
        with self.opener.open(args[0], timeout=30) as response:
            return response.read()

# Stop flags read by the loops of the chapter, image and retry worker threads.
CHAPTERS_EXIT = False
IMAGESURL_EXIT=False
ERRORU_EXIT=False
# NOTE(review): error_num is not referenced anywhere in the visible code.
error_num=0
ErrorQ =Queue()# queue of pages whose download failed, consumed by the retry threads
def main():
    """Interactive entry point: search a comic, pick chapters, download them.

    Exits with status 1 when the output directory cannot be created or the
    comic cannot be found.
    """
    CartoonName = input("请输入你想搜素的漫画名:")
    # The shared-storage path only exists on Android; everywhere else
    # (Windows, desktop Linux, macOS) a directory relative to the working
    # directory is used.
    android_storage = "/storage/emulated/0/"
    if sys.platform.startswith('win') or not os.path.isdir(android_storage):
        dir_path = "manhua/" + CartoonName + "/"
    else:
        dir_path = android_storage + "manhua/" + CartoonName + "/"
    try:
        # makedirs creates the missing "manhua" parent on a first run and
        # accepts an already existing directory.
        os.makedirs(dir_path, exist_ok=True)
    except OSError as e:
        print("error:{}".format(str(e)))
        sys.exit(1)

    index = "http://m.gufengmh8.com/"
    spider = MSpider(index, CartoonName, dir_path)
    spider.get_cookie()
    index = spider.search_api()
    if not index:
        print("------------漫画不存在-----------")
        sys.exit(1)

    chapters = spider.get_chapter(index)
    spider.get_oneChapter(chapters)
    if not ErrorQ.empty():
        eThreads = []
        for tname in ("error1", "error2", "error3"):
            eThread = ErrorUrlThrad(ErrorQ, tname, spider)
            eThread.start()
            eThreads.append(eThread)
        # The retry workers stop on their own once ErrorQ is drained, so
        # joining them replaces the former busy-wait on the queue.
        for t in eThreads:
            t.join()


if __name__ == '__main__':
    main()

运行环境：Android 和 Windows，使用的是 Python 3.5，不兼容 Python 2.7。

hustlzp 发表于 2019-3-5 20:30

#coding:utf-8
import urllib.request as ub
import urllib.parse as parse
import http.cookiejar as cjar
import re
from lxml import etree
from queue import Queue
import threading
import os
import json
import sys

#获取数据的装饰器函数
def get_body(func):
    """Decorate a fetch function so that it is retried and never raises.

    The wrapped callable is attempted up to three times.

    The wrapper returns:
        * whatever ``func`` returns on the first successful attempt;
        * ``1`` as soon as an attempt fails with an error that mentions 404
          (the resource does not exist, so retrying is pointless);
        * ``False`` when all three attempts fail for any other reason.

    Note that the ``1`` sentinel is truthy, so callers have to test for it
    explicitly instead of relying on ``if result:``.
    """
    import functools  # local import: only this decorator needs it

    max_attempts = 3

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # The URL is normally the first argument after ``self``.  Resolve it
        # defensively so that building the log message can never raise.
        if len(args) > 1:
            url = args[1]
        elif args:
            url = args[0]
        else:
            url = kwargs.get("url", "")

        for _ in range(max_attempts):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                # Deliberately broad: this is the best-effort boundary that
                # converts any network failure into a sentinel value.
                print("error:{},url:{}".format(str(e), url))
                if getattr(e, "code", None) == 404 or "404" in str(e):
                    return 1
        return False

    return wrapper
#获取章节中图片地址的线程类
class chaptersThread(threading.Thread):
    """Worker thread that turns queued chapters into lists of image URLs.

    Every item taken from ``chaptersQ`` is a ``(title, href)`` tuple.  For
    each chapter page that can be fetched and parsed, a
    ``{title: [image_url, ...]}`` dict is put on ``ImagesUrlQ``.
    """

    # JavaScript variables embedded in a chapter page.
    _IMAGES_RE = re.compile(r"chapterImages = (\[.*?\])")
    _PATH_RE = re.compile(r'chapterPath = "(.*?)"')
    _IMAGE_HOST = "http://res.gufengmh8.com/"

    def __init__(self, chaptersQ, ImagesUrlQ, threadName, spider):
        super(chaptersThread, self).__init__()
        self.chaptersQ = chaptersQ
        self.threadName = threadName
        self.ImagesUrlQ = ImagesUrlQ
        self.spider = spider

    def run(self):
        """Drain ``chaptersQ`` until it is empty or the stop flag is set."""
        from queue import Empty  # local import: the file only imports Queue

        print("{}:线程正在工作".format(self.threadName))
        # CHAPTERS_EXIT is a module-level flag; it is only read here.
        while not CHAPTERS_EXIT:
            try:
                chapterTuple = self.chaptersQ.get(False)
            except Empty:
                break
            title = chapterTuple[0]
            # The href starts with "/" and spider.index ends with "/", so the
            # leading slash is dropped before concatenating.
            url = self.spider.index + chapterTuple[1][1:]
            html = self.spider.get_data(url)  # raw, undecoded page
            # get_data yields bytes on success, 1 for a 404 and False after
            # its retries are exhausted; only bytes can be decoded.
            if not isinstance(html, bytes) or not html:
                print("获取失败{}".format(url))
                continue
            # errors="replace" keeps one badly encoded page from killing
            # the whole worker thread.
            text = re.sub('\xa9|\xbb', '',
                          html.decode(encoding="utf-8", errors="replace"))
            imagesUrl = self.parseUrl(text)
            if imagesUrl:
                self.ImagesUrlQ.put({title: imagesUrl})
            else:
                print("获取失败{}".format(url))
        print("{}:{}完成工作".format(self.threadName, self.name))

    def parseUrl(self, html):
        """Extract the absolute image URLs from a decoded chapter page.

        Args:
            html: The chapter page as text.

        Returns:
            A list of image URLs, or an empty list when the page does not
            contain the ``chapterImages`` / ``chapterPath`` variables or the
            image array is not valid JSON.
        """
        images_match = self._IMAGES_RE.search(html)
        path_match = self._PATH_RE.search(html)
        if images_match is None or path_match is None:
            return []
        try:
            images = json.loads(images_match.group(1))
        except ValueError:
            return []
        base = self._IMAGE_HOST + path_match.group(1)
        return [base + image for image in images]
#获取每一章节中的图片的线程类
class ImagesUrlThread(threading.Thread):
    """Worker thread that downloads every image of the queued chapters.

    Items taken from ``ImagesUrlQ`` are ``{title: [image_url, ...]}`` dicts.
    Each image is saved as ``<spider.dir_path><title>/<page index>.jpg``.
    Pages that could not be downloaded are recorded on the module-level
    ``ErrorQ`` as ``{"title": ..., "page": ..., "url": ...}``.
    """

    def __init__(self, ImagesUrlQ, threadName, spider):
        super(ImagesUrlThread, self).__init__()
        self.ImagesUrlQ = ImagesUrlQ
        self.threadName = threadName
        self.spider = spider

    def run(self):
        """Drain ``ImagesUrlQ`` until it is empty or the stop flag is set."""
        from queue import Empty  # local import: the file only imports Queue

        print("{}:线程正在工作".format(self.threadName))
        # IMAGESURL_EXIT is a module-level flag; it is only read here.
        while not IMAGESURL_EXIT:
            try:
                images_chapter = self.ImagesUrlQ.get(False)
            except Empty:
                break
            for title, images in images_chapter.items():
                self._download_chapter(title, images)
        print("获取完成{}".format(self.threadName))

    def _download_chapter(self, title, images):
        """Download all pages of one chapter into its own directory."""
        chapter_dir = self.spider.dir_path + title
        try:
            # makedirs also creates missing parents and tolerates reruns.
            os.makedirs(chapter_dir, exist_ok=True)
        except OSError as e:
            print("error:{}".format(str(e)))
            return

        for i, url in enumerate(images):
            imagesIo = self.spider.get_data(url)
            if isinstance(imagesIo, bytes) and imagesIo:
                save_path = chapter_dir + "/" + str(i) + ".jpg"
                try:
                    with open(save_path, "wb") as file:
                        file.write(imagesIo)
                except OSError as e:
                    print("error:{}".format(str(e)))
            elif imagesIo == 1:
                # 404 sentinel from get_data: the page does not exist, so a
                # retry would be pointless.
                continue
            else:
                ErrorQ.put({"title": title, "page": str(i), "url": url})
                print("章节:{},第{}页,url:{},获取失败".format(title, str(i), url))
#用来从新获取获取失败的章节
class ErrorUrlThrad(threading.Thread):
    """Worker thread that retries pages whose first download failed.

    Items taken from ``ErrorQ`` are dicts with the keys ``"title"``,
    ``"page"`` and ``"url"``.  A page that is fetched successfully is saved
    as ``<spider.dir_path><title>/<page>.jpg``.
    """

    def __init__(self, ErrorQ, tName, spider):
        super(ErrorUrlThrad, self).__init__()
        self.ErrorQ = ErrorQ
        self.spider = spider
        self.threadName = tName

    def run(self):
        """Drain ``ErrorQ`` until it is empty or the stop flag is set."""
        from queue import Empty  # local import: the file only imports Queue

        print("{}:线程正在工作".format(self.threadName))
        # ERRORU_EXIT is a module-level flag; it is only read here.
        while not ERRORU_EXIT:
            try:
                error_dict = self.ErrorQ.get(False)
            except Empty:
                break

            title = error_dict["title"]
            page = error_dict["page"]
            url = error_dict["url"]

            imageIo = self.spider.get_data(url)
            # get_data may return the truthy 404 sentinel 1, so the payload
            # type is checked instead of its truthiness.
            if isinstance(imageIo, bytes) and imageIo:
                save_path = self.spider.dir_path + title + "/" + page + ".jpg"
                try:
                    with open(save_path, "wb") as f:
                        f.write(imageIo)
                except OSError as e:
                    # One unwritable file must not stop the remaining retries.
                    print("error:{}".format(str(e)))
            else:
                print("章节:{},第{}页,url:{},重新获取失败".format(title, page, url))
class MSpider(object):
    """Crawler for a single comic: search, chapter listing and download.

    Attributes are set by ``__init__``: ``index`` is the site root URL
    (expected to end with "/"), ``CartoonName`` the search keyword and
    ``dir_path`` the directory prefix (ending with "/") images are saved under.
    """

    def __init__(self, index='', CartoonName=None, dir_path=''):
        self.cookie = cjar.CookieJar()
        self.opener = ub.build_opener(ub.HTTPCookieProcessor(self.cookie))
        self.opener.addheaders = [("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Mobile Safari/537.36")]
        self.index = index  # site home page
        self.CartoonName = CartoonName
        self.dir_path = dir_path

    def get_cookie(self):
        """Visit the home page once so that the cookie jar gets populated."""
        try:
            with self.opener.open(self.index, timeout=10):
                pass
        except Exception as e:
            # Best effort: later requests are still attempted without cookies.
            print(str(e))

    def _fetch_text(self, url):
        """Return the page at ``url`` as text, or ``None`` if it is unavailable."""
        raw = self.get_data(url)
        # get_data yields bytes on success, 1 for a 404 and False on failure.
        if not isinstance(raw, bytes) or not raw:
            return None
        return re.sub('\xa9|\xbb', '',
                      raw.decode(encoding="utf-8", errors="replace"))

    def search_api(self):
        """Search for ``CartoonName`` and let the user pick one result.

        Returns:
            The href of the chosen comic, or ``""`` when the search page
            cannot be fetched or parsed, nothing was found, or the input is
            not a number.
        """
        if not isinstance(self.CartoonName, str):
            self.CartoonName = str(self.CartoonName)

        data = parse.urlencode({'keywords': self.CartoonName})
        url = self.index + "search/?" + data

        html = self._fetch_text(url)
        if html is None:
            # get_data has already printed the underlying error.
            return ""
        try:
            html_xpath = etree.HTML(html)
            cartoonList = html_xpath.xpath('//*[@id="update_list"]/div/div/div/a/@href')  # comic home pages
            update = html_xpath.xpath('//*[@id="update_list"]/div/div/div/p/span/text()')
            if not cartoonList:
                return ""
            for index, date in zip(cartoonList, update):
                print("更新日期:{},漫画链接:{}".format(date, index))
            index = int(input('请根据时间选择你要看的漫画？请输入阿拉伯数字进行选择。'))
        except Exception as e:
            print("error:{}".format(str(e)))
            return ""
        # Clamp the 1-based choice into the valid range.
        index = min(max(index, 1), len(cartoonList))
        return cartoonList[index - 1]

    # Fetch the chapter list
    def get_chapter(self, index):
        """Ask the user for a chapter range and queue those chapters.

        Args:
            index: URL of the comic's home page.

        Returns:
            A ``Queue`` of ``(title, href)`` tuples.  The queue is empty when
            the page cannot be fetched or its chapter list is unusable.
        """
        chapters = Queue()
        html = self._fetch_text(index)
        if html is None:
            return chapters
        html_xpath = etree.HTML(html)
        chapters_href = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/@href')
        chapters_title = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/span/text()')
        chapters_len = len(chapters_title)

        # Without chapters the range prompt below could never be satisfied,
        # and mismatched lists cannot be paired up reliably.
        if chapters_len == 0 or chapters_len != len(chapters_href):
            print("章节列表为空或章节标题与链接数量不一致,无法获取章节")
            return chapters

        # Negative slices stay correct when there are fewer than ten chapters.
        print("最近更新10章更新:\n   {}\n   {}".format(
            chapters_title[-10:-5], chapters_title[-5:]))

        print('因为其中包含特殊章节,并不是每个章节链接和每一话动漫都对应.\n'
              '请自行斟酌要爬去的章节范围.\n\n\n'
              '您搜素漫画一共{}章节,'
              .format(chapters_len))
        while True:
            try:
                start_page = int(input("请输入起始章节："))
                end_page = int(input("请输入结束章节："))
            except ValueError as e:
                print('您输入的章节数目格式存在错误请重新出入,Error:{}'.format(str(e)))
                continue
            if end_page > chapters_len:
                print("章节超出搜素范围,请重新输入")
            elif start_page > end_page:
                print('起始章节大于结束章节,请重新输入')
            elif start_page < 1:
                print("起始章节存在错误")
            else:
                break

        for i in range(start_page - 1, end_page):
            chapters.put((chapters_title[i], chapters_href[i]))
        return chapters

    # Download the queued chapters
    def get_oneChapter(self, chaptersQ):
        """Resolve image URLs for all queued chapters, then download them.

        Exits the process with status 1 when no chapter yielded any image.
        """
        global CHAPTERS_EXIT, IMAGESURL_EXIT
        ImagesUrlQ = Queue()

        CHAPTERS_EXIT = False
        cpts = []  # chapter-parsing threads
        for n in range(1, 11):
            cpt = chaptersThread(chaptersQ, ImagesUrlQ, "cps{}".format(n), self)
            cpt.start()
            cpts.append(cpt)
        # Workers leave their loop once the queue is drained, so joining them
        # is sufficient; no CPU-burning polling of the queue is needed.
        for cpt in cpts:
            cpt.join()
        CHAPTERS_EXIT = True

        print("章节获取完成,一共获取了{}章漫画".format(ImagesUrlQ.qsize()))
        if ImagesUrlQ.empty():
            print("ImagesUrlQ is empty ,漫画被下架!")
            sys.exit(1)

        IMAGESURL_EXIT = False
        Imuts = []  # image-download threads
        for n in range(1, 21):
            Imut = ImagesUrlThread(ImagesUrlQ, "IMUs{}".format(n), self)
            Imut.start()
            Imuts.append(Imut)
        for Imut in Imuts:
            Imut.join()
        IMAGESURL_EXIT = True
        print("全部获取完成")

    @get_body
    def get_data(self, *args, **kwargs):
        """Return the raw bytes at the URL given as first positional argument.

        Through ``get_body`` the call is retried and yields ``1`` on a 404 or
        ``False`` after repeated failures instead of raising.
        """
        # The context manager closes the response once it has been read.
        with self.opener.open(args[0], timeout=30) as response:
            return response.read()

# Stop flags read by the loops of the chapter, image and retry worker threads.
CHAPTERS_EXIT = False
IMAGESURL_EXIT=False
ERRORU_EXIT=False
# NOTE(review): error_num is not referenced anywhere in the visible code.
error_num=0
ErrorQ =Queue()# queue of pages whose download failed, consumed by the retry threads
def main():
    """Interactive entry point: search a comic, pick chapters, download them.

    Exits with status 1 when the output directory cannot be created or the
    comic cannot be found.
    """
    CartoonName = input("请输入你想搜素的漫画名:")
    # The shared-storage path only exists on Android; everywhere else
    # (Windows, desktop Linux, macOS) a directory relative to the working
    # directory is used.
    android_storage = "/storage/emulated/0/"
    if sys.platform.startswith('win') or not os.path.isdir(android_storage):
        dir_path = "manhua/" + CartoonName + "/"
    else:
        dir_path = android_storage + "manhua/" + CartoonName + "/"
    try:
        # makedirs creates the missing "manhua" parent on a first run and
        # accepts an already existing directory.
        os.makedirs(dir_path, exist_ok=True)
    except OSError as e:
        print("error:{}".format(str(e)))
        sys.exit(1)

    index = "http://m.gufengmh8.com/"
    spider = MSpider(index, CartoonName, dir_path)
    spider.get_cookie()
    index = spider.search_api()
    if not index:
        print("------------漫画不存在-----------")
        sys.exit(1)

    chapters = spider.get_chapter(index)
    spider.get_oneChapter(chapters)
    if not ErrorQ.empty():
        eThreads = []
        for tname in ("error1", "error2", "error3"):
            eThread = ErrorUrlThrad(ErrorQ, tname, spider)
            eThread.start()
            eThreads.append(eThread)
        # The retry workers stop on their own once ErrorQ is drained, so
        # joining them replaces the former busy-wait on the queue.
        for t in eThreads:
            t.join()


if __name__ == '__main__':
    main()

.Net_破解 发表于 2019-3-5 21:39

hustlzp 发表于 2019-3-5 20:30
#coding:utf-8
import urllib.request as ub
import urllib.parse as parse

谢谢大佬，我不会用这个功能。

python3 发表于 2019-3-7 22:15

写的不错哦{:1_918:}{:1_918:}{:1_918:}

鸿鹄小白 发表于 2019-3-21 16:34

真小白前来学习

莫問道 发表于 2019-5-1 15:27

有成品吗。大佬

.Net_破解 发表于 2019-5-2 07:36

莫問道发表于 2019-5-1 15:27
有成品吗。大佬

没有图形界面，需要自己下载 Python 3 和 lxml 库，然后运行脚本。

页: [1]

吾爱破解 - 52pojie.cn's Archiver

小白的多线程爬虫。以某漫画为例