吾爱破解 - 52pojie.cn


Views: 5228 | Replies: 6

[Python Repost] A beginner's multithreaded crawler, with a comic site as the example

.Net_破解 posted on 2019-3-5 19:52
#coding:utf-8
import urllib.request as ub
import urllib.parse as parse
import http.cookiejar as cjar
import re
from lxml import etree
from queue import Queue
import threading
import os
import json
import sys

# Decorator: retry a fetch up to three times; returns 1 on a 404 and False when every attempt fails
def get_body(func):
    def wrapper(*args, **kwargs):
        for _ in range(3):
            try:
                html = func(*args, **kwargs)
            except Exception as e:
                if '404' in str(e):
                    # a 404 will not recover on retry, so signal it with the sentinel 1
                    print("error:{},url:{}".format(str(e), args[1]))
                    return 1
                print("error:{},url:{}".format(str(e), args[1]))
                continue
            else:
                return html
        return False  # all three attempts raised
    return wrapper
# Worker thread: fetches each chapter page and extracts the image URLs in it
class chaptersThread(threading.Thread):
    def __init__(self,chaptersQ,ImagesUrlQ,threadName,spider):
        super(chaptersThread,self).__init__()
        self.chaptersQ = chaptersQ
        self.threadName = threadName
        self.ImagesUrlQ = ImagesUrlQ
        self.spider = spider
    def run(self):
        print("{}: thread started".format(self.threadName))
        global CHAPTERS_EXIT
        while not CHAPTERS_EXIT:
            try:
                chapterTuple = self.chaptersQ.get(False)  # non-blocking; raises once the queue is empty
            except Exception:
                break
            title = chapterTuple[0]
            url = chapterTuple[1]
            url = self.spider.index + url[1:]  # href starts with '/', drop it before joining
            html = self.spider.get_data(url)  # raw html bytes, or 1/False on failure
            if html:
                html = re.sub('\xa9|\xbb', '', html.decode(encoding="utf-8"))  # strip © and » characters
                imagesUrl = self.parseUrl(html)
                self.ImagesUrlQ.put({title: imagesUrl})
            else:
                print("fetch failed: {}".format(url))
        print("{}:{} finished".format(self.threadName, self.name))
    def parseUrl(self, html):
        # the chapter page embeds two JS variables: chapterImages (a JSON list
        # of file names) and chapterPath (the directory on the image server)
        imagesUrl = []
        images_pattern = re.compile(r"chapterImages = (\[.*?\])")
        path_pattern = re.compile(r'chapterPath = "(.*?)"')
        images = json.loads(images_pattern.findall(html)[0])
        im_path = path_pattern.findall(html)[0]
        im_url = "http://res.gufengmh8.com/"
        for image in images:
            imagesUrl.append(im_url + im_path + image)
        return imagesUrl
# Worker thread: downloads every image in a chapter
class ImagesUrlThread(threading.Thread):
    def __init__(self,ImagesUrlQ,threadName,spider):
        super(ImagesUrlThread,self).__init__()
        self.ImagesUrlQ = ImagesUrlQ
        self.threadName = threadName
        self.spider = spider
    def run(self):
        print("{}: thread started".format(self.threadName))
        global IMAGESURL_EXIT
        while not IMAGESURL_EXIT:
            try:
                images_chapter = self.ImagesUrlQ.get(False)
            except Exception:
                break
            title = list(images_chapter.keys())[0]
            images = images_chapter[title]  # list of image URLs for this chapter
            try:
                os.mkdir(os.path.join(self.spider.dir_path, title))
            except Exception:
                pass  # directory already exists
            for i in range(len(images)):
                url = images[i]  # original had `url = images`, which passed the whole list to get_data
                imagesIo = self.spider.get_data(url)
                if imagesIo == 1:
                    pass  # 404: skip this page (must be checked before the truthiness test, since 1 is truthy)
                elif imagesIo:
                    save_path = self.spider.dir_path + title + "/" + str(i) + ".jpg"
                    with open(save_path, "bw") as file:
                        try:
                            file.write(imagesIo)
                        except Exception:
                            pass
                else:
                    ErrorQ.put({"title": title, "page": str(i), "url": url})
                    print("chapter: {}, page {}, url: {}, fetch failed".format(title, str(i), url))
        print("{} done".format(self.threadName))
# Worker thread: retries URLs that failed the first time around
class ErrorUrlThrad(threading.Thread):
    def __init__(self,ErrorQ,tName,spider):
        super(ErrorUrlThrad,self).__init__()
        self.ErrorQ = ErrorQ
        self.spider = spider
        self.threadName = tName
    def run(self):
        print("{}: thread started".format(self.threadName))
        global ERRORU_EXIT
        while not ERRORU_EXIT:
            try:
                error_dict = self.ErrorQ.get(False)
            except Exception:
                break

            title = error_dict["title"]
            page = error_dict["page"]
            url = error_dict["url"]

            imageIo = self.spider.get_data(url)
            if imageIo and imageIo != 1:  # also guard against the 404 sentinel
                with open(os.path.join(self.spider.dir_path, title, page + ".jpg"), "wb") as f:
                    f.write(imageIo)
            else:
                print("chapter: {}, page {}, url: {}, retry failed".format(title, page, url))

class MSpider(object):
    def __init__(self,index='',CartoonName=None,dir_path=''):
        self.cookie = cjar.CookieJar()
        self.opener = ub.build_opener(ub.HTTPCookieProcessor(self.cookie))
        self.opener.addheaders =[("User-Agent","Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Mobile Safari/537.36")]
        self.index = index  # comic site home page
        self.CartoonName = CartoonName
        self.dir_path = dir_path
    def get_cookie(self):
        try:
            self.opener.open(self.index,timeout=10)
        except Exception as e:
            print(str(e))
    def search_api(self):
        if not isinstance(self.CartoonName, str):
            self.CartoonName = str(self.CartoonName)

        data_dict = {
            'keywords': self.CartoonName
        }
        data = parse.urlencode(data_dict)
        url = self.index + "search/?" + data

        html = self.get_data(url)
        html = re.sub('\xa9|\xbb', '', html.decode(encoding="utf-8"))
        html_xpath = etree.HTML(html)
        try:
            cartoonList = html_xpath.xpath('//*[@id="update_list"]/div/div/div[2]/a/@href')  # link to each comic's index page
            update = html_xpath.xpath('//*[@id="update_list"]/div/div/div[2]/p[3]/span[2]/text()')
            for index, date in zip(cartoonList, update):
                print("updated: {}, comic link: {}".format(date, index))
            index = int(input('Pick the comic you want by its update date (enter a number): '))
            if index <= 0:
                index = 1
        except Exception as e:
            print("error:{}".format(str(e)))
            return ""
        return cartoonList[index - 1]
    # Fetch the chapter list and let the user pick a range
    def get_chapter(self, index):
        html = self.get_data(index)
        html = re.sub('\xa9|\xbb', '', html.decode(encoding="utf-8"))  # strip special characters
        html_xpath = etree.HTML(html)
        chapters_href = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/@href')
        chapters_title = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/span/text()')
        chapters_len = len(chapters_title)

        print("""The 10 most recently updated chapters:
        {}
        {}""".format(chapters_title[chapters_len-10:chapters_len-5], chapters_title[chapters_len-5:chapters_len]))

        print('The list includes special chapters, so chapter links do not map\n'
              'one-to-one onto episodes. Choose the range to crawl accordingly.\n\n\n'
              'The comic you searched for has {} chapters in total.'
              .format(chapters_len))
        while True:
            try:
                start_page = int(input("Start chapter: "))
                end_page = int(input("End chapter: "))
                if end_page > chapters_len:
                    print("Chapter out of range, please re-enter")
                    continue
                elif start_page > end_page:
                    print('Start chapter is greater than end chapter, please re-enter')
                    continue
                elif start_page < 1:
                    print("Invalid start chapter")
                    continue
                break
            except Exception as e:
                print('Invalid chapter number, please re-enter. Error:{}'.format(str(e)))

        chapters = Queue()  # created unconditionally so the return below can't hit an undefined name
        if chapters_len == len(chapters_href):
            for i in range(start_page-1, end_page):
                # original put the whole lists into every tuple; each item should be one chapter
                chapters.put((chapters_title[i], chapters_href[i]))
        return chapters

    # Drive the chapter workers, then the image workers
    def get_oneChapter(self, chaptersQ):
        ImagesUrlQ = Queue()
        tNames = ["cps1","cps2","cps3","cps4","cps5","cps6","cps7","cps8","cps9","cps10"]
        cpts = []  # chapter-crawling threads

        for tName in tNames:
            cpt = chaptersThread(chaptersQ, ImagesUrlQ, tName, self)  # create the thread
            cpt.start()
            cpts.append(cpt)
        while not chaptersQ.empty():  # busy-wait until the chapter queue drains
            pass

        global CHAPTERS_EXIT
        CHAPTERS_EXIT = True

        for cpt in cpts:
            cpt.join()
        print("Chapters fetched: {} in total".format(ImagesUrlQ.qsize()))
        if ImagesUrlQ.empty():
            print("ImagesUrlQ is empty, the comic has been taken down!")
            exit(1)
        Imuts = []  # image-downloading threads
        t2Names = ["IMUs1","IMUs2","IMUs3","IMUs4","IMUs5","IMUs6","IMUs7","IMUs8","IMUs9","IMUs10",
                   "IMUs11", "IMUs12", "IMUs13", "IMUs14", "IMUs15", "IMUs16", "IMUs17", "IMUs18", "IMUs19", "IMUs20"]
        for tName in t2Names:
            Imut = ImagesUrlThread(ImagesUrlQ, tName, self)  # create the thread
            Imut.start()
            Imuts.append(Imut)
        while not ImagesUrlQ.empty():  # busy-wait until the image queue drains
            pass
        global IMAGESURL_EXIT
        IMAGESURL_EXIT = True
        for Imut in Imuts:
            Imut.join()
        print("All downloads finished")
    @get_body
    def get_data(self,*args,**kwargs):
        return self.opener.open(args[0], timeout=30).read()  # args[0] is the url

CHAPTERS_EXIT = False
IMAGESURL_EXIT = False
ERRORU_EXIT = False
ErrorQ = Queue()  # queue of URLs that failed and need a retry
def main():
    CartoonName = input("Enter the name of the comic to search for: ")
    if sys.platform.startswith('win'):
        dir_path = "manhua/" + CartoonName + "/"
    else:
        dir_path = "/storage/emulated/0/manhua/" + CartoonName + "/"  # Android external storage
    try:
        os.makedirs(dir_path, exist_ok=True)  # also creates the parent "manhua" directory
    except Exception:
        pass
    index = "http://m.gufengmh8.com/"
    spider = MSpider(index, CartoonName, dir_path)
    spider.get_cookie()
    index = spider.search_api()
    if index:
        chapters = spider.get_chapter(index)
        spider.get_oneChapter(chapters)
        if not ErrorQ.empty():
            errorTnames = ["error1", "error2", "error3"]
            eThreads = []
            for tname in errorTnames:
                eThread = ErrorUrlThrad(ErrorQ, tname, spider)
                eThread.start()
                eThreads.append(eThread)
            while not ErrorQ.empty():  # busy-wait for the retry queue to drain
                pass
            # wait for the retry threads to finish
            for t in eThreads:
                t.join()
    else:
        print("------------ comic not found -----------")
        exit(1)
if __name__ == '__main__':
    main()
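
To make parseUrl concrete: each chapter page embeds the image list in two JavaScript variables, and the two regexes pull them out. A minimal standalone sketch, using a mock HTML fragment (the fragment is illustrative, inferred from the regexes, not captured from the site):

import json
import re

# Hypothetical chapter-page fragment, shaped the way parseUrl expects
sample_html = '''<script>
var chapterImages = ["0001.jpg", "0002.jpg"];
var chapterPath = "images/comic/1/2/";
</script>'''

images = json.loads(re.search(r"chapterImages = (\[.*?\])", sample_html).group(1))
im_path = re.search(r'chapterPath = "(.*?)"', sample_html).group(1)
for image in images:
    print("http://res.gufengmh8.com/" + im_path + image)
# -> http://res.gufengmh8.com/images/comic/1/2/0001.jpg, and so on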


Runtime environment: Android and Windows. Written for Python 3.5; not compatible with Python 2.7.
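
A note on the design: the script drains each queue with a busy-wait loop plus a global exit flag. A common standard-library alternative is Queue.task_done()/Queue.join() with a sentinel value to stop the workers; a minimal sketch of that pattern (not a drop-in patch for the script above):

import threading
from queue import Queue

def worker(q):
    while True:
        item = q.get()            # blocks until an item arrives
        if item is None:          # sentinel tells the worker to exit
            q.task_done()
            break
        print("processing", item)
        q.task_done()             # lets q.join() know this item is finished

q = Queue()
threads = [threading.Thread(target=worker, args=(q,)) for _ in range(4)]
for t in threads:
    t.start()
for item in range(10):
    q.put(item)
q.join()                          # returns once every queued item is marked done
for _ in threads:
    q.put(None)                   # one sentinel per worker
for t in threads:
    t.join()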

Ratings

3 participants · coins +2 · heart +3
世态炎凉S冷暖自 (coin +1, heart +1): Can a site like http://x42036.banatoon.net be crawled too? Please make a tutorial, expert.
zhoulinzhi (coin +1, heart +1): Encouraging reposts of good security tools and documentation!
hustlzp (heart +1): Agreed!


hustlzp posted on 2019-3-5 20:30
(Reposts the OP's script inside a formatted code box; identical to the listing above.)

OP | .Net_破解 posted on 2019-3-5 21:39

(quoting hustlzp's reply above)

Thanks! I didn't know how to use that feature.
python3 posted on 2019-3-7 22:15
鸿鹄小白 posted on 2019-3-21 16:34
A genuine beginner here to learn.
莫問道 posted on 2019-5-1 15:27
Is there a ready-made release, boss?
OP | .Net_破解 posted on 2019-5-2 07:36

There is no GUI; you need to install Python 3 and the lxml library yourself, then run the script.
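
For reference, on a desktop that boils down to roughly the following, assuming the script is saved as spider.py (the filename is just an example):

pip install lxml
python3 spider.py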