吾爱破解 - 52pojie.cn


Views: 5228 | Replies: 6

[Python Repost] A beginner's multithreaded crawler, with a comic site as the example

.Net_破解 posted on 2019-3-5 19:52
#coding:utf-8
import urllib.request as ub
import urllib.parse as parse
import http.cookiejar as cjar
import re
from lxml import etree
from queue import Queue
import threading
import os
import json
import sys

# Decorator: retry a fetch up to three times; returns 1 on a 404 and False when every attempt fails
def get_body(func):
    def wrapper(*args, **kwargs):
        for _ in range(3):
            try:
                html = func(*args, **kwargs)
            except Exception as e:
                if '404' in str(e):
                    # a 404 will not recover on retry, so signal it with the sentinel 1
                    print("error:{},url:{}".format(str(e), args[1]))
                    return 1
                print("error:{},url:{}".format(str(e), args[1]))
                continue
            else:
                return html
        return False  # all three attempts raised
    return wrapper
# Worker thread: fetches each chapter page and extracts the image URLs in it
class chaptersThread(threading.Thread):
    def __init__(self,chaptersQ,ImagesUrlQ,threadName,spider):
        super(chaptersThread,self).__init__()
        self.chaptersQ = chaptersQ
        self.threadName = threadName
        self.ImagesUrlQ = ImagesUrlQ
        self.spider = spider
    def run(self):
        print("{}: thread started".format(self.threadName))
        global CHAPTERS_EXIT
        while not CHAPTERS_EXIT:
            try:
                chapterTuple = self.chaptersQ.get(False)  # non-blocking; raises once the queue is empty
            except Exception:
                break
            title = chapterTuple[0]
            url = chapterTuple[1]
            url = self.spider.index + url[1:]  # href starts with '/', drop it before joining
            html = self.spider.get_data(url)  # raw html bytes, or 1/False on failure
            if html:
                html = re.sub('\xa9|\xbb', '', html.decode(encoding="utf-8"))  # strip © and » characters
                imagesUrl = self.parseUrl(html)
                self.ImagesUrlQ.put({title: imagesUrl})
            else:
                print("fetch failed: {}".format(url))
        print("{}:{} finished".format(self.threadName, self.name))
    def parseUrl(self, html):
        # the chapter page embeds two JS variables: chapterImages (a JSON list
        # of file names) and chapterPath (the directory on the image server)
        imagesUrl = []
        images_pattern = re.compile(r"chapterImages = (\[.*?\])")
        path_pattern = re.compile(r'chapterPath = "(.*?)"')
        images = json.loads(images_pattern.findall(html)[0])
        im_path = path_pattern.findall(html)[0]
        im_url = "http://res.gufengmh8.com/"
        for image in images:
            imagesUrl.append(im_url + im_path + image)
        return imagesUrl
# Worker thread: downloads every image in a chapter
class ImagesUrlThread(threading.Thread):
    def __init__(self,ImagesUrlQ,threadName,spider):
        super(ImagesUrlThread,self).__init__()
        self.ImagesUrlQ = ImagesUrlQ
        self.threadName = threadName
        self.spider = spider
    def run(self):
        print("{}: thread started".format(self.threadName))
        global IMAGESURL_EXIT
        while not IMAGESURL_EXIT:
            try:
                images_chapter = self.ImagesUrlQ.get(False)
            except Exception:
                break
            title = list(images_chapter.keys())[0]
            images = images_chapter[title]  # list of image URLs for this chapter
            try:
                os.mkdir(os.path.join(self.spider.dir_path, title))
            except Exception:
                pass  # directory already exists
            for i in range(len(images)):
                url = images[i]  # original had `url = images`, which passed the whole list to get_data
                imagesIo = self.spider.get_data(url)
                if imagesIo == 1:
                    pass  # 404: skip this page (must be checked before the truthiness test, since 1 is truthy)
                elif imagesIo:
                    save_path = self.spider.dir_path + title + "/" + str(i) + ".jpg"
                    with open(save_path, "bw") as file:
                        try:
                            file.write(imagesIo)
                        except Exception:
                            pass
                else:
                    ErrorQ.put({"title": title, "page": str(i), "url": url})
                    print("chapter: {}, page {}, url: {}, fetch failed".format(title, str(i), url))
        print("{} done".format(self.threadName))
# Worker thread: retries URLs that failed the first time around
class ErrorUrlThrad(threading.Thread):
    def __init__(self,ErrorQ,tName,spider):
        super(ErrorUrlThrad,self).__init__()
        self.ErrorQ = ErrorQ
        self.spider = spider
        self.threadName = tName
    def run(self):
        print("{}: thread started".format(self.threadName))
        global ERRORU_EXIT
        while not ERRORU_EXIT:
            try:
                error_dict = self.ErrorQ.get(False)
            except Exception:
                break

            title = error_dict["title"]
            page = error_dict["page"]
            url = error_dict["url"]

            imageIo = self.spider.get_data(url)
            if imageIo and imageIo != 1:  # also guard against the 404 sentinel
                with open(os.path.join(self.spider.dir_path, title, page + ".jpg"), "wb") as f:
                    f.write(imageIo)
            else:
                print("chapter: {}, page {}, url: {}, retry failed".format(title, page, url))

class MSpider(object):
    def __init__(self,index='',CartoonName=None,dir_path=''):
        self.cookie = cjar.CookieJar()
        self.opener = ub.build_opener(ub.HTTPCookieProcessor(self.cookie))
        self.opener.addheaders =[("User-Agent","Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Mobile Safari/537.36")]
        self.index = index  # comic site home page
        self.CartoonName = CartoonName
        self.dir_path = dir_path
    def get_cookie(self):
        try:
            self.opener.open(self.index,timeout=10)
        except Exception as e:
            print(str(e))
    def search_api(self):
        if not isinstance(self.CartoonName, str):
            self.CartoonName = str(self.CartoonName)

        data_dict = {
            'keywords': self.CartoonName
        }
        data = parse.urlencode(data_dict)
        url = self.index + "search/?" + data

        html = self.get_data(url)
        html = re.sub('\xa9|\xbb', '', html.decode(encoding="utf-8"))
        html_xpath = etree.HTML(html)
        try:
            cartoonList = html_xpath.xpath('//*[@id="update_list"]/div/div/div[2]/a/@href')  # link to each comic's index page
            update = html_xpath.xpath('//*[@id="update_list"]/div/div/div[2]/p[3]/span[2]/text()')
            for index, date in zip(cartoonList, update):
                print("updated: {}, comic link: {}".format(date, index))
            index = int(input('Pick the comic you want by its update date (enter a number): '))
            if index <= 0:
                index = 1
        except Exception as e:
            print("error:{}".format(str(e)))
            return ""
        return cartoonList[index - 1]
    # Fetch the chapter list and let the user pick a range
    def get_chapter(self, index):
        html = self.get_data(index)
        html = re.sub('\xa9|\xbb', '', html.decode(encoding="utf-8"))  # strip special characters
        html_xpath = etree.HTML(html)
        chapters_href = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/@href')
        chapters_title = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/span/text()')
        chapters_len = len(chapters_title)

        print("""The 10 most recently updated chapters:
        {}
        {}""".format(chapters_title[chapters_len-10:chapters_len-5], chapters_title[chapters_len-5:chapters_len]))

        print('The list includes special chapters, so chapter links do not map\n'
              'one-to-one onto episodes. Choose the range to crawl accordingly.\n\n\n'
              'The comic you searched for has {} chapters in total.'
              .format(chapters_len))
        while True:
            try:
                start_page = int(input("Start chapter: "))
                end_page = int(input("End chapter: "))
                if end_page > chapters_len:
                    print("Chapter out of range, please re-enter")
                    continue
                elif start_page > end_page:
                    print('Start chapter is greater than end chapter, please re-enter')
                    continue
                elif start_page < 1:
                    print("Invalid start chapter")
                    continue
                break
            except Exception as e:
                print('Invalid chapter number, please re-enter. Error:{}'.format(str(e)))

        chapters = Queue()  # created unconditionally so the return below can't hit an undefined name
        if chapters_len == len(chapters_href):
            for i in range(start_page-1, end_page):
                # original put the whole lists into every tuple; each item should be one chapter
                chapters.put((chapters_title[i], chapters_href[i]))
        return chapters

    # Drive the chapter workers, then the image workers
    def get_oneChapter(self, chaptersQ):
        ImagesUrlQ = Queue()
        tNames = ["cps1","cps2","cps3","cps4","cps5","cps6","cps7","cps8","cps9","cps10"]
        cpts = []  # chapter-crawling threads

        for tName in tNames:
            cpt = chaptersThread(chaptersQ, ImagesUrlQ, tName, self)  # create the thread
            cpt.start()
            cpts.append(cpt)
        while not chaptersQ.empty():  # busy-wait until the chapter queue drains
            pass

        global CHAPTERS_EXIT
        CHAPTERS_EXIT = True

        for cpt in cpts:
            cpt.join()
        print("Chapters fetched: {} in total".format(ImagesUrlQ.qsize()))
        if ImagesUrlQ.empty():
            print("ImagesUrlQ is empty, the comic has been taken down!")
            exit(1)
        Imuts = []  # image-downloading threads
        t2Names = ["IMUs1","IMUs2","IMUs3","IMUs4","IMUs5","IMUs6","IMUs7","IMUs8","IMUs9","IMUs10",
                   "IMUs11", "IMUs12", "IMUs13", "IMUs14", "IMUs15", "IMUs16", "IMUs17", "IMUs18", "IMUs19", "IMUs20"]
        for tName in t2Names:
            Imut = ImagesUrlThread(ImagesUrlQ, tName, self)  # create the thread
            Imut.start()
            Imuts.append(Imut)
        while not ImagesUrlQ.empty():  # busy-wait until the image queue drains
            pass
        global IMAGESURL_EXIT
        IMAGESURL_EXIT = True
        for Imut in Imuts:
            Imut.join()
        print("All downloads finished")
    @get_body
    def get_data(self,*args,**kwargs):
        return self.opener.open(args[0], timeout=30).read()  # args[0] is the url

CHAPTERS_EXIT = False
IMAGESURL_EXIT = False
ERRORU_EXIT = False
ErrorQ = Queue()  # queue of URLs that failed and need a retry
def main():
    CartoonName = input("Enter the name of the comic to search for: ")
    if sys.platform.startswith('win'):
        dir_path = "manhua/" + CartoonName + "/"
    else:
        dir_path = "/storage/emulated/0/manhua/" + CartoonName + "/"  # Android external storage
    try:
        os.makedirs(dir_path, exist_ok=True)  # also creates the parent "manhua" directory
    except Exception:
        pass
    index = "http://m.gufengmh8.com/"
    spider = MSpider(index, CartoonName, dir_path)
    spider.get_cookie()
    index = spider.search_api()
    if index:
        chapters = spider.get_chapter(index)
        spider.get_oneChapter(chapters)
        if not ErrorQ.empty():
            errorTnames = ["error1", "error2", "error3"]
            eThreads = []
            for tname in errorTnames:
                eThread = ErrorUrlThrad(ErrorQ, tname, spider)
                eThread.start()
                eThreads.append(eThread)
            while not ErrorQ.empty():  # busy-wait for the retry queue to drain
                pass
            # wait for the retry threads to finish
            for t in eThreads:
                t.join()
    else:
        print("------------ comic not found -----------")
        exit(1)
if __name__ == '__main__':
    main()
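
To make parseUrl concrete: each chapter page embeds the image list in two JavaScript variables, and the two regexes pull them out. A minimal standalone sketch, using a mock HTML fragment (the fragment is illustrative, inferred from the regexes, not captured from the site):

import json
import re

# Hypothetical chapter-page fragment, shaped the way parseUrl expects
sample_html = '''<script>
var chapterImages = ["0001.jpg", "0002.jpg"];
var chapterPath = "images/comic/1/2/";
</script>'''

images = json.loads(re.search(r"chapterImages = (\[.*?\])", sample_html).group(1))
im_path = re.search(r'chapterPath = "(.*?)"', sample_html).group(1)
for image in images:
    print("http://res.gufengmh8.com/" + im_path + image)
# -> http://res.gufengmh8.com/images/comic/1/2/0001.jpg, and so on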


Runtime environment: Android and Windows. Written for Python 3.5; not compatible with Python 2.7.
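
A note on the design: the script drains each queue with a busy-wait loop plus a global exit flag. A common standard-library alternative is Queue.task_done()/Queue.join() with a sentinel value to stop the workers; a minimal sketch of that pattern (not a drop-in patch for the script above):

import threading
from queue import Queue

def worker(q):
    while True:
        item = q.get()            # blocks until an item arrives
        if item is None:          # sentinel tells the worker to exit
            q.task_done()
            break
        print("processing", item)
        q.task_done()             # lets q.join() know this item is finished

q = Queue()
threads = [threading.Thread(target=worker, args=(q,)) for _ in range(4)]
for t in threads:
    t.start()
for item in range(10):
    q.put(item)
q.join()                          # returns once every queued item is marked done
for _ in threads:
    q.put(None)                   # one sentinel per worker
for t in threads:
    t.join()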

Ratings

3 participants · coins +2 · heart +3
世态炎凉S冷暖自 (coin +1, heart +1): Can a site like http://x42036.banatoon.net be crawled too? Please make a tutorial, expert.
zhoulinzhi (coin +1, heart +1): Encouraging reposts of good security tools and documentation!
hustlzp (heart +1): Agreed!


hustlzp posted on 2019-3-5 20:30
(Reposts the OP's script inside a formatted code box; identical to the listing above.)

OP | .Net_破解 posted on 2019-3-5 21:39

(quoting hustlzp's reply above)

Thanks! I didn't know how to use that feature.
python3 posted on 2019-3-7 22:15
鸿鹄小白 posted on 2019-3-21 16:34
A genuine beginner here to learn.
莫問道 posted on 2019-5-1 15:27
Is there a ready-made release, boss?
OP | .Net_破解 posted on 2019-5-2 07:36

There is no GUI; you need to install Python 3 and the lxml library yourself, then run the script.
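
For reference, on a desktop that boils down to roughly the following, assuming the script is saved as spider.py (the filename is just an example):

pip install lxml
python3 spider.py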