爬取qd免费小说

nstar1221 发表于 2020-2-27 19:25

本帖最后由 nstar1221 于 2020-2-27 19:29 编辑

爬取小说网站的免费小说。
分析过程就不写了，毕竟网络写手也挺苦逼的，尊重一下版权。

源码：
import re
import time
from threading import Thread
# 自己做的包，用于快速爬取
from dwspackage.crawlbug import CrawlBug

# 创建一个网站类，继承CrawlBug
class WebSource(CrawlBug):
    """Crawler for the free chapters of a single novel, built on ``CrawlBug``.

    The constructor parses the catalogue page and collects the links of all
    non-VIP chapters.  ``getPages`` then downloads the chapters one by one and
    pushes them onto ``self.source_queue``; ``saveFile`` drains that queue and
    writes one text file per chapter.  The two methods are meant to run in
    separate threads.

    NOTE(review): ``analysisHtml``, ``getHtml``, ``getInfo``, ``linkList``,
    ``info``, ``source_queue``, ``path`` and ``fileType`` come from
    ``CrawlBug``, which is not part of this file -- their exact behaviour is
    assumed from how they are used here and should be confirmed.
    """

    def __init__(self, url):
        """Initialise the base crawler with *url* and parse the chapter links."""
        super().__init__(url)
        # Result of the most recent chapter-title regex search (a list).
        self.chapterTitle = None
        self.comboLinks()

    def comboLinks(self):
        """Collect the links of all free chapters from the catalogue page.

        Every ``<li>`` entry inside the ``volume-wrap`` element is scanned;
        links starting with ``//vipreader`` (paid chapters) are skipped.  The
        remaining protocol-relative links are prefixed with ``https:`` and
        appended to ``self.linkList``; their count is stored in ``self.pages``.
        """
        print('开始解析')
        chapters = self.analysisHtml('class', 'volume-wrap')
        pattern = r'<li.*?data-rid="(.+?)"><a.*?data-cid="(.+?)".+?</li>'
        links = re.findall(pattern, str(chapters), re.S)
        count = 0
        # The first capture group (data-rid) is not needed, only the link.
        for _rid, link in links:
            if not link.startswith('//vipreader'):
                count += 1
                self.linkList.append("https:" + link)

        print("共计解析%d章" % count)
        self.pages = count

    def getPages(self):
        """Download every collected chapter and queue it for saving.

        For each chapter the page is fetched, the title and the body are
        extracted, and ``getSourceList`` puts the result on the queue.
        """
        print("开始获取资源列表")
        for chapter in range(self.pages):
            # Fetch the page of *this* chapter; the original passed the whole
            # link list on every iteration.
            # NOTE(review): assumes linkList holds only the links appended by
            # comboLinks, so position ``chapter`` is the chapter's link.
            self.tmpHtml = self.getHtml(url=self.linkList[chapter])
            # Chapter title.
            tmpTitle = self.analysisHtml('class', 'content-wrap')
            # NOTE(review): this pattern looks truncated (surrounding HTML
            # tags were probably lost when the code was posted) -- as written
            # it matches single characters.  Confirm against the real markup.
            pattern = '(.+?)'
            chapter_title = re.findall(pattern, str(tmpTitle))
            self.chapterTitle = chapter_title
            print(chapter_title)
            # Chapter body.
            tmpContent = self.analysisHtml('class', 'read-content j_readContent')
            # NOTE(review): same truncated-pattern caveat as above.
            pattern = '(.+?)'
            self.getInfo(pattern, tmpContent)
            self.getSourceList(chapter)

    def getSourceList(self, chapter):
        """Queue the chapter that was just parsed.

        :param chapter: zero-based index of the current chapter
        :return: None; ``[number, chapter title, body lines]`` is put on
            ``self.source_queue``
        """
        # One text line per extracted fragment.
        # NOTE(review): assumes getInfo stored an iterable of str in self.info.
        tmpList = [source + '\n' for source in self.info]
        print("获取第%d章" % (chapter + 1))
        # The first regex match is taken as the title; fall back to an empty
        # string when nothing matched so the queue item keeps its shape.
        title = self.chapterTitle[0] if self.chapterTitle else ''
        # The original called put() without an item, which cannot enqueue
        # anything; the item layout follows the documented return value.
        self.source_queue.put([str(chapter + 1), title, tmpList])

    def saveFile(self):
        """Write every queued chapter to ``<path><number>.<title>.<fileType>``.

        Intended to run in its own thread; returns after ``self.pages``
        chapters have been written.

        :return: None
        """
        index = 0
        while index != self.pages:
            # Block until the producer delivers the next chapter instead of
            # spinning on empty(), which kept one CPU core busy.
            # NOTE(review): assumes source_queue behaves like queue.Queue.
            number, title, lines = self.source_queue.get()
            filePath = self.path + number + '.' + title + '.' + self.fileType
            # Explicit UTF-8 so the text does not depend on the platform's
            # default encoding.
            with open(filePath, 'w', encoding='utf-8') as f:
                f.writelines(lines)
            print('%s 写入文件完成' % title)
            index += 1
        print(self.pages)

if __name__ == "__main__":
    print('开始爬取免费小说')
    start = time.time()

    # Catalogue page of the novel; the fragment selects the chapter list tab.
    url = 'https://book.qidian.com/info/1018152134'
    tail = '#Catalog'
    targetUrl = url + tail

    # A plain CrawlBug instance is used only to read the book title.
    myCrawl = CrawlBug(targetUrl)
    bookInfo = myCrawl.analysisHtml('class', 'book-info')
    # NOTE(review): this pattern looks truncated (HTML tags probably lost when
    # the code was posted) -- confirm against the real page markup.
    targetPattern = '(.+?)'
    folderTitle = myCrawl.getInfo(targetPattern, bookInfo)
    print(folderTitle)

    # The WebSource constructor parses the chapter links right away.
    qdzww = WebSource(targetUrl)

    # Output location: base path, folder named after the book, file extension.
    qdzww.path = 'e:/download/source/'
    qdzww.folder = str(folderTitle)
    qdzww.fileType = 'txt'
    qdzww.mkdir()

    # One thread downloads the chapters, the other writes them to disk.
    t1 = Thread(target=qdzww.getPages)
    t2 = Thread(target=qdzww.saveFile)
    for worker in (t1, t2):
        worker.start()
    # Only the writer is joined: it finishes once every chapter is saved.
    t2.join()

    end = time.time()
    print("下载完成，用时{}秒".format(end - start))

模块源码：https://www.52pojie.cn/thread-1111968-1-1.html

nstar1221 发表于 2020-2-27 20:57

zhangxu888 发表于 2020-2-27 20:42
from dwspackage.crawlbug import CrawlBug错误如何解决？

下面有个链接，里面第二段代码是模块源码。可以把模块源码做成包再导入，或者直接把源码复制到文件里。

tomdjf 发表于 2020-2-27 19:26

顶一个看看好不好

飞龙project 发表于 2020-2-27 19:35

看起来挺好的，只不过只能爬取一个网站就有点可惜了

qq9199 发表于 2020-2-27 19:42

看下代码，

52cavid 发表于 2020-2-27 19:45

谢谢楼主了

ladiosfei 发表于 2020-2-27 19:46

学习学习，如果可以适用同类型或者算法写法相似的就更棒了，感谢！

fight775 发表于 2020-2-27 19:47

学习学习

圣泽 发表于 2020-2-27 19:49

看不懂只能说666

dsfive 发表于 2020-2-27 19:56

这个可以看看

枫叶荻花 发表于 2020-2-27 20:23

近来使用python的很多

页: [1] 2

吾爱破解 - 52pojie.cn's Archiver

爬取qd免费小说