Scraping free novels from a novel site.
I won't write up the page-analysis process here; web-novel authors have a hard enough time as it is, so let's respect their copyright.
Source code:
[Python]
import re
import time
from threading import Thread
# Author's own helper package for quick crawling
from dwspackage.crawlbug import CrawlBug


# Create a site class that inherits from CrawlBug
class WebSource(CrawlBug):
    def __init__(self, url):
        super().__init__(url)
        self.chapterTitle = None
        self.comboLinks()

    def comboLinks(self):
        """
        Override this according to the site's page-numbering pattern.
        """
        # Parse the page: extract the chapter links
        print('Parsing started')
        chapters = self.analysisHtml('class', 'volume-wrap')
        pattern = r'<li.*?data-rid="(.+?)"><a.*?data-cid="(.+?)".+?</li>'
        links = re.findall(pattern, str(chapters), re.S)
        count = 0
        for rid, link in links:
            # Skip VIP (paid) chapters; keep only the free ones
            if not link.startswith('//vipreader'):
                count += 1
                tmpLink = "https:" + link
                self.linkList.append(tmpLink)
        print("Parsed %d chapters in total" % count)
        self.pages = count

    def getPages(self):
        """
        Fetch every chapter page and build the source list.
        :return: [chapter title, body text]
        """
        print("Building the source list")
        for chapter in range(self.pages):
            self.tmpHtml = self.getHtml(url=self.linkList[chapter])
            # Extract the chapter title
            tmpTitle = self.analysisHtml('class', 'content-wrap')
            pattern = '<span class="content-wrap">(.+?)</span>'
            chapter_title = re.findall(pattern, str(tmpTitle))
            self.chapterTitle = chapter_title[0]
            print(chapter_title[0])
            # Extract the body paragraphs of this page
            tmpContent = self.analysisHtml('class', 'read-content j_readContent')
            pattern = '<p>(.+?)</p>'
            self.getInfo(pattern, tmpContent)
            self.getSourceList(chapter)

    def getSourceList(self, chapter):
        """
        Collect the extracted text and queue it for writing.
        May need to be overridden in a subclass, depending on the site.
        :param chapter: current page index
        :return: [index, chapter title, body text]
        """
        tmpList = []
        for source in self.info:
            tmpList.append(source + '\n')
        print("Fetched chapter %d" % (chapter + 1))
        self.source_queue.put([str(chapter + 1), self.chapterTitle, tmpList])

    def saveFile(self):
        """
        Runs in a separate thread and writes queued chapters to disk.
        :return:
        """
        index = 0
        while True:
            if self.pages == index:
                print(self.pages)
                break
            if not self.source_queue.empty():
                content = self.source_queue.get()
                filePath = self.path + content[0] + '.' + content[1] + '.' + self.fileType
                # Write as UTF-8 explicitly so Chinese text does not trip
                # over the platform's default encoding
                with open(filePath, 'w', encoding='utf-8') as f:
                    f.writelines(content[2])
                print('%s written to file' % content[1])
                index += 1
if __name__ == "__main__":
    print('Start crawling free novels')
    start = time.time()
    # URL of the novel's table-of-contents page
    url = 'https://book.qidian.com/info/1018152134'
    tail = '#Catalog'
    targetUrl = url + tail
    # Create a CrawlBug object
    myCrawl = CrawlBug(targetUrl)
    # Parse the page: get the book title
    folderTitle = myCrawl.analysisHtml('class', 'book-info')
    targetPattern = '<em>(.+?)</em>'
    folderTitle = myCrawl.getInfo(targetPattern, folderTitle)
    print(folderTitle[0])
    # Create a site object
    qdzww = WebSource(targetUrl)
    # Set up the output directory
    qdzww.path = 'e:/download/source/'
    # Use the book title as the folder name
    qdzww.folder = str(folderTitle[0])
    qdzww.fileType = 'txt'
    qdzww.mkdir()
    t1 = Thread(target=qdzww.getPages)
    t2 = Thread(target=qdzww.saveFile)
    t1.start()
    t2.start()
    t2.join()
    end = time.time()
    print("Download finished, took {} seconds".format(end - start))
Module source code: https://www.52pojie.cn/thread-1111968-1-1.html
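dwspackage.crawlbug is the author's own module (source in the thread linked above). For readers who land here without it, the following is only a hypothetical sketch of the interface the script appears to assume: the method names and attributes are taken from how the script calls them, but the bodies are my guesses, not the real implementation. Refer to the linked thread for the authoritative version.

[Python]
import os
import queue
import re

import requests
from bs4 import BeautifulSoup


class CrawlBug:
    """Hypothetical stand-in for dwspackage.crawlbug.CrawlBug."""

    def __init__(self, url):
        self.url = url
        self.tmpHtml = self.getHtml(url)   # HTML of the most recently fetched page
        self.linkList = []                 # chapter URLs collected by subclasses
        self.info = []                     # text fragments from the current page
        self.source_queue = queue.Queue()  # producer/consumer hand-off
        self.pages = 0
        self.path = ''
        self.folder = ''
        self.fileType = 'txt'

    def getHtml(self, url):
        # Fetch a page (headers, retries and throttling omitted for brevity)
        return requests.get(url).text

    def analysisHtml(self, attr, value):
        # Return the first tag matching, e.g., class="volume-wrap"
        soup = BeautifulSoup(self.tmpHtml, 'html.parser')
        return soup.find(attrs={attr: value})

    def getInfo(self, pattern, fragment):
        # Run a regex over an HTML fragment; keep and return the matches
        self.info = re.findall(pattern, str(fragment), re.S)
        return self.info

    def mkdir(self):
        # Append the folder name to the base path and create the directory
        self.path = self.path + self.folder + '/'
        os.makedirs(self.path, exist_ok=True)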