好友
阅读权限25
听众
最后登录1970-1-1
|
本帖最后由 FeiyuYip 于 2021-5-14 01:27 编辑
我又来啦!!
今天尝试爬取某笔趣阁的小说并下载,有两个选项:1 是自动下载全站完本小说,2 是根据输入网址下载单本小说。思路部分参考 @Raohz520 的 https://www.52pojie.cn/forum.php?mod=viewthread&tid=1425753&highlight=%B1%CA%C8%A4%B8%F3 这个帖子,代码全部是自己写的。大家可以对比看看。
区别在于:1.将小说按类别放在文件夹中;2.整本小说保存在一个txt文档,而非一个章节放一个txt文档,方便转到手机阅读。
本人还是新手,将学习的过程发出来,希望与大家共同进步,请各位大佬轻拍!!
运行界面如下:
代码如下:
import os
import random
import time
from urllib.parse import urljoin

import requests
from lxml import etree
# Browser-like User-Agent sent with every HTTP request made by this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
}

# Root output directory for downloaded novels.
# A forward slash is used on purpose: the previous '.\...' spelling contained
# an invalid escape sequence and only worked as a path separator on Windows.
filePath = './笔趣阁全站小说'
def getPageInfo(url, timeout=10):
    """Fetch *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request itself fails.
    """
    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get(url=url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    # Force UTF-8 decoding instead of relying on the detected charset.
    response.encoding = 'utf-8'
    return response.text
# Download every novel from the "completed novels" listing
def downloadAllNovels(html):
    """Walk all pages of the completed-novel listing and download each novel.

    Args:
        html: HTML text of the first listing page. It is only used to read
            the number of the last page from the pagination bar.

    Returns:
        None. Each novel found is passed to ``downloadSingleNovel``.
    """
    if not html:
        print('获取小说列表失败')
        return
    first_page = etree.HTML(html)
    # The text of the "last" pagination link is the number of the final page.
    last_page_text = first_page.xpath('//div[@class="pagelink"]/a[@class="last"]/text()')
    if not last_page_text:
        print('获取小说列表页数失败')
        return
    last_page = int(last_page_text[0])
    # Inclusive upper bound: range() excludes its stop value, so +1 is needed
    # to visit the final page as well.
    for page_no in range(1, last_page + 1):
        url_single_page = 'https://www.biquwx.la/wanjiexiaoshuo/' + str(page_no)
        content = getPageInfo(url_single_page)
        if content is None:
            # Skip pages that could not be fetched instead of aborting the run.
            print('第 %d 页获取失败,已跳过' % page_no)
            continue
        listing = etree.HTML(content)
        # One <li> per novel in the listing.
        for single_novel in listing.xpath('//*[@id="main"]/div[1]/ul/li'):
            links = single_novel.xpath('span[2]/a/@href')
            if not links:
                # Row without a novel link; nothing to download.
                continue
            downloadSingleNovel(links[0])
# Download a single novel
def downloadSingleNovel(html):
    """Download one novel and append it to a single UTF-8 text file.

    The output path is ``<filePath>/<category>/<title>.txt``. The file is
    opened in append mode, so running the download again for the same novel
    appends to the existing file.

    Args:
        html: URL of the novel's table-of-contents page (an address, not
            markup, despite the parameter name). Relative chapter links are
            resolved against this URL.

    Returns:
        None.
    """
    index_page = getPageInfo(html)
    if index_page is None:
        print('%s 获取失败' % html)
        return
    selector = etree.HTML(index_page)

    novel_title = selector.xpath('//*[@id="info"]/h1/text()')[0]
    novel_author = selector.xpath('//*[@id="info"]/p[1]/text()')[0]

    # The category line has the form "<label>:<category>". Accept the
    # full-width colon too, and fall back to the whole text if no separator
    # is present instead of raising IndexError.
    category_line = selector.xpath('//*[@id="info"]/p[2]/text()')[0]
    category_parts = category_line.replace(':', ':').split(':')
    novel_category = category_parts[1] if len(category_parts) > 1 else category_parts[0]

    # One folder per category; makedirs also creates the root folder when
    # this function is called without going through main().
    filePath_novel_category = filePath + '/' + novel_category
    os.makedirs(filePath_novel_category, exist_ok=True)

    # Synopsis: one text node per line.
    intro_lines = selector.xpath('//*[@id="intro"]/p[1]/text()')
    novel_introduce = ''.join(line + '\n' for line in intro_lines)

    novel_file = filePath_novel_category + '/' + novel_title + '.txt'
    # Open the file once for the whole novel rather than once per chapter.
    with open(novel_file, 'a', encoding='utf-8') as f:
        # Header: title, author and synopsis.
        f.write(' ' + novel_title + '\n\n')
        f.write(' ' + novel_author + '\n\n')
        f.write(novel_introduce + '\n')

        for chapter_node in selector.xpath('//*[@id="list"]/dl/dd'):
            chapter_links = chapter_node.xpath('a/@href')
            if not chapter_links:
                # <dd> without a link; nothing to fetch.
                continue
            # urljoin handles plain relative names as well as root-relative
            # and absolute hrefs, which string concatenation would corrupt.
            novel_chapter_address = urljoin(html, chapter_links[0])
            chapter_page = getPageInfo(novel_chapter_address)
            if chapter_page is None:
                print('%s %s 下载失败' % (novel_title, novel_chapter_address))
                continue
            chapter_selector = etree.HTML(chapter_page)
            novel_chapter_name = chapter_selector.xpath(
                '//*[@id="wrapper"]/div[5]/div/div[2]/h1/text()')[0]
            single_chapter_content = ''.join(
                chapter_selector.xpath('//*[@id="content"]/text()'))
            # Chapter title followed by the chapter body.
            f.write(' ' + novel_chapter_name)
            f.write(' ' + single_chapter_content)
            print('%s %s 下载完成' % (novel_title, novel_chapter_name))
    print('____________')
    print('%s 下载完成' % novel_title)
def main():
    """Ask which download mode to run and start it.

    Entering ``1`` downloads every novel from the completed-novel listing;
    any other input switches to single-novel mode, which then asks for the
    URL of the novel's table-of-contents page.
    """
    # Create the root output folder if it is not there yet.
    os.makedirs(filePath, exist_ok=True)
    print('下载全站完本小说,请输入1;按需下载单本小说,请输入2')
    # Strip so that stray whitespace around the choice is not misread.
    if input().strip() == '1':
        url = 'https://www.biquwx.la/wanjiexiaoshuo/'
        html = getPageInfo(url)
        if html is None:
            print('%s 获取失败' % url)
        else:
            downloadAllNovels(html)
    else:
        print('请输入需要下载的小说网址(目录页):')
        html = input().strip()
        downloadSingleNovel(html)
    # NOTE(review): this pause runs once after the download has finished, so
    # it does not throttle the individual requests.
    time.sleep(random.uniform(0.05, 0.2))


if __name__ == '__main__':
    main()
|
免费评分
-
查看全部评分
|
发帖前要善用【论坛搜索】功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。 |
|
|
|
|