文库吧小说爬取下载打包 EPUB
(本帖最后由 飞龙使者 于 2023-2-17 18:35 编辑)

采用整本下载的接口,无需登录(但需要知道小说 ID),用正则分章。
开源在 Github 上:https://github.com/apachecn/Book ... dTool/lightnovel.py
关键代码:
```py
def format_text(text):
    """Split a whole-book plain-text dump into a list of chapters.

    The text is expected to use CRLF line breaks. The first two and the
    last two lines are discarded, every remaining line that starts with
    a space becomes a ``<p>`` paragraph, and every other line is treated
    as a chapter heading that starts a new chapter.

    Args:
        text: The decoded text of the whole book.

    Returns:
        A list of ``{'title': str, 'content': str}`` dicts in reading
        order. Paragraphs that appear before the first heading are kept
        as a chapter with an empty title. Returns ``[]`` when nothing
        is left after trimming.
    """
    # Collapse runs of consecutive line breaks into a single one.
    text = re.sub(r'(\r\n)+', '\r\n', text)
    # Drop the first two lines.
    text = re.sub(r'^.+?\r\n.+?\r\n', '', text)
    # Drop the last two lines.
    text = re.sub(r'\r\n.+?\r\n.+?$', '', text)

    # Mark up headings and paragraphs.
    def rep_func(m):
        # '.' matches '\r' and, in MULTILINE mode, '$' only anchors before
        # '\n', so the captured line still carries the CR of its CRLF
        # terminator; strip it so it does not leak into titles.
        s = m.group(1).rstrip('\r')
        if not s:
            return ''
        if s.startswith(' '):
            # The paragraph tags were empty strings in the published
            # snippet (apparently swallowed by HTML rendering), which left
            # paragraphs unwrapped in the generated content.
            return '<p>' + s + '</p>'
        return '<!--split--><h1>' + s + '</h1>'

    text = re.sub(r'^(.+?)$', rep_func, text, flags=re.M)

    # Split into chapters, skipping blank ones, then separate each chapter
    # into its title and its body.
    chapters = []
    for chunk in text.split('<!--split-->'):
        if not chunk.strip():
            continue
        # A chunk that precedes the first heading has no <h1>; give it an
        # empty title instead of failing on a missing match.
        heading = re.search(r'<h1>(.+?)</h1>', chunk)
        chapters.append({
            'title': heading.group(1) if heading else '',
            'content': re.sub(r'<h1>.+?</h1>', '', chunk).strip(),
        })
    return chapters
def get_info(html):
    """Extract book metadata from the HTML of a book's info page.

    Args:
        html: The decoded HTML of the page.

    Returns:
        A dict with the keys ``dt`` (the date cell with dashes removed,
        or ``'UNKNOWN'`` when that cell is empty), ``url`` (the ``href``
        of the selected link), ``title`` and ``author`` (both passed
        through ``fname_escape``).
    """
    doc = pq(html)

    date_sel = '#content > div:nth-child(1) > table:nth-child(1) tr:nth-child(2) > td:nth-child(4)'
    link_sel = '#content > div:nth-child(1) > div:nth-child(6) > div > span:nth-child(1) > fieldset > div > a'
    title_sel = '#content > div:nth-child(1) > table:nth-child(1) tr:nth-child(1) > td > table tr > td:nth-child(1) > span > b'
    author_sel = '#content > div:nth-child(1) > table:nth-child(1) tr:nth-child(2) > td:nth-child(2)'

    # Remove the dashes from the date text; fall back to a placeholder
    # when nothing is left.
    date_text = doc(date_sel).text().replace('-', '')
    if not date_text:
        date_text = 'UNKNOWN'

    link = doc(link_sel).attr('href')
    raw_title = doc(title_sel).text()
    raw_author = doc(author_sel).text()

    result = {}
    result['dt'] = date_text
    result['url'] = link
    result['title'] = fname_escape(raw_title)
    result['author'] = fname_escape(raw_author)
    return result
def download_ln(args):
    """Download one novel as a whole-book text dump and package it as EPUB.

    Fetches the book's info page for the title, author and date, skips the
    work when the target EPUB already exists, otherwise downloads the
    whole-book text, splits it into chapters with ``format_text`` and
    hands the result to ``gen_epub``.

    Args:
        args: An object providing ``id`` (the novel ID), ``save_path``
            (the output directory) and ``cookie`` (sent as the ``Cookie``
            request header).

    Returns:
        None. The EPUB is written to
        ``{save_path}/{title} - {author} - {dt}.epub``.
    """
    # Named book_id rather than id so the builtin is not shadowed.
    book_id = args.id
    save_path = args.save_path
    headers = default_hdrs.copy()
    headers['Cookie'] = args.cookie

    # The info page bytes are decoded as GBK.
    info_url = f'https://www.wenku8.net/book/{book_id}.htm'
    html = request_retry('GET', info_url, headers=headers).content.decode('gbk')
    info = get_info(html)
    print(info['title'], info['author'], info['dt'])

    ofname = f"{save_path}/{info['title']} - {info['author']} - {info['dt']}.epub"
    if path.exists(ofname):
        print('已存在')
        return
    safe_mkdir(save_path)

    # The first article is a cover entry: book title plus author line.
    articles = [{
        'title': info['title'],
        'content': f"作者:{info['author']}",
    }]

    # Request the UTF-8 variant of the whole-book download. The published
    # snippet had a stray "default_hdrs" pasted into the middle of "utf8"
    # here, which does not agree with the UTF-8 decoding below.
    text_url = f'http://dl.wenku8.com/down.php?type=utf8&id={book_id}'
    text = request_retry('GET', text_url, headers=headers).content.decode('utf-8')
    articles += format_text(text)

    # NOTE(review): the meaning of the second and third gen_epub arguments
    # is not visible here; they are passed through unchanged.
    gen_epub(articles, {}, None, ofname)
```
已发布到 PYPI,可以一键下载安装:
```sh
pip install BookerDownloadTool
dl-tool ln <id>
```
注:文库吧首页被隐藏了,需要手动输入【/login.php】来登录。

回复:
- wenku8 关闭了吧?
- 谢谢分享,分析学习中。
- 好东西,谢谢,楼主辛苦了!
- 好东西
- 感谢分享
- 感谢大佬分享
- 大佬,咋用啊?在哪儿下?
- 大佬,有成品吗?小白求学习💪
- 大佬,入口在哪里啊?没看到 main 啊
页:
[1]
2