本帖最后由 wm517 于 2023-7-25 14:38 编辑
xiaoshuo.py
import argparse
import requests
import re
from rich.progress import Progress
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TimeElapsedColumn, TimeRemainingColumn, MofNCompleteColumn
from concurrent.futures import ThreadPoolExecutor
import os
class Spider:
    """Download every chapter of a novel into a single text file.

    Constructing an instance immediately fetches the chapter catalog from
    ``url`` (:meth:`getcatalog`) and then downloads all chapters
    (:meth:`download`) into ``<novel title>.txt``, a path relative to the
    current working directory.

    Attributes:
        url: The catalog page URL passed to the constructor.
        data: ``[title, chapter_url]`` pairs in catalog order, filled in by
            :meth:`getcatalog`.
    """

    baseurl = "http://www.ibiquge.cc"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    }
    # Seconds to wait for the server before giving up on a request; without
    # a timeout a stalled connection would block the download forever.
    timeout = 30

    def __init__(self, url):
        """Store ``url``, then fetch the catalog and download all chapters."""
        self.url = url
        self.getcatalog()
        self.download()

    # Fetch the chapter catalog
    def getcatalog(self):
        """Fetch ``self.url`` and store its chapter list in ``self.data``."""
        rsp = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        self.data = self._parse_catalog(rsp.text)

    def _parse_catalog(self, html):
        """Return ``[title, absolute_url]`` pairs for each catalog entry in ``html``."""
        # One pattern captures the link and the title together, so a
        # malformed entry can never shift titles against URLs (which is what
        # happens when two independent findall() results are zipped).
        entry = r'<dd><a href ="(.*?)">(.*?)<'
        return [[title, self.baseurl + href] for href, title in re.findall(entry, html)]

    # Download
    def download(self):
        """Download every chapter in ``self.data`` and write them to one file.

        Chapters are fetched concurrently on a thread pool but written in
        catalog order. The file is named after the novel title and encoded
        as UTF-8.
        """
        novel_name = self.extract_novel_name(self.url)
        file_path = f"{self._safe_filename(novel_name)}.txt"
        # Explicit UTF-8: the platform default encoding (e.g. a legacy code
        # page on Windows) may be unable to encode the novel text.
        with open(file_path, 'w', encoding='utf-8') as f:
            # Progress bar layout
            with Progress(
                SpinnerColumn(spinner_name='monkey', speed=0.2),
                "{task.description}",
                BarColumn(),
                MofNCompleteColumn(),
                TaskProgressColumn(),
                TimeElapsedColumn(),
                TimeRemainingColumn()
            ) as progress:
                track = progress.add_task(total=len(self.data), description='downloading')
                with ThreadPoolExecutor() as executor:
                    chapter_urls = [chapter_url for _, chapter_url in self.data]
                    # Executor.map schedules every fetch up front and yields
                    # the results in submission order, i.e. catalog order.
                    fetched = executor.map(self.parse, chapter_urls)
                    for (title, _), paragraphs in zip(self.data, fetched):
                        f.write(self._format_chapter(title, paragraphs))
                        progress.update(track, advance=1)

    # Parse the chapter content
    def parse(self, url):
        """Fetch a chapter page and return its text fragments as a list of str."""
        rsp = requests.get(url, headers=self.headers, timeout=self.timeout)
        # Raw string so that \s reaches the regex engine unchanged instead of
        # being an invalid string escape.
        return re.findall(r'> ([\s\S]*?)<', rsp.text)

    # Extract the novel's name
    def extract_novel_name(self, url):
        """Return the novel title taken from the first ``<h1>`` of its page.

        The surrounding whitespace of the title is stripped. When the page
        has no non-empty ``<h1>``, the novel id taken from ``url`` is
        returned instead so the download still gets a usable file name.
        """
        novel_id = self._novel_id(url)
        rsp = requests.get(
            f"{self.baseurl}/{novel_id}/",
            headers=self.headers,
            timeout=self.timeout,
        )
        found = re.search(r'<h1>(.*?)</h1>', rsp.text)
        title = found.group(1).strip() if found else ''
        return title or novel_id

    @staticmethod
    def _novel_id(url):
        """Return the path segment of ``url`` that identifies the novel."""
        segments = url.split('/')
        tail = segments[-1]
        # ".../<id>/" ends in an empty segment and ".../<id>/page.html" ends
        # in a file name; in both cases the id is the segment before the
        # tail. Otherwise the URL was given without its trailing slash and
        # the tail itself is the id.
        if len(segments) > 1 and (tail == '' or '.' in tail):
            return segments[-2]
        return tail

    @staticmethod
    def _safe_filename(name):
        """Return ``name`` with characters that are illegal in file names replaced by ``_``."""
        return re.sub(r'[\\/:*?"<>|]', '_', name)

    @staticmethod
    def _format_chapter(title, paragraphs):
        """Return the chapter as text: the title line, then one fragment per line."""
        # Every fragment is newline-terminated so that fragments do not run
        # together and the next chapter's title starts on its own line.
        lines = [title + '\n']
        lines.extend(p if p.endswith('\n') else p + '\n' for p in paragraphs)
        return ''.join(lines)
if __name__ == '__main__':
    # Command-line entry point: the single positional argument is the URL
    # of the novel to download.
    cli = argparse.ArgumentParser()
    cli.add_argument('url', help='The URL of the novel')
    # Constructing a Spider is what starts the work: its __init__ fetches
    # the catalog and then downloads the chapters.
    Spider(cli.parse_args().url)
运行
|