带彩色进度条的笔趣阁爬虫
本帖最后由 xccxvb 于 2023-7-25 11:55 编辑。平时的控制台程序比较枯燥乏味,rich 这个库可以让我们的控制台程序不再单调。
手机也能用!
上代码!
直接运行代码即可,替换小说就修改Spider类里的url属性。
import requests, re
from rich.progress import Progress
from rich.progress import BarColumn,Progress,SpinnerColumn,TaskProgressColumn,TimeElapsedColumn,TimeRemainingColumn,MofNCompleteColumn
class Spider:
    """Download every chapter of one novel into a local text file.

    Instantiating the class runs the whole job: ``__init__`` fetches the
    chapter catalog from ``url`` and then downloads each chapter into
    ``小说.txt`` while a rich progress bar is displayed.

    Attributes:
        url: Catalog page of the novel; change it to download another novel.
        baseurl: Root of the site.
        headers: HTTP headers sent with every request.
        timeout: Seconds to wait for each HTTP response.
        data: List of ``(chapter_url, chapter_title)`` tuples, filled by
            ``getcatalog``.
    """

    url = "http://www.ibiquge.cc/83110"
    baseurl = "http://www.ibiquge.cc"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the run forever.
    timeout = 10

    # One match per chapter link: group 1 is the href, group 2 the title.
    # The original patterns spelled the attribute `href =` (with a space);
    # \s* accepts that spelling as well as the usual `href=`.
    _CATALOG_RE = re.compile(r'<dd>\s*<a\s+href\s*=\s*"([^"]*)"[^>]*>(.*?)<')

    # Paragraphs are the text between a run of non-breaking spaces and the
    # next tag. The author notes that the pasted pattern had its `&nbsp;`
    # entities turned into plain spaces; how many entities the page uses is
    # not shown, so one or more are accepted.
    _PARAGRAPH_RE = re.compile(r'>(?:&nbsp;|\xa0)+([\s\S]*?)<')

    def __init__(self) -> None:
        self.getcatalog()
        self.download()

    def getcatalog(self):
        """Fetch the catalog page and store the chapter list in ``self.data``.

        Raises:
            requests.HTTPError: If the catalog page answers with an HTTP
                error status; continuing would only produce an empty file.
        """
        rsp = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        rsp.raise_for_status()
        self.data = self._extract_catalog(rsp.text)

    def _extract_catalog(self, html):
        """Return ``(absolute_url, title)`` for each chapter link in *html*."""
        from urllib.parse import urljoin

        # The trailing slash makes relative hrefs resolve below the novel's
        # own path; root-relative and absolute hrefs are unaffected by it.
        base = self.url.rstrip('/') + '/'
        return [
            (urljoin(base, href), title)
            for href, title in self._CATALOG_RE.findall(html)
        ]

    def download(self):
        """Append every chapter in ``self.data`` to ``小说.txt``.

        Each chapter is written as its title line followed by one line per
        paragraph. The progress bar advances once per chapter.
        """
        columns = (
            SpinnerColumn(spinner_name='monkey', speed=0.2),
            "{task.description}",
            BarColumn(),
            MofNCompleteColumn(),
            TaskProgressColumn(),
            TimeElapsedColumn(),
            TimeRemainingColumn(),
        )
        # An explicit encoding keeps Chinese text writable regardless of the
        # platform's locale default.
        with open('小说.txt', 'a', encoding='utf-8') as f, Progress(*columns) as progress:
            track = progress.add_task(total=len(self.data), description='downloading')
            for chapter_url, title in self.data:
                f.write(title + '\n')
                f.writelines(paragraph + '\n' for paragraph in self.parse(chapter_url))
                # Flush per chapter so the file grows while the run is in progress.
                f.flush()
                progress.update(track, advance=1)

    def parse(self, url):
        """Fetch the chapter page at *url* and return its paragraphs as a list of str."""
        rsp = requests.get(url, headers=self.headers, timeout=self.timeout)
        return self._extract_paragraphs(rsp.text)

    def _extract_paragraphs(self, html):
        """Return the text of every paragraph found in chapter *html*."""
        return self._PARAGRAPH_RE.findall(html)
# Run the crawler: constructing Spider calls getcatalog() and then download().
Spider()
如果要把开头的猴子捂脸表情改成月亮,就把monkey改为moon!
第38行代码,正则里的空格实体(`&nbsp;`)在代码复制过来时竟然直接变成了普通空格。如果无法正确获取内容,可以尝试把第38行的正则表达式改为:`>&nbsp;&nbsp;&nbsp;&nbsp;([\s\S]*?)<`(`&nbsp;` 的个数以网页源码为准)
【ai编写】改了下;文件名从url取,url改成变量
本帖最后由 wm517 于 2023-7-25 14:38 编辑> #### xiaoshuo.py
```
import argparse
import requests
import re
from rich.progress import Progress
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TimeElapsedColumn, TimeRemainingColumn, MofNCompleteColumn
from concurrent.futures import ThreadPoolExecutor
import os
class Spider:
    """Download every chapter of one novel into ``<novel title>.txt``.

    Instantiating the class runs the whole job: ``__init__`` stores the
    catalog URL, fetches the chapter list, then downloads the chapters with
    a thread pool while a rich progress bar is displayed.

    Attributes:
        baseurl: Root of the site, used to build the novel's catalog URL.
        headers: HTTP headers sent with every request.
        timeout: Seconds to wait for each HTTP response.
        url: Catalog page of the novel, as passed to the constructor.
        data: List of ``(chapter_url, chapter_title)`` tuples, filled by
            ``getcatalog``.
    """

    baseurl = "http://www.ibiquge.cc"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    }
    # Without a timeout a stalled connection would block a worker forever.
    timeout = 10

    # One match per chapter link: group 1 is the href, group 2 the title.
    # The original patterns spelled the attribute `href =` (with a space);
    # \s* accepts that spelling as well as the usual `href=`.
    _CATALOG_RE = re.compile(r'<dd>\s*<a\s+href\s*=\s*"([^"]*)"[^>]*>(.*?)<')

    # Paragraphs are the text between a run of non-breaking spaces and the
    # next tag. The pasted pattern used a plain space here, which yields no
    # chapter text when the page uses `&nbsp;` entities; how many entities
    # the page uses is not shown, so one or more are accepted.
    _PARAGRAPH_RE = re.compile(r'>(?:&nbsp;|\xa0)+([\s\S]*?)<')

    # The novel title is taken from the first <h1> element of the catalog page.
    _TITLE_RE = re.compile(r'<h1>(.*?)</h1>')

    def __init__(self, url):
        self.url = url
        self.getcatalog()
        self.download()

    def getcatalog(self):
        """Fetch the catalog page and store the chapter list in ``self.data``.

        Raises:
            requests.HTTPError: If the catalog page answers with an HTTP
                error status; continuing would only produce an empty file.
        """
        rsp = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        rsp.raise_for_status()
        self.data = self._extract_catalog(rsp.text)

    def _extract_catalog(self, html):
        """Return ``(absolute_url, title)`` for each chapter link in *html*."""
        from urllib.parse import urljoin

        # The trailing slash makes relative hrefs resolve below the novel's
        # own path; root-relative and absolute hrefs are unaffected by it.
        base = self.url.rstrip('/') + '/'
        return [
            (urljoin(base, href), title)
            for href, title in self._CATALOG_RE.findall(html)
        ]

    def download(self):
        """Write every chapter in ``self.data`` to ``<novel title>.txt``.

        Chapter pages are fetched concurrently, but results are consumed in
        catalog order so the file keeps the chapters in sequence. Each
        chapter is written as its title line followed by one line per
        paragraph.
        """
        novel_name = self.extract_novel_name(self.url)
        file_path = f"{novel_name}.txt"
        columns = (
            SpinnerColumn(spinner_name='monkey', speed=0.2),
            "{task.description}",
            BarColumn(),
            MofNCompleteColumn(),
            TaskProgressColumn(),
            TimeElapsedColumn(),
            TimeRemainingColumn(),
        )
        # An explicit encoding keeps Chinese text writable regardless of the
        # platform's locale default.
        with open(file_path, 'w', encoding='utf-8') as f, Progress(*columns) as progress:
            track = progress.add_task(total=len(self.data), description='downloading')
            with ThreadPoolExecutor() as executor:
                chapter_urls = [chapter_url for chapter_url, _ in self.data]
                # executor.map yields results in submission order.
                fetched = executor.map(self.parse, chapter_urls)
                for (_, title), paragraphs in zip(self.data, fetched):
                    f.write(title + '\n')
                    f.writelines(paragraph + '\n' for paragraph in paragraphs)
                    progress.update(track, advance=1)

    def parse(self, url):
        """Fetch the chapter page at *url* and return its paragraphs as a list of str."""
        rsp = requests.get(url, headers=self.headers, timeout=self.timeout)
        return self._extract_paragraphs(rsp.text)

    def _extract_paragraphs(self, html):
        """Return the text of every paragraph found in chapter *html*."""
        return self._PARAGRAPH_RE.findall(html)

    def extract_novel_name(self, url):
        """Return a file-name-safe novel title for the catalog page *url*.

        The novel id is the last path segment of *url*, with or without a
        trailing slash. If the page has no ``<h1>`` title the id itself is
        returned, so the result is always a non-empty string.
        """
        novel_id = url.rstrip('/').split('/')[-1]
        rsp = requests.get(
            f"{self.baseurl}/{novel_id}/",
            headers=self.headers,
            timeout=self.timeout,
        )
        return self._title_from_html(rsp.text, novel_id)

    def _title_from_html(self, html, fallback):
        """Return the sanitised ``<h1>`` title of *html*, or *fallback* if absent."""
        match = self._TITLE_RE.search(html)
        name = match.group(1).strip() if match else ''
        # The title comes from the remote page; replace characters that are
        # path separators or otherwise invalid in file names.
        name = re.sub(r'[\\/:*?"<>|]', '_', name)
        return name or fallback
if __name__ == '__main__':
    # Command-line entry point: the single positional argument is the
    # novel's catalog URL, which is handed straight to Spider.
    cli = argparse.ArgumentParser()
    cli.add_argument('url', help='The URL of the novel')
    Spider(cli.parse_args().url)
```
#### 运行
> #### python3 xiaoshuo.py http://www.ibiquge.cc/83110 🙈 downloading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━ 862/928 93% 0:07:46 0:00:40
有进度条,时间很慢,抓取过程中,小说.txt是零字节的。完成后,小说.txt只有21KB,里面只是章节目录,无章节内容。 谢谢楼主,这个感觉挺厉害的哦!
这个不错哎挺好看学习一下 支持一下,黑白的太单调了 看小说app的一个小改动,不错 很好,很有意思 支持一下, 支持啦,谢谢 不错,支持 这个好玩。不错。不错。