After a bit of digging I noticed that the chapter URLs on this site follow a pattern: they increase monotonically, so the chapters can be sorted by URL order.
The chapters are sorted that way and then saved into a single text file.
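The whole trick is just to pull the numeric id out of each chapter link and sort on it as a number before downloading. A minimal sketch of that idea (the hrefs here are made up for illustration, not taken from the real site):

links = ['1203.html', '1201.html', '1202.html']                # hypothetical chapter hrefs
chapters = [{'id': h.split('.')[0], 'url': h} for h in links]  # same id/url dict shape as below
chapters.sort(key=lambda e: int(e['id']))                      # numeric sort: 12 before 103, unlike string sort
print([c['url'] for c in chapters])                            # ['1201.html', '1202.html', '1203.html']

The full script follows.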
import requests
from bs4 import BeautifulSoup
import sys
import os
import threading

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
}
# Replace characters that are illegal in Windows file names with full-width look-alikes
def FileName(STR):
    for i, j in ("/／", "\\＼", "?？", "|︱", '"＂', "*＊", "<＜", ">＞"):
        STR = STR.replace(i, j)
    return STR
# Grab a single chapter and append it to the novel's text file
def catchOne(link, name):
    res = requests.get(url=link, headers=headers)
    res.encoding = 'utf-8'
    # Parse the page
    co = BeautifulSoup(res.text, 'html.parser')
    soups = co.find_all('dd')
    title = co.find('h2').text
    # Collect the paragraphs into a list so they can be sorted
    list = []
    for soup in soups:
        dict1 = {'id': soup['data-id'], 'content': soup.text}
        list.append(dict1)
    # Sort key: the numeric data-id of each paragraph
    def order(e):
        return int(e['id'])
    list.sort(key=order, reverse=False)
    # Drop the useless trailing line(s)
    del list[-1]
    # del list[-1]
    # Save the data to a local file
    # Chapter title first
    with open('e:\\catch\\' + name + '.txt', mode='a+', encoding='utf8') as f:
        f.write('\n\n')
        f.write(FileName(title))
        f.write('\n')
    # Then the paragraphs, in order
    for i in list:
        with open('e:\\catch\\' + name + '.txt', mode='a+', encoding='utf8') as f:
            f.write(i['content'])
# Get every chapter of a book and download it to the local file
def catchAllText(link, name):
    res = requests.get(url=link, headers=headers)
    res.encoding = 'utf-8'
    # Parse the page
    co = BeautifulSoup(res.text, 'html.parser')
    cos = co.find_all('a', class_='name')
    # Create the output folder
    # os.mkdir('e:\\catch\\' + name)
    list = []
    for c in cos:
        target = link + c['href']
        title = c.text
        dict1 = {'id': c['href'].split(".")[0], 'url': target, 'title': title}
        list.append(dict1)
    # Sort the chapters by the numeric part of their URL
    def order(e):
        return int(e['id'])
    list.sort(key=order, reverse=False)
    # Walk the list and grab each chapter in order
    for i in list:
        print('Start grabbing chapter: ' + i['title'])
        catchOne(i['url'], name)
# Search for a novel and crawl the chosen result
def catchNovel():
    par = input("Enter a novel title to search for: ")
    link = 'https://www.aixs.la/search.php'
    data = {
        'key': par
    }
    res = requests.post(url=link, headers=headers, data=data)
    res.encoding = 'utf-8'
    # Parse the search results
    co = BeautifulSoup(res.text, 'html.parser')
    cos = co.find_all(class_='bigpic-book-name')
    list = []
    for i in cos:
        dit = {'id': cos.index(i) + 1, 'title': i.text, 'href': 'https://www.aixs.la' + i['href']}
        print(dit)
        list.append(dit)
    if len(list) != 0:
        id = input('Enter the id number: ')
        # Page of the chosen book
        targetUrl = list[int(id) - 1]['href']
        # Book title
        name = list[int(id) - 1]['title']
        res2 = requests.get(url=targetUrl, headers=headers)
        res2.encoding = 'utf-8'
        # Parse the book page
        co2 = BeautifulSoup(res2.text, 'html.parser')
        # Get the URL of the chapter list
        cos2 = co2.find(class_='tab fl j-content-tab').find_all('a')[1]['href']
        mlUrl = 'https://www.aixs.la' + cos2
        catchAllText(mlUrl, name)
    else:
        print('No results found, try again.')
        catchNovel()

catchNovel()
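One thing to watch: the script appends everything under e:\catch\ while the os.mkdir call is commented out, so that folder has to exist before the first run. If you want the script to create it on its own, a small guard like this right before the final catchNovel() call would do (the path is the one used in the code above; exist_ok just avoids an error if the folder is already there):

import os
os.makedirs('e:\\catch', exist_ok=True)  # create the output folder if it is missing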