某不知名小说网爬取小说章节

gebiafu 发表于 2022-11-9 22:16

本帖最后由 gebiafu 于 2022-11-9 22:25 编辑

周末学习了一下爬虫，手痒，百度随便找了个小说网试水
可以搜索想要看的小说，按提示选择结果，最后批量保存所有章节内容到本地：

import requests
from bs4 import BeautifulSoup
import sys
import os
import threading

# Make a string safe to use as a Windows file name.
def FileName(STR):
    """Return *STR* with characters that Windows forbids in file names replaced.

    Each forbidden ASCII character is swapped for a look-alike full-width
    character so the name stays readable.  The colon is mapped as well: it is
    just as invalid in a Windows file name as the other characters and a
    title may contain one.

    Args:
        STR: The raw title text.

    Returns:
        A copy of ``STR`` in which ``/ \\ ? | " * < > :`` no longer occur.
    """
    # Both strings have the same length; position i of the first maps to
    # position i of the second.
    table = str.maketrans('/\\?|"*<>:', '／＼？︱＂＊＜＞：')
    return STR.translate(table)

# Download a single chapter and save it as its own text file.
def catchOne(link, name):
    """Fetch the chapter page at *link* and write its text to disk.

    The text of the page's ``<h2>`` element is used as the chapter title and
    every ``<dd>`` element is treated as one fragment of the chapter body.
    Fragments are ordered by the integer value of their ``data-id``
    attribute and the last fragment is dropped before saving.

    The chapter is written (UTF-8) to ``e:\\img\\<name>\\<title>.txt``.  The
    file is overwritten, so downloading the same chapter twice does not
    duplicate its text.

    Args:
        link: URL of the chapter page.
        name: Book name; used as the sub-directory below ``e:\\img\\``.

    Raises:
        requests.RequestException: If the page cannot be downloaded or the
            server answers with an error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
    }
    # A timeout keeps one stalled connection from hanging the whole download.
    res = requests.get(url=link, headers=headers, timeout=30)
    res.raise_for_status()
    res.encoding = 'utf-8'

    page = BeautifulSoup(res.text, 'html.parser')
    heading = page.find('h2')
    if heading is None:
        # Without a title there is no file name to write to.
        print('未找到章节标题，跳过 : ' + link)
        return
    title = heading.text

    # Collect (order, text) pairs; <dd> elements without a numeric data-id
    # cannot be ordered and are skipped.
    fragments = []
    for node in page.find_all('dd'):
        try:
            position = int(node.get('data-id'))
        except (TypeError, ValueError):
            continue
        fragments.append((position, node.text))
    fragments.sort(key=lambda item: item[0])

    # The last fragment is not part of the chapter text.  Slicing (instead
    # of ``del``) is safe when the page yielded no fragments at all.
    fragments = fragments[:-1]
    if not fragments:
        return

    folder = 'e:\\img\\' + name
    os.makedirs(folder, exist_ok=True)
    body = ''.join(text for _, text in fragments)
    # Binary mode keeps the bytes exactly as encoded (no newline translation).
    with open(folder + '\\' + FileName(title) + '.txt', mode='wb') as f:
        f.write(body.encode('utf8'))

# Fetch the chapter list of a book and download every chapter.
def catchAllText(link, name, max_workers=8):
    """Download all chapters linked from the catalog page at *link*.

    Every ``<a class="name">`` element on the page is treated as a chapter
    link; its ``href`` is appended to *link* to form the chapter URL and the
    chapter is fetched with :func:`catchOne` in a worker thread.

    Args:
        link: URL of the catalog page; also the prefix of each chapter URL.
        name: Book name; chapters are stored below ``e:\\img\\<name>``.
        max_workers: Upper bound on chapters downloaded at the same time.

    Raises:
        requests.RequestException: If the catalog page cannot be downloaded.

    Note:
        An exception raised while downloading a single chapter is reported
        by the worker thread and does not stop the remaining downloads.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
    }
    res = requests.get(url=link, headers=headers, timeout=30)
    res.raise_for_status()
    res.encoding = 'utf-8'

    page = BeautifulSoup(res.text, 'html.parser')
    anchors = page.find_all('a', class_='name')

    # makedirs also creates e:\img itself and tolerates an existing folder,
    # so the download can be repeated.
    os.makedirs('e:\\img\\' + name, exist_ok=True)

    # One slot per concurrent download.  The slot is taken before a thread
    # is created, so at most ``max_workers`` threads are alive at once.
    slots = threading.BoundedSemaphore(max(1, max_workers))

    def worker(url):
        try:
            catchOne(url, name)
        finally:
            slots.release()

    threads = []
    for anchor in anchors:
        href = anchor.get('href')
        if not href:
            continue
        print('开始抓取章节 : ' + anchor.text)
        slots.acquire()
        # start() runs the worker in a new thread; calling run() directly
        # would execute it in this thread, one chapter after another.
        t = threading.Thread(target=worker, args=(link + href,))
        t.start()
        threads.append(t)

    # Return only after every chapter has been handled.
    for t in threads:
        t.join()

# Search for a novel interactively and download the chosen result.
def catchNovel():
    """Prompt for a title, let the user pick a search hit and download it.

    The search term is posted to ``https://www.aixs.la/search.php``.  Every
    element with class ``bigpic-book-name`` in the response is listed as a
    numbered hit.  The user enters the number of the wanted hit, the book
    page is fetched, the catalog link is taken from its tab bar and handed
    to :func:`catchAllText`.

    The prompt is repeated while a search yields no hits or the entered
    number is not one of the listed ones.

    Raises:
        requests.RequestException: If a request fails.
    """
    site = 'https://www.aixs.la'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
    }

    # Search until at least one hit is returned (a loop rather than
    # recursion, so repeated misses cannot exhaust the call stack).
    while True:
        par = input("请输入小说名进行搜索 : ")
        res = requests.post(url=site + '/search.php', headers=headers,
                            data={'key': par}, timeout=30)
        res.raise_for_status()
        res.encoding = 'utf-8'
        page = BeautifulSoup(res.text, 'html.parser')

        books = []
        for number, node in enumerate(page.find_all(class_='bigpic-book-name'), start=1):
            book = {'id': number, 'title': node.text, 'href': site + node['href']}
            print(book)
            books.append(book)
        if books:
            break
        print('暂无搜索到结果！')

    # Translate the entered 1-based number into the matching hit.
    while True:
        choice = input('请输入序号id:')
        try:
            number = int(choice)
        except ValueError:
            number = 0
        if 1 <= number <= len(books):
            picked = books[number - 1]
            break
        print('序号无效，请重新输入！')

    targetUrl = picked['href']
    # The title becomes a folder name further down, so strip surrounding
    # whitespace and replace characters a Windows path cannot contain.
    name = FileName(picked['title'].strip())

    res2 = requests.get(url=targetUrl, headers=headers, timeout=30)
    res2.raise_for_status()
    res2.encoding = 'utf-8'
    page2 = BeautifulSoup(res2.text, 'html.parser')

    # Locate the catalog link inside the tab bar.
    tab_bar = page2.find(class_='tab fl j-content-tab')
    links = []
    if tab_bar is not None:
        links = [a for a in tab_bar.find_all('a') if a.get('href')]
    catalog = next((a for a in links if '目录' in a.text), None)
    if catalog is None and links:
        # NOTE(review): no tab is labelled "目录"; falling back to the first
        # tab link is a guess - confirm against the live page markup.
        catalog = links[0]
    if catalog is None:
        print('未找到目录地址！')
        return

    mlUrl = site + catalog['href']
    catchAllText(mlUrl, name)

# Start the interactive downloader only when the file is executed as a
# script; importing it as a module must not prompt for input.
if __name__ == '__main__':
    catchNovel()

gebiafu 发表于 2022-11-10 19:47

稍微研究了一下，它的章节 URL 有规律，是递增的，所以可以按 URL 顺序排序。

章节已经排好序，并保存到同一个文本文件里了：
import requests
from bs4 import BeautifulSoup
import sys
import os
import threading

# Request headers used by the download functions of this script; the
# User-Agent identifies the client as a desktop Chrome browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/101.0.0.0 Safari/537.36'
    ),
}
# Replace characters that are awkward in file names with full-width twins.
def FileName(STR):
    """Return *STR* with ``/ \\ ? | " * < >`` swapped for full-width look-alikes.

    Args:
        STR: The text to clean.

    Returns:
        The cleaned text; all other characters are left untouched.
    """
    ascii_chars = '/\\?|"*<>'
    wide_chars = '／＼？︱＂＊＜＞'
    # The two strings are aligned: each ASCII character is replaced by the
    # full-width character at the same position.
    for plain, wide in zip(ascii_chars, wide_chars):
        STR = STR.replace(plain, wide)
    return STR

# Download a single chapter and append it to the book's text file.
def catchOne(link, name):
    """Fetch the chapter page at *link* and append it to ``<name>.txt``.

    The text of the page's ``<h2>`` element is used as the chapter title and
    every ``<dd>`` element is treated as one fragment of the chapter body.
    Fragments are ordered by the integer value of their ``data-id``
    attribute and the last fragment is dropped before saving.

    The title (preceded by two blank lines) and the body are appended, as
    UTF-8 text, to ``e:\\catch\\<name>.txt``.

    Args:
        link: URL of the chapter page.
        name: Book name; used as the file name below ``e:\\catch\\``.

    Raises:
        requests.RequestException: If the page cannot be downloaded or the
            server answers with an error status.
    """
    # A timeout keeps one stalled connection from hanging the whole download.
    res = requests.get(url=link, headers=headers, timeout=30)
    res.raise_for_status()
    res.encoding = 'utf-8'

    page = BeautifulSoup(res.text, 'html.parser')
    heading = page.find('h2')
    if heading is None:
        # Not a chapter page as this script understands it; skip it rather
        # than crash in the middle of a book.
        print('未找到章节标题，跳过 : ' + link)
        return
    title = heading.text

    # Collect (order, text) pairs; <dd> elements without a numeric data-id
    # cannot be ordered and are skipped.
    fragments = []
    for node in page.find_all('dd'):
        try:
            position = int(node.get('data-id'))
        except (TypeError, ValueError):
            continue
        fragments.append((position, node.text))
    fragments.sort(key=lambda item: item[0])

    # The last fragment is not part of the chapter text.  Slicing (instead
    # of ``del``) is safe when the page yielded no fragments at all.
    fragments = fragments[:-1]

    # The output folder is not created anywhere else, so make sure it exists.
    os.makedirs('e:\\catch\\', exist_ok=True)

    # Open the file once per chapter; the ``with`` block closes it.
    with open('e:\\catch\\' + name + '.txt', mode='a+', encoding='utf8') as f:
        f.write('\n\n')
        f.write(FileName(title))
        f.write('\n')
        f.writelines(text for _, text in fragments)

# Fetch the chapter list of a book and download the chapters in order.
def _chapter_number(href):
    """Return the numeric file-name stem of *href* (``'123.html'`` -> 123).

    Returns ``None`` when the stem is not an integer.
    """
    stem = href.rstrip('/').rsplit('/', 1)[-1].split('.')[0]
    try:
        return int(stem)
    except ValueError:
        return None


def catchAllText(link, name):
    """Download all chapters linked from the catalog page at *link*.

    Every ``<a class="name">`` element on the page is treated as a chapter
    link; its ``href`` is appended to *link* to form the chapter URL.  The
    chapters are sorted by the number in their ``href`` and then passed, one
    after another, to :func:`catchOne`.

    Args:
        link: URL of the catalog page; also the prefix of each chapter URL.
        name: Book name, forwarded to :func:`catchOne`.

    Raises:
        requests.RequestException: If the catalog page cannot be downloaded.
    """
    res = requests.get(url=link, headers=headers, timeout=30)
    res.raise_for_status()
    res.encoding = 'utf-8'
    page = BeautifulSoup(res.text, 'html.parser')

    chapters = []
    for anchor in page.find_all('a', class_='name'):
        href = anchor.get('href')
        if not href:
            continue
        chapters.append({
            # The sort key must be one integer; the bare result of
            # ``split('.')`` is a list and cannot be passed to ``int``.
            'id': _chapter_number(href),
            'url': link + href,
            'title': anchor.text,
        })

    # Numbered chapters first, in ascending order.  Links without a number
    # go last and - because the sort is stable - keep their page order.
    chapters.sort(key=lambda ch: (ch['id'] is None, ch['id'] or 0))

    for chapter in chapters:
        print('开始抓取章节 : ' + chapter['title'])
        catchOne(chapter['url'], name)

# Search for a novel interactively and download the chosen result.
def catchNovel():
    """Prompt for a title, let the user pick a search hit and download it.

    The search term is posted to ``https://www.aixs.la/search.php``.  Every
    element with class ``bigpic-book-name`` in the response is listed as a
    numbered hit.  The user enters the number of the wanted hit, the book
    page is fetched, the catalog link is taken from its tab bar and handed
    to :func:`catchAllText`.

    The prompt is repeated while a search yields no hits or the entered
    number is not one of the listed ones.

    Raises:
        requests.RequestException: If a request fails.
    """
    site = 'https://www.aixs.la'

    # Search until at least one hit is returned (a loop rather than
    # recursion, so repeated misses cannot exhaust the call stack).
    books = []
    while not books:
        par = input("请输入小说名进行搜索 : ")
        res = requests.post(url=site + '/search.php', headers=headers,
                            data={'key': par}, timeout=30)
        res.raise_for_status()
        res.encoding = 'utf-8'
        hits = BeautifulSoup(res.text, 'html.parser').find_all(class_='bigpic-book-name')

        for number, node in enumerate(hits, start=1):
            book = {'id': number, 'title': node.text, 'href': site + node['href']}
            print(book)
            books.append(book)
        if not books:
            print('暂无搜索到结果！')

    # Translate the entered 1-based number into the matching hit.
    picked = None
    while picked is None:
        choice = input('请输入序号id:')
        try:
            number = int(choice)
        except ValueError:
            number = 0
        if 1 <= number <= len(books):
            picked = books[number - 1]
        else:
            print('序号无效，请重新输入！')

    targetUrl = picked['href']
    # The title becomes a file name further down, so strip surrounding
    # whitespace and replace characters a Windows path cannot contain.
    name = FileName(picked['title'].strip())

    res2 = requests.get(url=targetUrl, headers=headers, timeout=30)
    res2.raise_for_status()
    res2.encoding = 'utf-8'
    page2 = BeautifulSoup(res2.text, 'html.parser')

    # Locate the catalog link inside the tab bar.
    tab_bar = page2.find(class_='tab fl j-content-tab')
    links = []
    if tab_bar is not None:
        links = [a for a in tab_bar.find_all('a') if a.get('href')]
    catalog = next((a for a in links if '目录' in a.text), None)
    if catalog is None and links:
        # NOTE(review): no tab is labelled "目录"; falling back to the first
        # tab link is a guess - confirm against the live page markup.
        catalog = links[0]
    if catalog is None:
        print('未找到目录地址！')
        return

    mlUrl = site + catalog['href']
    catchAllText(mlUrl, name)

# Start the interactive downloader only when the file is executed as a
# script; importing it as a module must not prompt for input.
if __name__ == '__main__':
    catchNovel()

linsixi 发表于 2022-11-9 22:45

你写一堆代码，我们不知道怎么下载。适用性不强，给个中评吧。

Shawliu 发表于 2022-11-9 23:01

感觉还不错，试一试看看，如果能下载就好了

晚辈小生 发表于 2022-11-9 23:05

linsixi 发表于 2022-11-9 22:45
你写一堆代码，我们不知道怎么下载。适用性不强，给个中评吧。

这一看就是 Python 代码呀，哥们。把代码复制到本地，自己 run 一下就能下载了。

konley 发表于 2022-11-9 23:47

大部分的情况下，所有章节直接保存成一个文件更为合适，如word、pdf、epub等
当然也可以学一下python生成epub格式小说的操作，这样子里面带目录，就是一本真正的电子书啦~
也可以使用一些现成的工具处理多个章节的小说合并为pdf
mobi比较麻烦不推荐

tbloy 发表于 2022-11-10 00:54

过来学习一下，不错。

869175743 发表于 2022-11-10 08:01

能适用到采集规则就好了

weimeigame 发表于 2022-11-10 08:26

过来学习一下，不错。

petal 发表于 2022-11-10 09:25

等方便了试试，感谢楼主分享

huashengyue 发表于 2022-11-10 09:28

konley 发表于 2022-11-9 23:47
大部分的情况下，所有章节直接保存成一个文件更为合适，如word、pdf、epub等
当然也可以学一下python生成e ...

大佬牛啊，分享代码上来吧

页: [1] 2 3

吾爱破解 - 52pojie.cn's Archiver

某不知名小说网爬取小说章节