本帖最后由 gentlespider 于 2021-4-28 08:39 编辑
好久没发帖了,一时想不起来做点啥,就拿某阁网站写了个小说下载脚本:运行后输入书名,程序会列出带编号的搜索结果,再输入对应编号,就会按章节依次爬取整本书。下载的内容会保存为"书名.txt",放在程序运行的当前目录(一般就是脚本的同级目录)下。
下面是效果图
接下来是源码
使用了第三方库:requests 和 lxml。
安装方法:在命令行执行 pip install requests lxml 即可(有问题可自行百度)。不想安装环境的话,文末有我打包好的 exe 程序,自行下载。
import time
import requests
from lxml import etree
class MyBook():
    """Interactive downloader for novels hosted on www.xbiquge.la.

    Workflow: ``get_book_url`` searches by title and lets the user pick a
    result, ``get_cha_url`` collects the chapter links of the chosen book, and
    ``get_content`` fetches one chapter and appends it to ``<book_name>.txt``
    in the current working directory.

    Relies on the module-level imports ``time``, ``requests`` and
    ``lxml.etree`` (imported as ``etree``).
    """

    _BASE_URL = 'http://www.xbiquge.la'
    _SEARCH_URL = _BASE_URL + '/modules/article/waps.php'
    # Seconds to wait for a response before treating the request as failed.
    _TIMEOUT = 30
    # A chapter is requested at most this many times before being skipped.
    _MAX_ATTEMPTS = 2
    # Pause between two attempts for the same chapter, in seconds.
    _RETRY_DELAY = 3
    # Maximum number of characters written per line of the output file.
    _LINE_WIDTH = 50

    def __init__(self):
        # 1-based number of the attempt currently running in get_content();
        # it is back at 1 whenever get_content() is not retrying.
        self.index = 1
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }

    def get_book_url(self, bookname):
        """Search for *bookname*, let the user choose a hit and download it.

        Prints the numbered search results, reads the chosen number from
        stdin and hands the selected book to ``get_cha_url``. Prints a message
        and returns without downloading when nothing was found or the entered
        number is not one of the listed entries.
        """
        data = {
            'searchkey': bookname
        }
        # Send the same headers as every other request of this class.
        res = requests.post(
            self._SEARCH_URL,
            data=data,
            headers=self.headers,
            timeout=self._TIMEOUT,
        ).content.decode('utf-8')
        search_ele = etree.HTML(res)
        # The first row of the result table is the column header.
        rows = search_ele.xpath('//table[@class="grid"]/tr')[1:]
        if not rows:
            print('没有搜索到内容,可搜书名和作者,请您少字也别输错字!')
            return

        print('查询结果:')
        for num, row in enumerate(rows, start=1):
            book_name = ''.join(row.xpath('td[1]//text()')).strip()
            author_nodes = row.xpath('td[3]/text()')
            author = author_nodes[0] if author_nodes else ''
            print(f'{num}.书名:{book_name} , 作者:{author}')

        choice = input('请输入编号,进行下载:')
        try:
            number = int(choice)
        except ValueError:
            number = 0
        # Reject anything outside the printed list, including 0 and negatives
        # which would otherwise index the wrong row.
        if not 1 <= number <= len(rows):
            print('编号无效,请输入列表中的编号。')
            return

        chosen = rows[number - 1]
        book_link = chosen.xpath('td[1]/a/@href')[0]
        # The title becomes the output file name, so drop stray whitespace.
        book_name = ''.join(chosen.xpath('td[1]//text()')).strip()
        self.get_cha_url(book_link, book_name)

    def get_cha_url(self, book_link, book_name):
        """Download every chapter linked from the book page at *book_link*.

        Chapter links on the page are site-relative; each one is prefixed with
        the site root and passed to ``get_content`` in page order.
        """
        print("下载中...")
        res = requests.get(
            book_link, headers=self.headers, timeout=self._TIMEOUT
        ).content.decode('utf-8')
        ele = etree.HTML(res)
        for href in ele.xpath('//div[@id="list"]/dl/dd/a/@href'):
            self.get_content(self._BASE_URL + href, book_name)
        print('恭喜,下载完毕')

    def get_content(self, char_url, book_name):
        """Fetch one chapter and append it to ``<book_name>.txt``.

        The request is attempted up to ``_MAX_ATTEMPTS`` times, pausing
        ``_RETRY_DELAY`` seconds between attempts. A chapter that still fails
        is skipped so the remaining chapters can be downloaded. HTTP error
        responses (for example 504) count as failures.
        """
        cha_res = None
        # Every chapter starts with a fresh attempt counter, so a retry spent
        # on one chapter never reduces the attempts of the next one.
        for attempt in range(1, self._MAX_ATTEMPTS + 1):
            self.index = attempt
            try:
                response = requests.get(
                    char_url, headers=self.headers, timeout=self._TIMEOUT
                )
                response.raise_for_status()
                cha_res = response.content.decode('utf-8')
                break
            except (requests.RequestException, UnicodeDecodeError):
                print(f"当前章节出错,章节url:{char_url},重新获取中")
                if attempt < self._MAX_ATTEMPTS:
                    time.sleep(self._RETRY_DELAY)
        self.index = 1
        if cha_res is None:
            return

        cha_ele = etree.HTML(cha_res)
        cha_name = cha_ele.xpath('//h1/text()')[0]
        char_content = cha_ele.xpath('//div[@id="content"]/text()')
        print('正在存储:', cha_name)
        with open(f'{book_name}.txt', 'a', encoding='utf-8') as w:
            w.write(cha_name + '\n')
            w.writelines(
                self._wrap_paragraph(piece, self._LINE_WIDTH)
                for piece in char_content
            )

    @staticmethod
    def _wrap_paragraph(text, width=50):
        """Return *text* cut into newline-terminated rows of at most *width* characters.

        Characters are never dropped or reordered; an empty *text* yields a
        single newline.
        """
        rows = [text[start:start + width] for start in range(0, len(text), width)]
        return '\n'.join(rows) + '\n'
# Script entry point: create the downloader, ask for a title on stdin, then
# run the search / select / download flow for that title.
mybook = MyBook()
bookname = input('请输入书名,输入完按回车:')
mybook.get_book_url(bookname)
说明:该网站的爬取速度没法提得太快,稍微快一点就会返回 504,所以这份成品源码里没有添加提速(例如多线程)的代码。感兴趣的伙伴可以自己实验一下,测试开到多快还能稳定下载,欢迎分享、交流!
如有侵权联系我,马上删帖!
|