本帖最后由 战网无极限 于 2020-12-23 19:48 编辑
刚开始学习 Python,今天实践一下。因为个人比较喜欢看小说,就写了一个新笔趣阁的爬虫。
望大佬指点。因为还没学多线程,所以是个单线程的。
看了一下其他人的帖子,我又添加了从中间某一章开始下载的功能。
[Python] 纯文本查看 复制代码
import os
import re
import requests
# Default HTTP headers sent with every request made by this script.
# A dict literal keeps only the last value given for a repeated key, so
# exactly one User-Agent entry is listed.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
}
# Book title; assigned by the script entry point and read by download_book()
# when it builds the output path.
name = ''
# Zero-based index of the first chapter to download.
zhang = 0
def get_urls(url):
    """Fetch a book's index page and extract its chapter links and title.

    Args:
        url: Address of the book's table-of-contents page.

    Returns:
        A tuple ``(urls, name)``. ``urls`` is a list of
        ``(chapter_href, chapter_title)`` tuples in page order. ``name`` is
        the list of ``<h1>`` texts found on the page; the caller uses it as
        the book title.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    html = requests.get(url, headers=headers, timeout=30)
    # Decode with the encoding detected from the body itself.
    html.encoding = html.apparent_encoding
    page = html.text
    # Chapter entries: (relative URL, chapter title).
    urls = re.findall(r"<dd><a href='(.*?)' >(.*?)</a></dd>", page)
    # The <h1> heading is used as the book title.
    name = re.findall(r"<h1>(.*?)</h1>", page)
    return urls, name
def download_book(urls, zhang):
    """Download chapters and save each one as ``./Books/<name>/<title>.txt``.

    The target folder is built from the module-level ``name`` and must
    already exist.

    Args:
        urls: Sequence of ``(chapter_href, chapter_title)`` entries, where
            ``chapter_href`` is a site-relative path such as
            ``/10/10489/4535761.html``.
        zhang: Zero-based index of the first entry in ``urls`` to download.

    Raises:
        requests.exceptions.RequestException: If a chapter request fails or
            does not complete within the timeout.
        OSError: If a chapter file cannot be opened for writing.
    """
    # Characters that must not appear in a file name. The raw string makes
    # the backslash a real member of the class; the double quote is included
    # because it is also invalid in Windows file names.
    illegal_chars = re.compile(r'[*?:/\\<>|"]')
    content_pattern = re.compile(r'<div id="content">(.*?)<p><a href=')

    for entry in urls[zhang:]:
        # Hrefs are site-relative, so prefix the host.
        book_url = 'http://www.xbiquge.la' + entry[0]
        book_title = illegal_chars.sub('', entry[1])

        # A timeout keeps one stalled chapter from blocking the whole run.
        chapter = requests.get(book_url, headers=headers, timeout=30)
        chapter.encoding = chapter.apparent_encoding
        fragments = content_pattern.findall(chapter.text)

        # Join the matched text directly. str(list) would write the list's
        # repr, with escaped characters and quoting, into the file.
        data = ''.join(fragments)
        # Drop line-break markup and carriage returns, and turn
        # non-breaking spaces (entity or decoded character) into spaces.
        data = (
            data.replace('<br />', '')
            .replace('&nbsp;', ' ')
            .replace('\xa0', ' ')
            .replace('\r', '')
        )

        print("正在下载:%s" % book_title)
        # errors='replace' stops one character outside GBK from aborting the
        # download; the with-statement closes the file.
        with open('./Books/' + name + '/{}.txt'.format(book_title), 'w+',
                  encoding='gbk', errors='replace') as f:
            f.write(data)
def mkdir(name):
    """Ensure the folder ``Books/<name>`` exists under the current directory.

    Prints whether the folder was created or was already present.

    Args:
        name: Folder name for the book, created inside ``Books``.
    """
    # os.path.join uses the platform's separator. A hard-coded backslash
    # creates a single folder literally named "Books\<name>" outside Windows.
    path = os.path.join('Books', name)
    if not os.path.exists(path):
        # makedirs also creates the parent "Books" folder when it is missing.
        os.makedirs(path)
        print('文件夹不存在,已创建')
    else:
        print('文件夹存在')
if __name__ == '__main__':
    # Script entry point: ask for a book ID and a starting chapter, then
    # download every chapter from that point on into Books/<book title>/.
    print("书籍ID由新笔趣阁:www.xbiquge.la 提供")
    bookId = input("请输入书籍ID:").strip()
    # Index page address, e.g. ID 1618 -> http://www.xbiquge.la/1/1618/
    # NOTE(review): the first path segment is fixed to "1"; chapter links on
    # the site can look like /10/10489/..., so other IDs may need a
    # different segment. Confirm against the site.
    url = 'http://www.xbiquge.la/1/' + bookId + '/'
    # The user types a 1-based chapter number. Clamp at 0 so that entering 0
    # or a negative number starts from the first chapter, not from the end.
    zhang = max(int(input("请输入开始章节:")) - 1, 0)
    # Fetch the index page once and unpack both results.
    urls, titles = get_urls(url)
    # Use the first <h1> text as the book title, falling back to the ID when
    # the page has no heading.
    name = titles[0] if titles else bookId
    mkdir(name)
    download_book(urls, zhang)
|