应该算是我这个新人在吾爱的首贴了
前言
最近在初学 Python,之前看到了吾爱的一个帖子:https://www.52pojie.cn/thread-699885-1-1.html ,于是萌生了爬小说的想法
由于是py新人,所以代码可能不够简练,都是想到哪写到哪,但是看起来可能不会太吃力,应该还是比较好理解的
大神勿喷!
食用方法
1.首先需要python环境,这个网上教程也很多,我就不说明了
2.仅支持 Python 3,Python 2 无法运行
3.不需要安装第三方库,一切操作基于标准库完成
爬取的对象:第三方小说网站:顶点小说网
以小说:修真聊天群 为例
运行脚本后,按提示输入小说目录页的地址(例如 http://www.booktxt.net/1_1137/ ),然后静静等待下载结束即可
后面有图;因为图链可能会挂,所以把图片作为附件上传了
源码
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib.request
import re
import os
import time
import threading
import shutil
# Regular expressions used to scrape the site's HTML.
# Chapter body: everything inside <div id="content">...</div>.
txt_content_partern = '<div id="content">(.*?)</div>'
# Novel title: the first <h1> on the catalog page.
txt_name_partern = '<h1>(.*?)</h1>'
# Catalog entry: captures (chapter page id, chapter title).  Written as a raw
# string so that "\w" reaches the regex engine instead of being treated as an
# (invalid) string escape; the pattern text itself is unchanged.
catalog_partern = r'<dd><a href="/\w+_\w+/(.*?).html">(.*?)</a></dd>'

# Shared state used by the worker threads.
# Index of the most recently claimed chapter batch (-1: nothing claimed yet).
flag = -1
# Total number of chapters; set by main() once the catalog has been parsed.
max_len = 0
# Catalog entries as (chapter page id, chapter title) tuples; set by main().
atalog = []
# Number of chapters per batch; each batch is written to one temporary file.
txt_max = 20
# Number of worker threads started by main().
max_thread = 20
# Number of worker threads that have finished.
thread_stop = 0
# Timestamp taken at start-up for the elapsed-time report.  time.clock() was
# removed in Python 3.8, so fall back to time.perf_counter() when it is absent.
start_time = getattr(time, 'clock', time.perf_counter)()

# Request headers installed on the global urllib opener by the entry point.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    # A Host header carries the host name only, never the URL scheme.
    'Host': 'www.booktxt.net',
    'Referer': 'https://www.google.com.hk/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
}
def down_txt(url, txtname, filename):
    """Download one chapter page and append it to *filename*.

    The page is fetched from *url*, decoded as GBK, and the first match of
    ``txt_content_partern`` is taken as the chapter body.  The chapter title
    (*txtname*) and the cleaned body are then appended to *filename*.

    Fetching/parsing is attempted up to 10 times with a 0.5 s pause between
    attempts.  If every attempt fails a message is printed and the function
    returns without touching the file (fetch/parse errors are not raised).
    Errors while writing the file propagate to the caller.
    """
    for attempt in range(10):
        try:
            with urllib.request.urlopen(url) as response:
                html_data = response.read().decode('gbk')
            content = re.findall(txt_content_partern, html_data, re.S | re.M)
            # "<br />" must be stripped before spaces are removed: deleting
            # the space first turns it into "<br/>", which would then leak
            # into the text as "br/>".
            # NOTE(review): "&nbsp;" is stripped too, on the assumption that
            # the page indents paragraphs with it -- confirm against the site.
            text = (content[0]
                    .replace("<br />", "")
                    .replace("&nbsp;", "")
                    .replace(" ", "")
                    .replace("\r\n\r\n", "\r\n")
                    .replace("<", "")
                    .replace("/p>", ""))
        except Exception:
            # Deliberately broad: network errors, decode errors and a missing
            # content <div> (IndexError) are all treated as "try again".
            if attempt == 9:
                print("请求失败次数过多,请重新下载")
                return
            print("请求失败,正在重试...")
            time.sleep(0.5)
            continue
        # Write only once the whole chapter is available, so a failed attempt
        # can never leave a duplicated title or a half-written chapter behind.
        with open(filename, "a") as fo:
            fo.write("\r\n" + txtname + "\r\n")
            fo.write(text)
        return
# Guards the shared batch counter (flag) and finished-worker counter
# (thread_stop), which every worker thread reads and updates.
_batch_lock = threading.Lock()


def down_mul(url, cnt, file_path):
    """Worker thread body: repeatedly claim a batch of chapters and download it.

    A batch is ``txt_max`` consecutive entries of ``atalog``.  Every chapter of
    a batch is appended to one file inside *file_path*, named after the
    1-based index of the batch's first chapter (``1.txt``, ``21.txt``, ...).

    url       -- base URL; the chapter page id plus ".html" is appended to it.
    cnt       -- worker number, used only in progress messages.
    file_path -- directory that receives the per-batch files.

    ``thread_stop`` is incremented exactly once when the worker ends, even if
    it ends because of an unexpected error.
    """
    global flag, thread_stop
    claimed_any = False
    try:
        while True:
            # Claim the next batch atomically so that no two workers get the
            # same batch and no batch is skipped.
            with _batch_lock:
                star = (flag + 1) * txt_max
                if star >= max_len:
                    break
                flag += 1
            end = min(star + txt_max, max_len)
            print("正在抓取章节" + str(star) + '-' + str(end) + '...')
            claimed_any = True
            batch_file = os.path.join(file_path, str(star + 1) + '.txt')
            for i in range(star, end):
                for attempt in range(10):
                    try:
                        down_txt(url + atalog[i][0] + ".html", atalog[i][1], batch_file)
                        break
                    except Exception:
                        # The attempt counter decides when to give up.
                        if attempt == 9:
                            print("请求失败次数过多,请重新下载")
                            break
                        print("请求失败,正在重试...")
                        time.sleep(0.5)
    finally:
        # Always report completion; a waiter on thread_stop must not hang.
        with _batch_lock:
            thread_stop += 1
    if claimed_any:
        print("线程[" + str(cnt) + "]运行完毕...")
    else:
        print("线程[" + str(cnt) + "]未获取到任务...")
def _fetch_catalog(url):
    """Return (title, chapters) scraped from the catalog page, or None after 10 failed attempts."""
    for attempt in range(10):
        try:
            with urllib.request.urlopen(url) as response:
                html_data = response.read().decode('gbk')
            title = re.findall(txt_name_partern, html_data)[0]
            chapters = re.findall(catalog_partern, html_data)
        except Exception:
            # Deliberately broad: any network/decode/parse problem means retry.
            if attempt == 9:
                print("请求失败次数过多,请重新下载")
                return None
            print("请求失败,正在重试...")
            time.sleep(0.5)
            continue
        return title, chapters
    return None


def _prepare_work_dir(path):
    """Create the working directory, or empty it if it already exists."""
    if not os.path.exists(path):
        os.mkdir(path)
        return
    for name in os.listdir(path):
        os.remove(os.path.join(path, name))


def _batch_file_order(filename):
    """Sort key that orders the per-batch files ("1.txt", "21.txt", ...) numerically."""
    stem = os.path.splitext(filename)[0]
    if stem.isdigit():
        return (0, int(stem), filename)
    return (1, 0, filename)


def _merge_batches(work_dir, target):
    """Concatenate every file in *work_dir*, in chapter order, into *target*."""
    filenames = sorted(os.listdir(work_dir), key=_batch_file_order)
    print(filenames)
    with open(target, "w") as merged:
        for name in filenames:
            with open(os.path.join(work_dir, name)) as part:
                merged.writelines(part)


def main():
    """Interactively download a whole novel into ``<title>.txt``.

    Asks for the catalog URL, scrapes the title and chapter list, downloads the
    chapters with ``max_thread`` worker threads into a working directory named
    after the novel, merges the per-batch files into ``<title>.txt`` in the
    current directory, removes the working directory and prints the elapsed
    time.  Returns early (after printing a message) if the catalog page cannot
    be fetched.
    """
    global atalog, max_len
    # time.clock() no longer exists on Python 3.8+; measure wall time locally.
    started = time.perf_counter()
    url_1 = input("请输入需要下载的小说目录地址,仅限顶点小说网[www.booktxt.net]:").strip()
    # Chapter URLs are built as url_1 + "<id>.html", so the base must end in "/".
    if not url_1.endswith('/'):
        url_1 += '/'
    print('正在抓取目录章节...')
    # url_1='http://www.booktxt.net/1_1137/'
    catalog = _fetch_catalog(url_1)
    if catalog is None:
        # Nothing to download; previously this path crashed with a NameError.
        return
    txt_name, atalog = catalog
    print('小说名称:' + txt_name)
    print('章节目录抓取完毕...总章节数:' + str(len(atalog)))

    files = txt_name
    _prepare_work_dir(files)
    max_len = len(atalog)

    workers = []
    for x in range(max_thread):
        t = threading.Thread(target=down_mul, args=(url_1, x + 1, files))
        print('线程[' + str(x + 1) + ']Runing Star')
        t.start()
        workers.append(t)

    # Wait on the threads themselves rather than on a counter, so a worker
    # that dies unexpectedly cannot leave this loop spinning forever.
    pending = [t for t in workers if t.is_alive()]
    while pending:
        print("正在抓取...请稍后...剩余线程数:" + str(len(pending)))
        pending[0].join(timeout=5)
        pending = [t for t in pending if t.is_alive()]

    print("等待合并章节...")
    _merge_batches(files, txt_name + '.txt')
    print("合并章节完成...等待删除工作目录...")
    shutil.rmtree(files)

    elapsed = int(time.perf_counter() - started)
    h, remainder = divmod(elapsed, 3600)
    m, s = divmod(remainder, 60)
    print("小说下载完成,总共消耗时间:", h, "小时", m, '分钟', s, '秒')
    # Keep the console window open until the user presses Enter.
    input()
if __name__ == '__main__':
    # Script entry point: install a global opener that sends the configured
    # request headers with every urllib request, then start the download.
    custom_opener = urllib.request.build_opener()
    custom_opener.addheaders = list(headers.items())
    urllib.request.install_opener(custom_opener)
    main()