# Novel downloader: reads site config from 网址接口.ini, fetches the chapter
# list, downloads each chapter in a worker pool, then merges them into one file.
import configparser
import os
import re
import time
from multiprocessing import Pool
from urllib.parse import urljoin  # join relative chapter links onto the base URL

import chardet
import requests
from lxml import etree
from requests.adapters import HTTPAdapter
# Module-level configuration, populated by duini() from 网址接口.ini.
xzwz = ''  # URL of the novel's index (download) page
ml = ''    # XPath expression for chapter links on the index page
bt = ''    # XPath expression for chapter titles on the index page
zw = ''    # XPath expression for the chapter body text

# Browser-like headers to avoid trivial bot blocking.
# (The User-Agent literal was broken across two lines in the pasted source;
# reconstructed here as a single string.)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
def mkdir(path): # 创建目录
isExists = os.path.exists(path) # 判断路径是否存在 # 存在 True # 不存在 False
if not isExists: # 判断结果 # 如果不存在则创建目录
os.makedirs(path) # 创建目录操作函数
print(path + ' 创建成功')
return True
else:
# 如果目录存在则不创建,并提示目录已存在
print(path + ' 目录已存在')
return False
def duini():
    """Load scraping settings from 网址接口.ini into the module globals.

    Takes the first option of the [下载页] section as the site key and
    download URL, then reads that site's XPath expressions
    (目录连接 / 目录标题 / 正文).  Finally creates the output directory
    小说下载 under the current directory and switches into it.
    """
    global xzwz, bt, zw, ml
    cf = configparser.ConfigParser()
    cf.read("网址接口.ini")
    # First configured site wins.
    site = cf.options('下载页')[0]
    print(site)
    xzwz = cf.get('下载页', site)
    ml = cf.get(site, '目录连接')
    bt = cf.get(site, '目录标题')
    zw = cf.get(site, '正文')
    out_dir = os.getcwd() + '/小说下载'
    mkdir(out_dir)
    os.chdir(out_dir)
def huoqulist():
    """Fetch the index page and return the chapter list.

    Each returned entry is the string 'absolute_url|sanitized_title'.
    Returns an empty list when the request fails or the configured XPath
    expressions match nothing (the original implicitly returned None on a
    request error, which crashed the caller's len() check).
    """
    global xzwz, ml, bt
    print('获取列表中.......')
    print(xzwz)
    s = requests.Session()
    # Retry transient network failures up to 3 times.
    s.mount('http://', HTTPAdapter(max_retries=3))
    s.mount('https://', HTTPAdapter(max_retries=3))
    bturl = []
    try:
        # BUG FIX: the original called bare requests.get(), so the retry
        # adapters mounted above were never used; also add a timeout.
        html = s.get(xzwz, headers=headers, timeout=10)
        # Detect the page encoding so html.text decodes correctly.
        html.encoding = chardet.detect(html.content)['encoding']
        txt = etree.HTML(html.text)
        tlj = txt.xpath(ml)  # chapter links
        tbt = txt.xpath(bt)  # chapter titles (text() already extracted by the XPath)
        print('一共有 %d 章' % len(tlj))
        if len(tlj) == 0 or len(tbt) == 0:
            print('目录获取失败,检查下载页面和正则配置是否正确')
        else:
            for link, title in zip(tlj, tbt):
                # Strip characters that are unsafe or awkward in file names.
                name = re.sub(r'[\/:*?"“”<>~ !,:‘’|]', '', title)
                # Relative chapter links are resolved against the index URL.
                fullurl = urljoin(xzwz, link)
                bturl.append(fullurl + '|' + name)
    except requests.exceptions.RequestException as e:
        print(e)
    return bturl
def test(i, url_1, zw):
    """Download chapter *i* into '<title>.txt' in the current directory.

    url_1 is an 'url|title' string produced by huoqulist(); zw is the XPath
    expression for the chapter body.  Skips the download when the target
    file already exists.
    """
    time.sleep(1)  # throttle requests to be polite to the server
    lj = url_1.split('|')[0]   # chapter URL
    wjm = url_1.split('|')[1]  # sanitized chapter title -> file name
    wj = os.getcwd() + '/{}.txt'.format(wjm)
    if os.path.exists(wj):
        return  # already downloaded
    print(i, wjm)
    s = requests.Session()
    s.mount('http://', HTTPAdapter(max_retries=3))
    s.mount('https://', HTTPAdapter(max_retries=3))
    try:
        # BUG FIX: use the retry-mounted session instead of bare requests.get.
        html = s.get(lj, headers=headers, timeout=5)
        html.encoding = chardet.detect(html.content)['encoding']
        txt = etree.HTML(html.text)
        nr = txt.xpath(zw)
        # Join the text fragments; str.join avoids quadratic concatenation.
        wb = ''.join(nr)
        time.sleep(0.5)
        # BUG FIX: errors='ignore' prevents a UnicodeEncodeError from
        # aborting the write when the text contains characters outside GBK.
        with open(wj, 'w', encoding='GBK', errors='ignore') as f:
            f.write(wb)
    except requests.exceptions.RequestException as e:
        print(e)
def is_number(s):
    """Return True if *s* parses as a number.

    Accepts anything float() understands, plus single Unicode numeral
    characters (e.g. '七', '½') via unicodedata.numeric().
    """
    try:
        float(s)
    except ValueError:
        pass
    else:
        return True
    try:
        import unicodedata
        unicodedata.numeric(s)
    except (TypeError, ValueError):
        return False
    return True
if __name__ == '__main__':
    print('start')
    duini()
    time.sleep(1)
    clist = huoqulist()
    # BUG FIX: guard with truthiness -- the original len(clist) crashed when
    # huoqulist() returned None after a request error.
    if clist:
        pool = Pool(processes=30)
        # Fan the chapter downloads out across worker processes; each call
        # to test() writes one '<title>.txt' file.
        for i, url_1 in enumerate(clist):
            pool.apply_async(test, args=(i, url_1, zw))
        pool.close()
        pool.join()
        # Merge the per-chapter files into one readable text file.
        # BUG FIX: open with the same GBK encoding the chapters were written
        # in (the original used the locale default), use context managers so
        # handles are closed, and skip chapters that failed to download
        # instead of crashing on FileNotFoundError.
        with open('./new.txt', 'w', encoding='GBK', errors='ignore') as fname:
            for x, entry in enumerate(clist):
                print('正在合并文件第 %s 章' % x)
                name = entry.split('|')[1]
                part = './{}.txt'.format(name)
                if not os.path.exists(part):
                    print(part + ' 缺失,跳过')
                    continue
                with open(part, 'r', encoding='GBK', errors='ignore') as xx:
                    fname.write(xx.read())
    else:
        print('clist 列表为空列表')