# [Python] plain-text view / copy code  (forum listing header; not part of the program)
import binascii
import PyPDF2
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter
from io import BytesIO
import os
import re
import sys
import requests
from lxml import etree
import img2pdf
from tqdm import tqdm
import threading
import shutil
import tkinter as tk
from tkinter import ttk, messagebox, simpledialog, DISABLED, NORMAL
import json
import win32api
# Request headers sent with the HTTP calls below; asks for the connection
# to be closed after each response instead of being kept alive.
headers = {'Connection': 'close'}

# Names shared between the functions below through ``global`` statements:
# mobile_url, page, pic_name_L, book_num, config_value, outfile.
# (A ``global`` statement at module scope has no effect, so they are only
# listed here for the reader.)

# Silence all warnings unless the user explicitly asked for them
# (-W option / PYTHONWARNINGS).
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
# 解密zip生成pdf文件,并去除pdf密码
def zip2pdf(zip_path):
    """Turn one downloaded page container into a password-free PDF.

    Layout of the container as this function reads it:

    * bytes 1080-1082: first half of the PDF password (gb18030 text)
    * bytes 1083-5082: first 4000 bytes of the PDF, each stored as ``255 - b``
    * bytes 5083 up to 1003 bytes before the end: rest of the PDF, verbatim
    * the 3 bytes starting 1003 bytes before the end: second half of the password

    The decrypted pages are written to ``zip_path`` with its last four
    characters (the extension) replaced by ``_decode.pdf``. A progress line
    is appended to the log widget ``t`` using the module-level ``book_num``
    and ``page``.

    Args:
        zip_path: Path of the downloaded container file.
    """
    # Kept for backward compatibility: the reader has always been published
    # as a module-level name.
    global pdf_reader

    with open(zip_path, 'rb') as container:
        raw = container.read()

    # The password is split across two fixed positions in the container.
    password = (raw[1080:1083].decode('gb18030', "ignore")
                + raw[-1003:-1000].decode('gb18030', "ignore"))

    # Only the first 4000 PDF bytes are scrambled (bitwise complement).
    scrambled_head = raw[1083:1083 + 4000]
    plain_tail = raw[1083 + 4000:-1003]
    pdf_bytes = bytes(255 - b for b in scrambled_head) + plain_tail

    # Work on the encrypted PDF in memory; no intermediate file is needed.
    pdf_reader = PdfFileReader(BytesIO(pdf_bytes))
    pdf_reader.decrypt(password)

    pdf_writer = PdfFileWriter()
    for page_index in range(pdf_reader.getNumPages()):
        pdf_writer.addPage(pdf_reader.getPage(page_index))

    with open(f'{zip_path[:-4]}_decode.pdf', 'wb') as out:
        pdf_writer.write(out)

    t.insert('end', f'页码【{book_num}/{page}】,是加密PDF文件,解密成功...\n')
    t.see(tk.END)
# 合并PDF
def mergepdf(target_path, outfile):
    """Merge every ``<n>_decode.pdf`` in ``target_path`` into ``outfile``.

    Files are merged in numeric order of ``<n>`` and each one gets a
    bookmark named after its 1-based position. Progress is shown against
    the module-level ``page`` count. Afterwards ``ml()`` is called to add
    the table-of-contents bookmarks and ``target_path`` is deleted.

    Args:
        target_path: Directory holding the per-page ``*_decode.pdf`` files.
        outfile: Path of the merged PDF to write.
    """
    suffix = '_decode.pdf'
    names = sorted(
        (name for name in os.listdir(target_path) if name.endswith(suffix)),
        key=lambda name: int(name[:-len(suffix)]),
    )

    file_merger = PdfFileMerger()
    if names:
        # Create the progress widgets once and update them in the loop,
        # instead of stacking a fresh bar and button for every page.
        progress_bar = ProcessBar()
        progress_text = ttk.Button(win, style='My.TButton', width=23)
        progress_text.place(x=164, y=50)

    for index, name in enumerate(names, start=1):
        per = round((index / page) * 100, 2)
        progress_text['text'] = f'合并进度:{per}%-{index}/{page}'
        progress_bar.value(index)
        file_merger.append(os.path.join(target_path, name), bookmark=str(index))

    file_merger.write(outfile)  # write the merged document
    file_merger.close()
    ml()
    shutil.rmtree(target_path)
# 添加目录书签
def ml():
    """Add the book's table-of-contents bookmarks to the merged PDF.

    Reads ``javascript/bookmark_config.js`` next to the module-level
    ``mobile_url``. When it yields captions, the module-level ``outfile`` is
    rewritten in place with one bookmark per chapter and nested bookmarks
    for each chapter's children. When it yields none (or cannot be fetched
    or parsed), a note is written to the log widget ``t`` and the PDF is
    left as it is. In every case the ``btnClear`` button is enabled at the end.
    """
    entries = _load_bookmark_entries()
    if entries:
        # Load the whole file into memory first so the same path can safely
        # be reopened for writing below.
        with open(outfile, 'rb') as src:
            old_pdf = PdfFileReader(BytesIO(src.read()))
        new_pdf = PdfFileWriter()
        for page_index in range(old_pdf.getNumPages()):
            new_pdf.addPage(old_pdf.getPage(page_index))

        # Most recent bookmark seen at each level; a bookmark's parent is
        # the most recent one at the level above it (none for level 0).
        parent_set = {}
        for level, title, target_page in entries:
            parent_set[level] = new_pdf.addBookmark(
                title, target_page, parent=parent_set.get(level - 1))

        with open(outfile, 'wb+') as f:
            new_pdf.write(f)
    else:
        t.insert('end', '没有找到书签,使用页码书签\n')
    btnClear['state'] = 'normal'


def _load_bookmark_entries():
    """Fetch the bookmark config and flatten it to ``(level, title, page_index)`` tuples.

    ``level`` is 0 for a chapter and 1 for one of its children;
    ``page_index`` is the configured page number minus one. Returns an empty
    list when the config has no captions or cannot be fetched or parsed.
    """
    import ast  # function-scope import: the file's import block has no ``ast``

    bookmark_url = mobile_url.split('index.html')[0] + 'javascript/bookmark_config.js'
    try:
        res = requests.get(bookmark_url, headers=headers).text
        # SECURITY: this text comes from a remote server. It used to be passed
        # to eval() to resolve its escape sequences, which would execute any
        # code the server chose to send. literal_eval resolves the same
        # string-literal escapes but rejects everything that is not a literal.
        decoded = ast.literal_eval("u" + "\'" + res + "\'")
        bookmark_L = re.findall(r'ols = (.*?);', decoded)[0]
        if 'caption' not in bookmark_L:
            return []

        entries = []
        for chapter in json.loads(bookmark_L):
            entries.append((0, str(chapter['caption']).strip(),
                            int(str(chapter['page']).strip()) - 1))
            for section in chapter.get('children') or []:
                entries.append((1, str(section['caption']).strip(),
                                int(str(section['page']).strip()) - 1))
        return entries
    except (requests.RequestException, SyntaxError, ValueError,
            IndexError, KeyError, TypeError):
        # Bookmarks are optional; a missing or malformed config must not
        # abort the merge that called us.
        return []
def open_pdf():
    """Run ``open_pdf_`` on a new thread (command of the open-PDF button)."""
    opener = threading.Thread(target=open_pdf_)
    opener.start()
def open_pdf_():
    """Pass the module-level ``outfile`` to the shell's 'open' verb if it exists."""
    if not os.path.exists(outfile):
        return
    quoted_path = rf'"{outfile}"'
    win32api.ShellExecute(0, 'open', quoted_path, '', '', 0)
# 下载
def downlaod_pic():
    """Download every page of the book at ``mobile_url`` and merge them into one PDF.

    Steps:
      1. Fetch the book's index page and take its ``<title>`` as the book name.
      2. Locate and fetch ``javascript/config.js`` to learn the image
         directory, the page count and (for some books) the page file names.
      3. Download each page. ``.jpg`` pages are converted with img2pdf,
         anything else is handed to ``zip2pdf``.
      4. Merge the per-page PDFs with ``mergepdf`` and report the result in
         the log widget ``t``.

    Output goes to ``<cwd>\\云展网\\<book name>.pdf``. Sets the module-level
    ``page``, ``pic_name_L``, ``book_num``, ``config_value`` and ``outfile``.
    """
    global page, pic_name_L, book_num, config_value, outfile

    t.insert('end', '正在解析,请稍等...\n')
    path = os.getcwd() + '\\' + '云展网'  # output directory
    os.makedirs(path, exist_ok=True)

    res0 = requests.get(mobile_url, headers=headers).content.decode("UTF-8")  # page source
    pathname = etree.HTML(res0).xpath('/html/head/title/text()')  # book title
    # The title becomes a directory and file name, so replace characters
    # that are not allowed in Windows file names.
    book_title = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', str(pathname[0])).strip(' .')
    zip_path = path + '/' + book_title + '/'
    os.makedirs(zip_path, exist_ok=True)

    # Find the <script src="javascript/config.js?..."> line. Start from an
    # empty value so a result left over from a previous book is never reused.
    config_value = []
    for line in res0.split('\n'):
        if 'javascript/config.js' in line:
            config_value = re.findall(r'src="(.*?)"', line, re.S)
    if not config_value:
        t.insert('end', '未找到config.js,无法下载\n')
        t.see(tk.END)
        return

    config_url = mobile_url.split('index.html')[0] + config_value[0]
    res1 = requests.get(config_url, headers=headers).content.decode("UTF-8").replace(' ', '').replace(',', ',\n')
    pic_C_L, pic_name_L, page = _parse_book_config(res1)

    t.insert('end', '正在下载,请稍等...\n')
    page_base_url = mobile_url.split('/mobile/index.html')[0] + pic_C_L

    # One progress bar and one label for the whole download, updated in place.
    progress_bar = ProcessBar()
    progress_text = ttk.Button(win, style='My.TButton', width=23)
    progress_text.place(x=164, y=50)

    for i in tqdm(range(page), desc=f'{pathname[0]}'):
        per = round(((i + 1) / page) * 100, 2)
        progress_text['text'] = f'下载进度:{per}%-{i + 1}/{page}'
        progress_bar.value(i + 1)

        # Books without an explicit name list use 1.jpg, 2.jpg, ...
        pic_name = pic_name_L[i] if i < len(pic_name_L) else f'{i + 1}.jpg'
        book_num = f'{i + 1}'
        zip_ = requests.get(page_base_url + pic_name, headers=headers)
        if zip_.status_code != 200:  # page not available: skip it
            continue

        d_file = zip_path + book_num + pic_name[-4:]
        with open(d_file, 'wb') as f:
            f.write(zip_.content)
        if pic_name[-4:] == '.jpg':
            with open(d_file[:-4] + '_decode.pdf', 'wb') as f:
                f.write(img2pdf.convert(d_file))
            t.insert('end', f'页码【{book_num}/{page}】,是JPG图片文件,转换成功...\n')
            t.see(tk.END)
        else:
            zip2pdf(d_file)
        os.unlink(d_file)

    outfile = path + '/' + book_title + '.pdf'
    progress_text['text'] = '正在合并PDF'
    # Merge synchronously: the merge reads the module-level mobile_url, page
    # and outfile, so it must finish before another book can change them and
    # before success is reported below.
    mergepdf(zip_path, outfile)

    t.insert('end', f'大功告成!\nPDF 位置:{path}\\{book_title}.pdf\n')
    t.see(tk.END)
    progress_text['text'] = '下载完成'
    # The merge step places its own widgets at the same spot; bring this
    # label back to the front so the final state is visible.
    progress_text.lift()


def _parse_book_config(config_text):
    """Extract ``(image_dir, page_names, page_count)`` from the text of config.js.

    ``page_names`` is empty for config variants that only state a page
    count; the caller then falls back to ``1.jpg``, ``2.jpg``, ...
    """
    if 'fliphtml5_pages' in config_text:
        image_dir = re.findall(r'"normalPath":\["..(.*?)"', config_text, re.S)[0].replace('\\', '')
        page_names = re.findall(r'"n":\["(.*?)"', config_text, re.S)
        return image_dir, page_names, len(page_names)

    # Two spellings of each setting are handled: attribute style
    # (name="value") first, then JSON style ("name":value,).
    try:
        image_dir = re.findall(r'normalPath="..(.*?)"', config_text, re.S)[0]
    except IndexError:
        image_dir = re.sub(r'["\[\]\\.]', '',
                           re.findall(r'normalPath":(.*?),', config_text, re.S)[0])
    try:
        page_count = int(re.findall(r'totalPageCount="(.*?)"', config_text, re.S)[0])
    except (IndexError, ValueError):
        page_count = int(re.findall(r'totalPageCount":(.*?),', config_text, re.S)[0].replace('"', ''))
    return image_dir, [], page_count
def ProcessBar1():
    """Place an indeterminate progress bar at (14, 80) in ``win`` and start its animation."""
    busy_bar = ttk.Progressbar(
        win,
        orient=tk.HORIZONTAL,
        mode="indeterminate",
        maximum=100,
        length=473,
    )
    busy_bar.place(x=14, y=80, width=473, height=10)
    busy_bar.start()
class ProcessBar:
    """Determinate progress bar whose maximum is the module-level ``page`` count."""

    def __init__(self):
        """Create the bar in ``win`` and place it at (14, 80)."""
        bar = ttk.Progressbar(
            win,
            orient=tk.HORIZONTAL,
            mode="determinate",
            maximum=page,
            length=473,
        )
        bar.place(x=14, y=80, width=473, height=10)
        self.process = bar

    def value(self, per):
        """Set the bar's value to ``per``, then call ``win.update()``."""
        self.process.configure(value=per)
        win.update()
# 清除输入框文本
def delete_entry():
    """Delete everything in the URL entry ``b1``."""
    b1.delete(0, tk.END)
def url():
    """Start a download for the address typed into the entry ``b1``.

    Three kinds of address are handled:

    * a bookcase page (address contains ``bookcase``): every book listed in
      its ``js/bookConfig.js`` is downloaded in turn, in the calling thread;
    * a book's ``.../index.html`` page: used directly as ``mobile_url``;
    * any other book page: the link inside the
      ``basic-bottom-name pointer`` element is used to build ``mobile_url``.

    For the last two, ``downlaod_pic`` runs on a new thread. Problems are
    reported in the log widget ``t`` (and a message box when the page does
    not exist).
    """
    global mobile_url

    # NOTE(review): btnClearClick is not defined in the visible part of this
    # file. Confirm it exists elsewhere, otherwise this call raises NameError.
    btnClearClick()

    address = b1.get()
    if address == '':
        t.insert('end', '搜索框不能为空\n')
        t.see(tk.END)
        return

    # Fetch the page once and reuse the response for every check below.
    response = requests.get(address, headers=headers)
    res = response.content.decode("UTF-8")  # page source
    page_tital = etree.HTML(res).xpath('/html/head/title/text()')[0]  # page title
    if '你所访问的页面不存在' in page_tital:
        messagebox.showinfo("信息提示", "链接地址有误或者内容已被删除")
        t.insert('end', '链接地址有误或者内容已被删除\n')
        t.see(tk.END)
        return

    if 'bookcase' in address:
        # Find the <script src=".../js/bookConfig.js?..."> line.
        book_url = None
        for line in res.split('\n'):
            if 'js/bookConfig.js' in line:
                book_url = re.findall(r'src="(.*?)"', line, re.S)[0]
        if book_url is None:
            t.insert('end', '链接地址有误或者内容已被删除\n')
            t.see(tk.END)
            return

        book = requests.get(book_url, headers=headers).content.decode("UTF-8").replace(',', ',\n')
        mobile_url_L = re.findall(r'"url":"(.*?)",', book, re.S)
        t.insert('end', f'共[{len(mobile_url_L)}]本书籍\n')
        t.see(tk.END)
        for j, _url in enumerate(mobile_url_L, start=1):
            t.insert('end', f'正在下载第[{j}]本\n')
            t.see(tk.END)
            mobile_url = _url.replace('\\', '') + 'mobile/index.html'
            downlaod_pic()
        return

    if 'index.html' in address:
        mobile_url = address
    else:
        # Re-decode the same response with the detected encoding before
        # looking for the link to the book.
        response.encoding = response.apparent_encoding
        book_href = etree.HTML(response.text).xpath(
            '//div[@class="basic-bottom-name pointer"]/a/@href')[0]
        mobile_url = book_href.split('index.html')[0] + 'mobile/index.html'  # book address
    threading.Thread(target=downlaod_pic).start()
    ProcessBar1()
# enter调用
def btn_click_enter(self):
    """Handler bound to <Return> on the URL entry; the argument is ignored and ``url()`` is run."""
    url()
# ---- Main window construction (runs at import time) ----
win = tk.Tk()
# Window size in pixels
width = 500
height = 330
win.geometry(f'{width}x{height}')
# Top-left offset that centres the window on the screen
# (despite their names, these hold offsets, not the screen size)
screen_width = (win.winfo_screenwidth() - width) / 2
screen_height = (win.winfo_screenheight() - height) / 2
win.title('云展网书籍下载器')
# win.attributes("-toolwindow", 1)
win.resizable(False, False)
win.geometry(f"+{int(screen_width)}+{int(screen_height)}")
gui_style = ttk.Style()
# Style used by the progress-text buttons: red foreground
gui_style.configure('My.TButton', foreground='red')
# Caption label above the URL entry
L1 = tk.Label(win, text='云展网书籍网址:', font=('SimHei', 12))
L1.place(x=14, y=6)
# Single-line entry for the book URL; takes keyboard focus, <Return> starts the download
b1 = tk.Entry(win, font=('', 10), width=67)
b1.place(x=14, y=30)
b1.focus_set()
b1.bind("<Return>", btn_click_enter)
# "Start download" button
a = ttk.Button(win, text="开始下载", width=20, command=url)
a.place(x=14, y=50)
# Progress percentage display (a disabled button used as a label)
per_ = ttk.Button(win, style='My.TButton', text=f'数字百分比进度', width=23)
per_.place(x=164, y=50)
per_['state'] = 'disabled'
# "Open current PDF" button (named btnClear)
btnClear = ttk.Button(win, text='打开当前PDF', width=20, command=open_pdf)
btnClear.place(x=335, y=50)
# Disabled by default; ml() sets it back to 'normal'
btnClear['state'] = 'disabled'
# Multi-line log box: width, height, font, text colour and background colour
t = tk.Text(win, width=78, height=16, font=('', 9), fg="green", bg="#191919", spacing1=1, spacing2=1, spacing3=1,
insertbackground='#fff', bd=0.8) # shows the multi-line log text
t.place(x=14, y=95)
# Show the indeterminate progress bar, then enter the Tk event loop
ProcessBar1()
win.mainloop()