python下载云展网书籍并合并为PDF（附UI成品）

hebeijianke · 发表于 2022-8-7 19:56

本帖最后由 hebeijianke 于 2022-8-27 11:54 编辑

[Python] 纯文本查看 复制代码

import binascii
import PyPDF2
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter
from io import BytesIO
import os
import re
import sys
import requests
from lxml import etree
import img2pdf
from tqdm import tqdm
import threading
import shutil
import tkinter as tk
from tkinter import ttk, messagebox, simpledialog, DISABLED, NORMAL
import json
import win32api

# Default HTTP headers passed to most requests.get() calls in this file.
headers = {'Connection': 'close'}

# NOTE: the original had a module-level `global mobile_url, page, ...` statement here.
# At module scope `global` declares nothing and defines nothing, so it was removed.
# Those names are still created lazily by url() / downlaod_pic() via their own
# `global` declarations.

# Silence all warnings unless the interpreter was started with explicit -W options.
if not sys.warnoptions:
    import warnings

    warnings.simplefilter("ignore")

def _split_encrypted_page(raw):
    """Split one downloaded page container into (password, encrypted PDF bytes).

    Layout, as read by the original implementation:
      * bytes [1080:1083]   -> first half of the PDF password (gb18030 text)
      * bytes [-1003:-1000] -> second half of the PDF password
      * bytes [1083:5083]   -> first 4000 PDF bytes, each stored as ``255 - b``
      * bytes [5083:-1003]  -> remaining PDF bytes, stored unchanged
    """
    password = (raw[1080:1083].decode('gb18030', 'ignore')
                + raw[-1003:-1000].decode('gb18030', 'ignore'))
    inverted_head = raw[1083:1083 + 4000]
    plain_tail = raw[1083 + 4000:-1003]
    return password, bytes(255 - b for b in inverted_head) + plain_tail


def zip2pdf(zip_path):
    """Decode one downloaded page container and write a password-free PDF.

    The result is written next to the input as ``<zip_path minus its 4-char
    extension>_decode.pdf`` and a progress line is appended to the log widget
    ``t``. The input file itself is left in place for the caller to delete.

    Works on the raw bytes directly: the original expanded the whole file into a
    list of two-character hex strings and joined it back, which yields the same
    bytes at several times the memory cost. The intermediate encrypted PDF is
    kept in memory instead of being written to disk and unlinked again.
    """
    # Kept as a module global because the original exposed the reader this way.
    global pdf_reader
    with open(zip_path, 'rb') as src:
        raw = src.read()
    password, encrypted_pdf = _split_encrypted_page(raw)

    pdf_reader = PdfFileReader(BytesIO(encrypted_pdf))
    pdf_reader.decrypt(password)

    # Copying the pages into a fresh writer produces an unencrypted document.
    pdf_writer = PdfFileWriter()
    for i in range(pdf_reader.getNumPages()):
        pdf_writer.addPage(pdf_reader.getPage(i))
    rfname = zip_path[:-4] + '_decode.pdf'
    with open(rfname, 'wb') as out:
        pdf_writer.write(out)

    t.insert('end', f'页码【{book_num}/{page}】，是加密PDF文件，解密成功...\n')
    t.see(tk.END)

def mergepdf(target_path, outfile):
    """Merge every ``<n>_decode.pdf`` in *target_path* into *outfile*.

    Pages are merged in numeric order of ``<n>`` and each page gets a bookmark
    carrying its 1-based position. Afterwards ``ml()`` is called to replace those
    page bookmarks with the book's table of contents, and *target_path* is deleted.

    Args:
        target_path: directory holding the per-page ``*_decode.pdf`` files.
        outfile: path of the merged PDF to write.
    """
    suffix = '_decode.pdf'
    names = [f for f in os.listdir(target_path) if f.endswith(suffix)]
    names.sort(key=lambda x: int(x[:-len(suffix)]))

    # One progress bar and one label for the whole merge; the original created a
    # new pair of widgets on every iteration, stacking them on top of each other.
    bar = ProcessBar()
    per_ = ttk.Button(win, style='My.TButton', text='正在合并PDF', width=23)
    per_.place(x=164, y=50)

    file_merger = PdfFileMerger()
    try:
        for i, name in enumerate(names, start=1):
            per = round((i / page) * 100, 2)
            per_.config(text=f'合并进度：{per}%-{i}/{page}')
            bar.value(i)  # also pumps the Tk event loop via win.update()
            file_merger.append(os.path.join(target_path, name), bookmark=str(i))
        file_merger.write(outfile)  # write the merged document
    finally:
        # Release the merger even if a page fails. In that case the per-page
        # files are left on disk, because the rmtree below is never reached.
        file_merger.close()

    try:
        ml()
    finally:
        # The merged PDF already exists at this point, so the per-page files are
        # no longer needed whether or not adding the bookmarks succeeded.
        shutil.rmtree(target_path)

def _flatten_bookmarks(entries):
    """Flatten the two-level bookmark tree into (level, title, zero_based_page) tuples.

    Top-level entries get level 0 and their ``children`` level 1. Page values are
    converted via ``str()`` first so both ``"12"`` and ``12`` are accepted.
    """
    flat = []
    for entry in entries:
        flat.append((0, entry['caption'].strip(), int(str(entry['page']).strip()) - 1))
        for child in entry.get('children') or []:
            flat.append((1, child['caption'].strip(), int(str(child['page']).strip()) - 1))
    return flat


def ml():
    """Add the book's table of contents to the merged PDF as outline bookmarks.

    Downloads ``javascript/bookmark_config.js`` next to the global ``mobile_url``,
    extracts the ``ols = ...;`` value and, if it contains captions, rewrites the
    global ``outfile`` in place with a two-level outline. The pages are copied
    into a fresh writer, so the per-page bookmarks added during merging are not
    carried over. If no captions are found the file is left untouched and a
    note is logged. The "open PDF" button is enabled in every case, including
    when an exception propagates.

    Fixes relative to the original: the bookmark-tuple comprehension was cut off
    mid-expression (a SyntaxError for the whole file); local names no longer
    shadow the global ``page``; a missing ``ols =`` match or an out-of-range page
    number no longer raises.
    """
    try:
        bookmark_url = mobile_url.split('index.html')[0] + 'javascript/bookmark_config.js'
        res = requests.get(bookmark_url, headers=headers).text
        # NOTE(security): eval() on text downloaded from the network can execute
        # arbitrary code, and it also fails if the text contains a quote or a
        # newline. It is kept only to preserve the original escape decoding;
        # json.loads below decodes \uXXXX itself, so this should be replaced by a
        # plain `res` once verified against a real bookmark_config.js.
        decoded = eval("u" + "\'" + res + "\'")
        found = re.findall(r'ols = (.*?);', decoded)
        bookmark_L = found[0] if found else ''

        if 'caption' in bookmark_L:
            bookmark = json.loads(bookmark_L)
            old_pdf = PdfFileReader(outfile)
            page_count = old_pdf.getNumPages()
            # Copy the pages into a fresh document
            new_pdf = PdfFileWriter()
            for index in range(page_count):
                new_pdf.addPage(old_pdf.getPage(index))

            parent_set = {}
            DELTA = 0  # page offset applied to every bookmark
            for level, title, page_index in _flatten_bookmarks(bookmark):
                target = page_index + DELTA
                if not 0 <= target < page_count:
                    # A bookmark pointing outside the document would make
                    # addBookmark fail; skip it and keep the rest.
                    continue
                # The most recent bookmark one level up acts as the parent.
                parent = new_pdf.addBookmark(title, target, parent=parent_set.get(level - 1))
                # Remember this bookmark as the current node of its level.
                parent_set[level] = parent
            with open(outfile, 'wb+') as f:
                new_pdf.write(f)
        else:
            t.insert('end', '没有找到书签，使用页码书签\n')
    finally:
        # The merged PDF exists regardless of the bookmark outcome, so the
        # "open current PDF" button (disabled by default) can be enabled.
        btnClear['state'] = 'normal'

def open_pdf():
    """Run open_pdf_ on a new thread (command of the "打开当前PDF" button)."""
    opener = threading.Thread(target=open_pdf_)
    opener.start()

def open_pdf_():
    """Open the most recently merged PDF with the system's default application.

    Does nothing when no download has set the global ``outfile`` yet, or when
    that file does not exist.
    """
    try:
        target = outfile
    except NameError:
        # No book has been downloaded in this session; the original raised here.
        return
    if os.path.exists(target):
        # The last argument is nShowCmd. The original passed 0 (SW_HIDE), which
        # asks the viewer to start hidden; 1 is SW_SHOWNORMAL.
        win32api.ShellExecute(0, 'open', rf'"{target}"', '', '', 1)

def _safe_filename(title):
    """Return *title* stripped, with characters Windows forbids in file names replaced by '_'."""
    return re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title.strip()) or 'untitled'


def _parse_book_config(config_text):
    """Extract (image directory, page count, image names or None) from config.js text.

    *config_text* is the config.js body with spaces removed and a newline after
    every comma. Two formats are handled: the ``fliphtml5_pages`` one, which lists
    every image name, and the older one, which only gives a directory and a page
    count (image names are then ``<n>.jpg`` and ``None`` is returned for them).
    """
    if 'fliphtml5_pages' in config_text:
        pic_dir = re.findall(r'"normalPath":\["..(.*?)"', config_text, re.S)[0].replace('\\', '')
        names = re.findall(r'"n":\["(.*?)"', config_text, re.S)  # image name of every page
        return pic_dir, len(names), names

    try:
        pic_dir = re.findall(r'normalPath="..(.*?)"', config_text, re.S)[0]
    except IndexError:
        # JSON-style config: drop quotes, brackets, backslashes and dots.
        raw_dir = re.findall(r'normalPath":(.*?),', config_text, re.S)[0]
        pic_dir = re.sub(r'["\[\]\\.]', '', raw_dir)
    try:
        count = int(re.findall(r'totalPageCount="(.*?)"', config_text, re.S)[0])
    except (IndexError, ValueError):
        count = int(re.findall(r'totalPageCount":(.*?),', config_text, re.S)[0].replace('"', ''))
    return pic_dir, count, None


def downlaod_pic():
    """Download every page of the book at the global ``mobile_url`` and merge them.

    Pages are saved under ``<cwd>/云展网/<book title>/``. JPG pages are converted
    with img2pdf, anything else is passed to ``zip2pdf``; finally ``mergepdf``
    builds ``<cwd>/云展网/<book title>.pdf``.

    Globals written: ``page`` (page count), ``pic_name_L`` (image names, empty
    when the config has none), ``book_num`` (current 1-based page as a string),
    ``config_value`` and ``outfile`` (merged PDF path).

    Fixes relative to the original:
      * the downloaded file is only unlinked when it was actually written
        (a non-200 response used to crash with FileNotFoundError);
      * image names from a previous book are no longer reused for a book whose
        config has no name list;
      * the book title is sanitised before it is used as a directory/file name;
      * one progress bar and label are updated instead of creating new widgets
        for every page;
      * merging runs synchronously, so the completion message is only shown once
        the PDF exists and the next book of a bookcase cannot overwrite the
        globals that merging still reads.
    """
    global page, pic_name_L, book_num, config_value, outfile
    t.insert('end', '正在解析，请稍等...\n')
    path = os.path.join(os.getcwd(), '云展网')  # output root

    res0 = requests.get(mobile_url, headers=headers).content.decode("UTF-8")  # page source
    pathname = etree.HTML(res0).xpath('/html/head/title/text()')  # book title
    book_title = _safe_filename(str(pathname[0]) if pathname else '')
    zip_path = os.path.join(path, book_title)
    os.makedirs(zip_path, exist_ok=True)

    # Locate the <script src=".../javascript/config.js?..."> line; the last match wins.
    config_value = []
    for line in res0.split('\n'):
        if 'javascript/config.js' in line:
            config_value = re.findall(r'src="(.*?)"', line, re.S)
    if not config_value:
        t.insert('end', '解析失败：未找到 config.js\n')
        t.see(tk.END)
        return
    config_url = mobile_url.split('index.html')[0] + config_value[0]
    res1 = requests.get(config_url, headers=headers).content.decode("UTF-8").replace(' ', '').replace(',', ',\n')

    pic_C_L, page, names = _parse_book_config(res1)
    pic_name_L = names if names is not None else []

    t.insert('end', '正在下载，请稍等...\n')
    bar = ProcessBar()
    per_ = ttk.Button(win, style='My.TButton', text='', width=23)
    per_.place(x=164, y=50)
    base_url = mobile_url.split('/mobile/index.html')[0] + pic_C_L
    for i in tqdm(range(page), desc=book_title):
        per = round(((i + 1) / page) * 100, 2)
        per_.config(text=f'下载进度：{per}%-{i + 1}/{page}')
        bar.value(i + 1)  # also pumps the Tk event loop via win.update()

        pic_name = names[i] if names is not None else f'{i + 1}.jpg'
        book_num = f'{i + 1}'
        zip_ = requests.get(base_url + pic_name)  # page payload
        if zip_.status_code != 200:
            t.insert('end', f'页码【{book_num}/{page}】下载失败（HTTP {zip_.status_code}），已跳过\n')
            t.see(tk.END)
            continue

        ext = pic_name[-4:]
        d_file = os.path.join(zip_path, book_num + ext)
        with open(d_file, 'wb') as f:
            f.write(zip_.content)
        if ext.lower() == '.jpg':
            with open(d_file[:-4] + '_decode.pdf', 'wb') as f:
                f.write(img2pdf.convert(d_file))
            t.insert('end', f'页码【{book_num}/{page}】，是JPG图片文件，转换成功...\n')
            t.see(tk.END)
        else:
            zip2pdf(d_file)
        os.unlink(d_file)  # only the *_decode.pdf is needed for merging

    outfile = os.path.join(path, book_title + '.pdf')
    per_.config(text='正在合并PDF')
    try:
        mergepdf(zip_path, outfile)
    except Exception as exc:
        # Worker boundary: report the failure in the log instead of dying
        # silently, so a bookcase download can continue with the next book.
        t.insert('end', f'合并或添加书签失败：{exc}\n')
        t.see(tk.END)
        return
    t.insert('end', f'大功告成！\nPDF 位置：{outfile}\n')
    t.see(tk.END)
    # A new label is placed on top of whatever progress label mergepdf left behind.
    done = ttk.Button(win, style='My.TButton', text='下载完成', width=23)
    done.place(x=164, y=50)


def ProcessBar1():
    """Place an indeterminate progress bar at (14, 80) in the main window and start it."""
    options = dict(length=473, mode="indeterminate", maximum=100, orient=tk.HORIZONTAL)
    busy_bar = ttk.Progressbar(win, **options)
    busy_bar.place(x=14, y=80, width=473, height=10)
    busy_bar.start()

class ProcessBar:
    """Determinate progress bar placed at (14, 80) whose maximum is the global ``page``."""

    def __init__(self):
        widget = ttk.Progressbar(win, length=473, mode="determinate", maximum=page, orient=tk.HORIZONTAL)
        widget.place(x=14, y=80, width=473, height=10)
        self.process = widget

    def value(self, per):
        """Set the bar to *per* and let Tk process pending events so the change is drawn."""
        self.process.configure(value=per)
        win.update()

def delete_entry():
    """Remove all text from the URL entry box ``b1``."""
    b1.delete(0, tk.END)

def _report_bad_link():
    """Tell the user (dialog + log) that the entered link is unusable."""
    messagebox.showinfo("信息提示", "链接地址有误或者内容已被删除")
    t.insert('end', '链接地址有误或者内容已被删除\n')
    t.see(tk.END)


def url():
    """Start a download for the address typed into the entry box ``b1``.

    Three kinds of address are handled:
      * a "bookcase" page: every book listed in its ``js/bookConfig.js`` is
        downloaded one after another on the calling thread;
      * a ``.../index.html`` address: used directly as ``mobile_url``;
      * any other book page: the mobile address is derived from the link inside
        the ``basic-bottom-name pointer`` div.
    For the last two, ``downlaod_pic`` runs on a worker thread.

    Fixes relative to the original: request/parsing failures and pages without a
    title or book link are reported as a bad link instead of raising inside the
    Tk callback; the bookcase page is fetched once instead of twice; the config
    script line is found with a plain substring test (the original regex left
    ``.`` and ``?`` unescaped); the local variable no longer shadows this
    function's own name.
    """
    global mobile_url
    # NOTE(review): the original called btnClearClick() here, but no such function
    # is defined in this file, so every click raised NameError. It presumably
    # reset the UI before a new download - confirm. If it exists it is still
    # called; otherwise the "open PDF" button is disabled again so a previous
    # book cannot be opened while a new one is being produced.
    reset = globals().get('btnClearClick')
    if callable(reset):
        reset()
    else:
        btnClear['state'] = 'disabled'

    address = b1.get().strip()
    if not address:
        t.insert('end', '搜索框不能为空\n')
        t.see(tk.END)
        return

    try:
        res = requests.get(address, headers=headers).content.decode("UTF-8")  # page source
        doc = etree.HTML(res)
        titles = doc.xpath('/html/head/title/text()') if doc is not None else []
    except (requests.RequestException, UnicodeDecodeError, etree.LxmlError):
        titles = []
    if not titles or '你所访问的页面不存在' in titles[0]:
        _report_bad_link()
        return

    if 'bookcase' in address:  # a shelf page listing several books
        for line in res.split('\n'):
            if 'js/bookConfig.js' not in line:
                continue
            sources = re.findall(r'src="(.*?)"', line, re.S)
            if not sources:
                continue
            book = requests.get(sources[0], headers=headers).content.decode("UTF-8").replace(',', ',\n')
            mobile_url_L = re.findall(r'"url":"(.*?)",', book, re.S)
            t.insert('end', f'共[{len(mobile_url_L)}]本书籍\n')
            t.see(tk.END)
            for j, _url in enumerate(mobile_url_L, start=1):
                t.insert('end', f'正在下载第[{j}]本\n')
                t.see(tk.END)
                mobile_url = _url.replace('\\', '') + 'mobile/index.html'
                downlaod_pic()
    else:
        if 'index.html' in address:
            mobile_url = address
        else:
            page_res = requests.get(address, headers=headers)
            page_res.encoding = page_res.apparent_encoding
            hrefs = etree.HTML(page_res.text).xpath('//div[@class="basic-bottom-name pointer"]/a/@href')
            if not hrefs:
                _report_bad_link()
                return
            # Build the mobile reader address from the book link
            mobile_url = hrefs[0].split('index.html')[0] + 'mobile/index.html'
        threading.Thread(target=downlaod_pic).start()
    ProcessBar1()

def btn_click_enter(self):
    """Handler bound to <Return> on the URL entry; starts the same action as the download button.

    The single parameter receives the Tk event object (it is named ``self`` in
    the original signature) and is not used.
    """
    url()

# Build the main window. The widgets created here (win, b1, t, btnClear, ...)
# are module-level names that the functions above read as globals.
win = tk.Tk()
# Window size in pixels
width = 500
height = 330
win.geometry(f'{width}x{height}')

# Top-left offset that centres the window on the screen
# (despite their names, these two values are offsets, not screen dimensions)
screen_width = (win.winfo_screenwidth() - width) / 2
screen_height = (win.winfo_screenheight() - height) / 2
win.title('云展网书籍下载器')
# win.attributes("-toolwindow", 1)
win.resizable(False, False)
win.geometry(f"+{int(screen_width)}+{int(screen_height)}")

# 'My.TButton' (red text) is the style used by the progress label buttons
gui_style = ttk.Style()
gui_style.configure('My.TButton', foreground='red')
# Caption above the URL entry
L1 = tk.Label(win, text='云展网书籍网址：', font=('SimHei', 12))
L1.place(x=14, y=6)

# Single-line entry for the book URL; pressing Enter starts the download
b1 = tk.Entry(win, font=('', 10), width=67)
b1.place(x=14, y=30)
b1.focus_set()
b1.bind("<Return>", btn_click_enter)

# "Start download" button
a = ttk.Button(win, text="开始下载", width=20, command=url)
a.place(x=14, y=50)

# Progress percentage label (a disabled button used as a text display)
per_ = ttk.Button(win, style='My.TButton', text=f'数字百分比进度', width=23)
per_.place(x=164, y=50)
per_['state'] = 'disabled'
# "Open current PDF" button (the name btnClear is historical)
btnClear = ttk.Button(win, text='打开当前PDF', width=20, command=open_pdf)
btnClear.place(x=335, y=50)
# Disabled by default; ml() sets it to 'normal' after a merge
btnClear['state'] = 'disabled'

# Multi-line log box: size in characters, font, green text on a dark background
t = tk.Text(win, width=78, height=16, font=('', 9), fg="green", bg="#191919", spacing1=1, spacing2=1, spacing3=1,
            insertbackground='#fff', bd=0.8)  # log output shown to the user
t.place(x=14, y=95)
# Show the indeterminate bar from startup, then hand control to the Tk event loop
ProcessBar1()
win.mainloop()

zip 和单页 pdf 格式的文件都被加密了，我只能一页一页地解开 pdf 密码；我不会对 js 进行逆向，希望大佬研究一下。例如这本书：http://www.yunzhan365.com/basic/101-150/48138034.html

傲游截图20220809062028.jpg

多谢@涛之雨大佬指点，写了一个UI版的，大家先测试一下，
python新手，多多指点，源码是拼拼凑凑的，本来是用的pikepdf模块，结果打包成exe，运行一直出错
就直接用PyPDF4模块打包了
下载:https://hebeijianke.lanzoub.com/iL2ZJ09jm8sh 密码:52pj

请大佬解决吧，使用pikepdf模块，用源码可以跑代码，不会出现卡32页的情况，但是无法打包EXE
使用PyPDF4模块，其他书籍页面都没问题，就这个http://www.yunzhan365.com/basic/101-150/48138034.html会卡到32页，大佬利用源码跑跑，修复一下吧，我不知道怎么修复！！！
把PyPDF4模块降级到PyPDF2就可以用了，也不知道是哪里的问题，成品已更新

8.14: 修复几个bug，增加使用本地默认PDF阅读器打开当前书籍功能，有目录书签的，自动添加

lxfw2000 · 发表于 2022-8-7 20:27

不懂就问：云展网是卖书吗？
那就买本书就好了吧。

艺路是蓝 · 发表于 2022-8-7 20:40

同问同问

百事阔落666 · 发表于 2022-8-7 20:44

以前找过类似的，别人用IDM+迅雷批量帮我下载好了，我小白也不会

cnljm · 发表于 2022-8-7 22:03

福利经验板块有个兄弟分享了一个云展网书籍下载插件，不知道是否符合楼主的需求？

qingfeng0923 · 发表于 2022-8-7 22:19

大佬厉害呀

orb001 · 发表于 2022-8-7 22:47

IDM比较好用

89684828 · 发表于 2022-8-7 22:58

支持一下，谢谢！

chuzhi1983 · 发表于 2022-8-8 08:29

大佬厉害呀

bj9ye666 · 发表于 2022-8-8 08:31

厉害很实用的脚本，感谢分享的

帐号		自动登录	找回密码
密码			注册[Register]

[讨论] python下载云展网书籍并合并为PDF（附UI成品）

免费评分