利用腾讯云实现PDF转“word”

lianxiang1122 · 发表于 2024-6-17 16:51

PDF转word很是让人头疼，特别是扫描版的又有图片又有表格的，基本很难搞定。

最近发现腾讯云刚刚发布了一个文字识别新品——智能结构化！

https://cloud.tencent.com/product/smart-ocr

进入网址后，用微信登录并开通服务就可免费使用，腾讯云一如既往的优秀，可以每月白嫖1千次！

点击demo就可以先体验一下了，只能一张一张PDF或图片识别，识别后会生成一个markdown文件，用相关的软件打开这个markdown文件就可以复制粘贴到word文档中了，从而间接实现PDF转word。
能力有限，不能一次到位，只能这样了。。。。。等高手大佬们解决一次到位转成word。。。

例如，用我们的pycharm软件打开.md文件后，会显示内容，我们全选复制，再粘贴到word中，就可以了。

当然了，一张一张识别肯定会很麻烦了。腾讯云提供了API调用，找到控制台的接入指引，第二步的API 3.0 Explorer，点击进入

进入API 3.0 Explorer，之前分享过，不一一解释了。

将代码复制到IDE中，就可以测试了。

提示：一次只能识别10页PDF，如果多于10页，就得多次调用了。

具体操作，就不一一详解了。利用chatgpt，做了一个带界面的，供大家参考。

简单解释一下：先读取PDF文件，再选择生成markdown文件保存位置，输入你的ID和KEY。程序会先读取D盘是否有一个idkey.txt文件，如果有自动读取到相关的ID和KEY，如果没有，需要你输入一下，当点击转换按钮后，会自动将ID 和key保存到D盘，当下一次使用时读取，就不用再输入了。当pdf文件大于10页后，拆分后拼接，多次请求，最后生成一个.md文件。

[Python] 纯文本查看 复制代码

import os
import tkinter as tk
from tkinter import filedialog, messagebox
import base64
import fitz  # PyMuPDF
import json
from tencentcloud.common.common_client import CommonClient
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException

def read_pdf_to_base64(paf_file):
    """Return the contents of the file at ``paf_file`` as a base64 text string.

    The file is read in binary mode in one go; the base64 bytes are decoded
    as UTF-8 so the result can be embedded in a JSON request body.
    """
    with open(paf_file, 'rb') as stream:
        return base64.b64encode(stream.read()).decode('utf-8')

def decode_base64_to_markdown(base64_str):
    """Decode a base64 string and return the payload as UTF-8 text."""
    return base64.b64decode(base64_str).decode('utf-8')

def save_as_md_file(content, filename):
    """Write ``content`` to ``filename`` as UTF-8 text, replacing any existing file."""
    with open(filename, mode='w', encoding='utf-8') as sink:
        sink.write(content)

def ocr_markdown(base64_string, FileStartPageNumber, FileEndPageNumber, secret_id, secret_key):
    """Request Markdown for a page range of a base64-encoded PDF.

    Calls the ``ReconstructDocument`` action on ``ocr.tencentcloudapi.com``
    (API version 2018-11-19, region ap-guangzhou).

    Returns the ``MarkdownBase64`` field of the response. When the SDK raises
    ``TencentCloudSDKException`` an error dialog is shown, the user is offered
    the option of deleting the cached ``D:/idkey.txt``, and ``None`` is
    returned.
    """
    cred = credential.Credential(secret_id, secret_key)

    http_options = HttpProfile()
    http_options.endpoint = "ocr.tencentcloudapi.com"
    client_options = ClientProfile()
    client_options.httpProfile = http_options

    # The request is serialised to JSON text and parsed back before sending.
    request_text = json.dumps({
        "FileType": "PDF",
        "FileBase64": base64_string,
        "FileStartPageNumber": FileStartPageNumber,
        "FileEndPageNumber": FileEndPageNumber
    })

    client = CommonClient("ocr", "2018-11-19", cred, "ap-guangzhou", profile=client_options)

    try:
        reply = client.call_json("ReconstructDocument", json.loads(request_text))
        return reply['Response']['MarkdownBase64']
    except TencentCloudSDKException as err:
        messagebox.showerror(
            "Error",
            f"An error occurred: {err}\nPlease enter the correct API Secret ID and Secret Key.",
        )

        # Offer to drop the cached credentials so new ones can be entered.
        cached_keys = 'D:/idkey.txt'
        if os.path.exists(cached_keys) and messagebox.askyesno(
            "Delete idkey.txt", "The idkey.txt file exists. Do you want to delete it?"
        ):
            os.remove(cached_keys)
            messagebox.showinfo("Deleted", "idkey.txt file has been deleted. Please enter the correct API Secret ID and Secret Key.")
        return None

def process_pdf(paf_file, secret_id, secret_key, output_dir):
    """Convert ``paf_file`` to ``<output_dir>/<pdf name>.md``.

    The whole PDF is base64-encoded once and sent with every request; each
    request covers a window of at most 10 pages, and the decoded Markdown of
    the windows is concatenated in page order.

    Output rules (unchanged from the earlier implementation):
    * up to 10 pages: the file is written only if the single request
      succeeded, with the Markdown exactly as returned;
    * more than 10 pages: every successful window is followed by a newline
      and the file is always written, even if some windows failed.

    A PDF with zero pages now results in no request and no output file.
    """
    pages_per_request = 10

    base64_string = read_pdf_to_base64(paf_file)
    # Only the page count is needed, so release the document immediately
    # instead of keeping the file handle open.
    with fitz.open(paf_file) as doc:
        page_count = doc.page_count

    pdf_filename = os.path.splitext(os.path.basename(paf_file))[0]
    output_filepath = os.path.join(output_dir, f"{pdf_filename}.md")

    # Page numbers sent to the API are 1-based and inclusive.
    chunks = []
    for first_page in range(1, page_count + 1, pages_per_request):
        last_page = min(first_page + pages_per_request - 1, page_count)
        output = ocr_markdown(base64_string, first_page, last_page, secret_id, secret_key)
        if output:
            chunks.append(decode_base64_to_markdown(output))

    if page_count <= pages_per_request:
        if chunks:
            save_as_md_file(chunks[0], output_filepath)
    else:
        save_as_md_file(''.join(chunk + '\n' for chunk in chunks), output_filepath)

def select_file():
    """Ask the user for a PDF and show the chosen path in the file entry.

    If the dialog is cancelled it returns an empty value; the entry is then
    left untouched so a previously chosen path is not wiped.
    """
    file_path = filedialog.askopenfilename(filetypes=[("PDF files", "*.pdf")])
    if not file_path:
        return
    file_entry.delete(0, tk.END)
    file_entry.insert(0, file_path)

def select_output_dir():
    """Ask the user for an output folder and show it in the output entry.

    If the dialog is cancelled it returns an empty value; the entry is then
    left untouched so a previously chosen folder is not wiped.
    """
    output_dir = filedialog.askdirectory()
    if not output_dir:
        return
    output_dir_entry.delete(0, tk.END)
    output_dir_entry.insert(0, output_dir)

def start_conversion():
    """Validate the form, resolve the API credentials and run the conversion.

    Credentials are read from ``D:/idkey.txt`` when that file contains two
    usable ``label: value`` lines. Otherwise (file missing, unreadable,
    malformed or holding empty values) they are taken from the two entry
    fields and written back to that file for the next run.
    """
    paf_file = file_entry.get()
    output_dir = output_dir_entry.get()

    # Fail early with a dialog instead of an exception inside the Tk callback.
    if not os.path.isfile(paf_file):
        messagebox.showerror("Error", "Please select an existing PDF file.")
        return

    # NOTE: the credentials are cached in plain text at a fixed location.
    idkey_file = 'D:/idkey.txt'
    secret_id = secret_key = ''
    if os.path.exists(idkey_file):
        try:
            with open(idkey_file, 'r') as file:
                lines = file.readlines()
        except OSError:
            lines = []
        if len(lines) >= 2:
            # partition() yields '' when the separator is missing, so a
            # malformed line falls through to the entry fields below.
            secret_id = lines[0].strip().partition(': ')[2]
            secret_key = lines[1].strip().partition(': ')[2]

    if not secret_id or not secret_key:
        secret_id = secret_id_entry.get().strip()
        secret_key = secret_key_entry.get().strip()

        if not secret_id or not secret_key:
            messagebox.showerror("Error", "Please enter the API Secret ID and Secret Key.")
            return

        # Caching is a convenience only: a failed write (for example, no D:
        # drive) must not block the conversion.
        try:
            with open(idkey_file, 'w') as file:
                file.write(f"API Secret ID: {secret_id}\n")
                file.write(f"API Secret Key: {secret_key}\n")
        except OSError as err:
            messagebox.showwarning("Warning", f"Could not save the credentials to {idkey_file}: {err}")

    process_pdf(paf_file, secret_id, secret_key, output_dir)

# Build the main window.
root = tk.Tk()
root.title("PDF to Markdown Converter")

# Every label, entry and "Browse" button uses the same cell padding.
_PAD = {"padx": 10, "pady": 10}

# Row 0: the PDF to convert.
tk.Label(root, text="Select PDF File:").grid(row=0, column=0, **_PAD)
file_entry = tk.Entry(root, width=50)
file_entry.grid(row=0, column=1, **_PAD)
tk.Button(root, text="Browse", command=select_file).grid(row=0, column=2, **_PAD)

# Row 1: the folder that receives the generated .md file.
tk.Label(root, text="Select Output Directory:").grid(row=1, column=0, **_PAD)
output_dir_entry = tk.Entry(root, width=50)
output_dir_entry.grid(row=1, column=1, **_PAD)
tk.Button(root, text="Browse", command=select_output_dir).grid(row=1, column=2, **_PAD)

# Row 2: API Secret ID.
tk.Label(root, text="API Secret ID:").grid(row=2, column=0, **_PAD)
secret_id_entry = tk.Entry(root, width=50)
secret_id_entry.grid(row=2, column=1, **_PAD)

# Row 3: API Secret Key, masked while typing.
tk.Label(root, text="API Secret Key:").grid(row=3, column=0, **_PAD)
secret_key_entry = tk.Entry(root, width=50, show='*')
secret_key_entry.grid(row=3, column=1, **_PAD)

# Row 4: the button that starts the conversion.
tk.Button(root, text="Convert", command=start_conversion).grid(row=4, column=1, pady=20)

# Run the Tk main loop.
root.mainloop()

lianxiang1122 · 发表于 2024-8-15 11:44

有段时间没有用这个了，要不是楼上qq63还真没发现这货居然修改了服务器。。。。。。调到“ 文档解析”里去了。。。我说怎么一提交就报错呢。。。。

参考官方说明：https://cloud.tencent.com/document/api/866/104610

主要修改如下：
1.

[Python] 纯文本查看 复制代码

#httpProfile.endpoint = "ocr.tencentcloudapi.com" 之前
    httpProfile.endpoint = "lke.tencentcloudapi.com"  #修改后

2.

[Python] 纯文本查看 复制代码

#"FileType": "PDF", 取消pdf设置，直接删除该行代码

3.

[Python] 纯文本查看 复制代码

#common_client = CommonClient("ocr", "2018-11-19", cred, "ap-guangzhou", profile=clientProfile)  之前
    common_client = CommonClient("lke", "2023-11-30", cred, "ap-guangzhou", profile=clientProfile)   #现在

完整代码修改参考如下：

[Python] 纯文本查看 复制代码

import os
import tkinter as tk
from tkinter import filedialog, messagebox
import base64
import fitz  # PyMuPDF
import json
from tencentcloud.common.common_client import CommonClient
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException

def read_pdf_to_base64(paf_file):
    """Read the file at ``paf_file`` and return its bytes base64-encoded as text."""
    with open(paf_file, 'rb') as handle:
        raw_bytes = handle.read()
    return str(base64.b64encode(raw_bytes), 'utf-8')

def decode_base64_to_markdown(base64_str):
    """Turn a base64 string back into the UTF-8 text it encodes."""
    markdown_bytes = base64.b64decode(base64_str)
    return str(markdown_bytes, 'utf-8')

def save_as_md_file(content, filename):
    """Store ``content`` in ``filename`` as UTF-8 text, overwriting any existing file."""
    with open(filename, 'w', encoding='utf-8') as md_out:
        md_out.write(content)

def ocr_markdown(base64_string, FileStartPageNumber, FileEndPageNumber, secret_id, secret_key):
    """Request Markdown for a page range of a base64-encoded document.

    Calls the ``ReconstructDocument`` action on ``lke.tencentcloudapi.com``
    (API version 2023-11-30, region ap-guangzhou). Unlike the earlier
    ``ocr`` variant of this script, no ``FileType`` parameter is sent.

    Returns the ``MarkdownBase64`` field of the response. When the SDK raises
    ``TencentCloudSDKException`` an error dialog is shown, the user is offered
    the option of deleting the cached ``D:/idkey.txt``, and ``None`` is
    returned.
    """
    cred = credential.Credential(secret_id, secret_key)

    # Instantiate an HTTP profile; optional, only used here to set the endpoint.
    http_cfg = HttpProfile()
    http_cfg.endpoint = "lke.tencentcloudapi.com"
    client_cfg = ClientProfile()
    client_cfg.httpProfile = http_cfg

    payload = {
        "FileBase64": base64_string,
        "FileStartPageNumber": FileStartPageNumber,
        "FileEndPageNumber": FileEndPageNumber
    }
    # The request is serialised to JSON text and parsed back before sending.
    payload_json = json.dumps(payload)

    lke_client = CommonClient("lke", "2023-11-30", cred, "ap-guangzhou", profile=client_cfg)

    try:
        result = lke_client.call_json("ReconstructDocument", json.loads(payload_json))
    except TencentCloudSDKException as err:
        messagebox.showerror(
            "Error",
            f"An error occurred: {err}\nPlease enter the correct API Secret ID and Secret Key.",
        )

        # Offer to drop the cached credentials so new ones can be entered.
        key_cache = 'D:/idkey.txt'
        if os.path.exists(key_cache) and messagebox.askyesno(
            "Delete idkey.txt", "The idkey.txt file exists. Do you want to delete it?"
        ):
            os.remove(key_cache)
            messagebox.showinfo("Deleted", "idkey.txt file has been deleted. Please enter the correct API Secret ID and Secret Key.")
        return None
    else:
        return result['Response']['MarkdownBase64']

def process_pdf(paf_file, secret_id, secret_key, output_dir):
    """Convert ``paf_file`` to ``<output_dir>/<pdf name>.md``.

    The whole PDF is base64-encoded once and sent with every request; each
    request covers a window of at most 10 pages, and the decoded Markdown of
    the windows is concatenated in page order.

    Output rules (unchanged from the earlier implementation):
    * up to 10 pages: the file is written only if the single request
      succeeded, with the Markdown exactly as returned;
    * more than 10 pages: every successful window is followed by a newline
      and the file is always written, even if some windows failed.

    A PDF with zero pages now results in no request and no output file.
    """
    pages_per_request = 10

    base64_string = read_pdf_to_base64(paf_file)
    # Only the page count is needed, so release the document immediately
    # instead of keeping the file handle open.
    with fitz.open(paf_file) as doc:
        page_count = doc.page_count

    pdf_filename = os.path.splitext(os.path.basename(paf_file))[0]
    output_filepath = os.path.join(output_dir, f"{pdf_filename}.md")

    # Page numbers sent to the API are 1-based and inclusive.
    chunks = []
    for first_page in range(1, page_count + 1, pages_per_request):
        last_page = min(first_page + pages_per_request - 1, page_count)
        output = ocr_markdown(base64_string, first_page, last_page, secret_id, secret_key)
        if output:
            chunks.append(decode_base64_to_markdown(output))

    if page_count <= pages_per_request:
        if chunks:
            save_as_md_file(chunks[0], output_filepath)
    else:
        save_as_md_file(''.join(chunk + '\n' for chunk in chunks), output_filepath)

def select_file():
    """Ask the user for a PDF and show the chosen path in the file entry.

    If the dialog is cancelled it returns an empty value; the entry is then
    left untouched so a previously chosen path is not wiped.
    """
    file_path = filedialog.askopenfilename(filetypes=[("PDF files", "*.pdf")])
    if not file_path:
        return
    file_entry.delete(0, tk.END)
    file_entry.insert(0, file_path)

def select_output_dir():
    """Ask the user for an output folder and show it in the output entry.

    If the dialog is cancelled it returns an empty value; the entry is then
    left untouched so a previously chosen folder is not wiped.
    """
    output_dir = filedialog.askdirectory()
    if not output_dir:
        return
    output_dir_entry.delete(0, tk.END)
    output_dir_entry.insert(0, output_dir)

def start_conversion():
    """Validate the form, resolve the API credentials and run the conversion.

    Credentials are read from ``D:/idkey.txt`` when that file contains two
    usable ``label: value`` lines. Otherwise (file missing, unreadable,
    malformed or holding empty values) they are taken from the two entry
    fields and written back to that file for the next run.
    """
    paf_file = file_entry.get()
    output_dir = output_dir_entry.get()

    # Fail early with a dialog instead of an exception inside the Tk callback.
    if not os.path.isfile(paf_file):
        messagebox.showerror("Error", "Please select an existing PDF file.")
        return

    # NOTE: the credentials are cached in plain text at a fixed location.
    idkey_file = 'D:/idkey.txt'
    secret_id = secret_key = ''
    if os.path.exists(idkey_file):
        try:
            with open(idkey_file, 'r') as file:
                lines = file.readlines()
        except OSError:
            lines = []
        if len(lines) >= 2:
            # partition() yields '' when the separator is missing, so a
            # malformed line falls through to the entry fields below.
            secret_id = lines[0].strip().partition(': ')[2]
            secret_key = lines[1].strip().partition(': ')[2]

    if not secret_id or not secret_key:
        secret_id = secret_id_entry.get().strip()
        secret_key = secret_key_entry.get().strip()

        if not secret_id or not secret_key:
            messagebox.showerror("Error", "Please enter the API Secret ID and Secret Key.")
            return

        # Caching is a convenience only: a failed write (for example, no D:
        # drive) must not block the conversion.
        try:
            with open(idkey_file, 'w') as file:
                file.write(f"API Secret ID: {secret_id}\n")
                file.write(f"API Secret Key: {secret_key}\n")
        except OSError as err:
            messagebox.showwarning("Warning", f"Could not save the credentials to {idkey_file}: {err}")

    process_pdf(paf_file, secret_id, secret_key, output_dir)

# Create the main window.
root = tk.Tk()
root.title("PDF to Markdown Converter")

# Padding shared by every label, entry and "Browse" button.
_CELL_PAD = dict(padx=10, pady=10)

# PDF file picker (row 0).
tk.Label(master=root, text="Select PDF File:").grid(column=0, row=0, **_CELL_PAD)
file_entry = tk.Entry(master=root, width=50)
file_entry.grid(column=1, row=0, **_CELL_PAD)
tk.Button(master=root, text="Browse", command=select_file).grid(column=2, row=0, **_CELL_PAD)

# Output directory picker (row 1).
tk.Label(master=root, text="Select Output Directory:").grid(column=0, row=1, **_CELL_PAD)
output_dir_entry = tk.Entry(master=root, width=50)
output_dir_entry.grid(column=1, row=1, **_CELL_PAD)
tk.Button(master=root, text="Browse", command=select_output_dir).grid(column=2, row=1, **_CELL_PAD)

# API Secret ID field (row 2).
tk.Label(master=root, text="API Secret ID:").grid(column=0, row=2, **_CELL_PAD)
secret_id_entry = tk.Entry(master=root, width=50)
secret_id_entry.grid(column=1, row=2, **_CELL_PAD)

# API Secret Key field (row 3); typed characters are shown as '*'.
tk.Label(master=root, text="API Secret Key:").grid(column=0, row=3, **_CELL_PAD)
secret_key_entry = tk.Entry(master=root, width=50, show='*')
secret_key_entry.grid(column=1, row=3, **_CELL_PAD)

# Convert button (row 4).
tk.Button(master=root, text="Convert", command=start_conversion).grid(column=1, row=4, pady=20)

# Run the Tk main loop.
root.mainloop()

qq63 · 发表于 2024-8-14 20:48

md文件后，会显示内容，我们全选复制，再粘贴到word中,请问下表格文件转换成MD文件格式后复制到word后格式都没有了，有遇到过的么？测试文件https://www.52pojie.cn/thread-1949762-1-1.html，可以识别但无法完整复制到word

huazhongxu · 发表于 2024-6-17 17:06

这个牛B，无拘无束也支持

dayday26 · 发表于 2024-6-17 17:18

感谢分享，不知道能不能完美实现表格转换

ZhjhJZ · 发表于 2024-6-17 17:26

腾讯云一如既往的优秀，可以每月白嫖1千次！牛！

饮食协会 · 发表于 2024-6-17 17:49

带界面在哪里？

bluepeb · 发表于 2024-6-17 19:01

综合各种技术应用，全方位的高手！

lvtaode0657 · 发表于 2024-6-17 21:18

好东西啊，谢谢分享

Psc7day · 发表于 2024-6-18 00:14

感谢分享

kidll · 发表于 2024-6-18 09:11

我52破解的坤，渍渍渍

Vincent168 · 发表于 2024-6-18 09:28

感谢楼主分享

帐号		自动登录	找回密码
密码			注册[Register]

[Python 原创] 利用腾讯云实现PDF转“word”

免费评分

本帖被以下淘专辑推荐: