使用python批量将doc文档转换为docx
由于需要对 Word 文档中的图片进行处理,但是 Python 无法直接对 doc 格式进行处理,所以有了这个玩意。话说,兄弟萌有没有办法让这个代码处理速度更快?
import os
import win32com.client
from tkinter import filedialog
from tkinter import Tk
def doc_to_docx(doc_path):
    """Convert one ``.doc`` file to ``.docx`` through Word automation.

    A Word application object is obtained for this single conversion and is
    asked to quit afterwards. The original ``.doc`` file is deleted only after
    the ``.docx`` copy has been saved and the document closed, so a failed
    conversion never destroys the source file.

    Args:
        doc_path: Path of the ``.doc`` file to convert.

    Returns:
        The path of the new ``.docx`` file, or ``None`` when the conversion
        failed (the failure is printed and the original file is kept).
    """
    # splitext returns a (root, ext) pair; only the root takes the new suffix.
    docx_path = os.path.splitext(doc_path)[0] + '.docx'
    word = None
    try:
        word = win32com.client.Dispatch("Word.Application")
        word.Visible = 0
        doc = word.Documents.Open(doc_path)
        try:
            # 16 is WdSaveFormat.wdFormatDocumentDefault, i.e. .docx.
            doc.SaveAs(docx_path, FileFormat=16)
            print(f"Converted: {doc_path} to {docx_path}")
        finally:
            # Close the document whether or not saving worked.
            doc.Close()
        # Remove the original .doc only once the conversion has succeeded.
        os.remove(doc_path)
        return docx_path
    except Exception as e:
        print(f"Failed to convert {doc_path} due to {str(e)}")
        return None
    finally:
        # Quit only if Word was actually started; otherwise there is
        # nothing to shut down and touching `word` would raise again.
        if word is not None:
            word.Quit()
def convert_all_docs_in_folder(folder_path):
    """Run ``doc_to_docx`` on every ``.doc`` entry directly inside a folder.

    Args:
        folder_path: Directory whose entries are inspected (not recursive).
    """
    for entry in os.listdir(folder_path):
        # Compare the extension case-insensitively; skip everything else.
        _, extension = os.path.splitext(entry)
        if extension.lower() != '.doc':
            continue
        # Hand the full path of the matching file to the converter.
        doc_to_docx(os.path.join(folder_path, entry))
if __name__ == '__main__':
    # Create a Tk root window for the dialog, but keep it hidden.
    hidden_root = Tk()
    hidden_root.withdraw()
    # Let the user pick the folder to convert; nothing runs if the result
    # is empty.
    chosen_folder = filedialog.askdirectory()
    if chosen_folder:
        convert_all_docs_in_folder(chosen_folder)
# Forum text that followed the code on the scraped page (not part of the
# script): "Post last edited by 矢岛舞美 on 2024-1-31 11:43."
优化了一下,转化速度大大提升了:
- 使用 glob 模块来直接获取所有的 .doc 文件,而不是遍历所有文件,减少不必要的文件系统交互。
- 将 Word.Application 对象的创建和退出移到循环外部,只需要启动和退出一次 Word 应用程序,而不是为每个文件都启动和退出一次。
下面是修改后的代码:
import os
import win32com.client
from tkinter import filedialog
from tkinter import Tk
import glob
from datetime import datetime
def doc_to_docx(word, doc_path):
    """Convert one ``.doc`` file to ``.docx`` using an existing Word object.

    On success the original ``.doc`` file is deleted. Any failure is printed
    and swallowed so that a batch run can continue with the next file.

    Args:
        word: Word application automation object; only
            ``word.Documents.Open`` is used here.
        doc_path: Path of the ``.doc`` file to convert.

    Returns:
        The path of the new ``.docx`` file, or ``None`` when the conversion
        failed (the original file is then left in place).
    """
    # splitext returns a (root, ext) pair; only the root takes the new suffix.
    docx_path = os.path.splitext(doc_path)[0] + '.docx'
    try:
        doc = word.Documents.Open(doc_path)
        try:
            # 16 is WdSaveFormat.wdFormatDocumentDefault, i.e. .docx.
            doc.SaveAs(docx_path, FileFormat=16)
            current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(f"{current_time} Converted: {doc_path} to {docx_path}")
        finally:
            # Close the document even when saving failed, so it does not
            # stay open inside the shared Word instance.
            doc.Close()
        # Delete the source only after a successful save and close.
        os.remove(doc_path)
        return docx_path
    except Exception as e:
        print(f"Failed to convert {doc_path} due to {str(e)}")
        return None
def convert_all_docs_in_folder(folder_path):
    """Convert every ``.doc`` file directly inside a folder with one Word object.

    A single Word application object is obtained up front, reused for all
    files, and asked to quit at the end, even if the loop is interrupted.

    Args:
        folder_path: Directory to search for ``*.doc`` files (not recursive).
    """
    word = win32com.client.Dispatch("Word.Application")
    try:
        word.Visible = 0
        # Escape the folder part so characters such as "[" or "*" in the
        # directory name are matched literally instead of as glob syntax.
        pattern = os.path.join(glob.escape(folder_path), '*.doc')
        for doc_path in glob.glob(pattern):
            doc_to_docx(word, doc_path)
    finally:
        # Always shut Word down so no hidden instance is left running.
        word.Quit()
if __name__ == '__main__':
    # A hidden Tk root window backs the folder-selection dialog.
    hidden_root = Tk()
    hidden_root.withdraw()
    # Convert only when the dialog produced a non-empty folder path.
    chosen_folder = filedialog.askdirectory()
    if chosen_folder:
        convert_all_docs_in_folder(chosen_folder)
# Forum text that followed the code on the scraped page (not part of the
# script): "Haha, I wrote something similar a while ago, also because Python
# cannot handle .doc directly. I never considered performance, though -
# could the experts take a look?"
import win32com.client as win32
from win32com.client import constants
import os
# Folder path typed by the user; a trailing slash is appended as before.
folder_input = input('请输入文件夹路径:')
path1 = folder_input + '/'
number = 0
# A blank answer would make path1 just '/', and the walk below would then
# convert and delete files starting from the root directory, so skip the
# conversion entirely in that case.
if folder_input.strip():
    # Create the Word application object.
    word = win32.gencache.EnsureDispatch('Word.Application')
    try:
        # Walk the folder tree and convert every .doc file found.
        for root, dirs, files in os.walk(path1):
            for file in files:
                # Match the extension exactly and case-insensitively.
                if os.path.splitext(file)[1].lower() != '.doc':
                    continue
                # Use one absolute path for both opening and saving so the
                # two calls cannot disagree about which file is meant.
                doc_path = os.path.abspath(os.path.join(root, file))
                # Swap only the final extension; str.replace would also
                # rewrite '.doc' occurring in a directory or file name.
                new_file_path = os.path.splitext(doc_path)[0] + '.docx'
                doc = word.Documents.Open(doc_path)
                try:
                    # Save the opened document itself as .docx instead of
                    # relying on whichever document is currently active.
                    doc.SaveAs(new_file_path, FileFormat=constants.wdFormatXMLDocument)
                finally:
                    # Close the current document without saving changes.
                    doc.Close(False)
                os.remove(doc_path)
                number += 1
    finally:
        # Quit Word even when a conversion raised.
        word.Quit()
print(f'已处理完成。共转换{number}个文件')
# Forum text that followed the code on the scraped page (not part of the
# script): "Thanks for sharing!!! From what I see online, these all work by
# opening Word without a visible window."
估计docx格式转换很难做
能提速的方案就是并发了 把批量文件放进文件夹,函数指定文件夹做格式转换,完美解决 BTFKM 发表于 2024-1-31 11:15
看网上都是基于无界面打开word操作的
估计docx格式转换很难做
能提速的方案就是并发了
有点道理,我试试 谢谢分享
import os
import glob
from datetime import datetime
import win32com.client
from tkinter import filedialog
from tkinter import Tk
from concurrent.futures import ThreadPoolExecutor
def doc_to_docx(word, doc_path):
    """Convert one ``.doc`` file to ``.docx`` using an existing Word object.

    The original file is NOT deleted here; the caller receives both paths
    and decides what to do with the source. Any failure is printed and
    swallowed.

    Args:
        word: Word application automation object; only
            ``word.Documents.Open`` is used here.
        doc_path: Path of the ``.doc`` file to convert.

    Returns:
        ``(doc_path, docx_path)`` on success, ``(None, None)`` on failure.
    """
    # splitext returns a (root, ext) pair; only the root takes the new suffix.
    docx_path = os.path.splitext(doc_path)[0] + '.docx'
    try:
        doc = word.Documents.Open(doc_path)
        try:
            # 16 is WdSaveFormat.wdFormatDocumentDefault, i.e. .docx.
            doc.SaveAs(docx_path, FileFormat=16)
            current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(f"{current_time} Converted: {doc_path} to {docx_path}")
        finally:
            # Close the document even when saving failed, so it does not
            # stay open inside the shared Word instance.
            doc.Close()
        # Return the old and the new file path.
        return doc_path, docx_path
    except Exception as e:
        print(f"Failed to convert {doc_path} due to {str(e)}")
        return None, None
def convert_all_docs_in_folder(folder_path):
    """Convert every ``.doc`` file directly inside a folder via a thread pool.

    One Word application object is created, each file is submitted to the
    pool as a ``doc_to_docx(word, path)`` task, and the original file is
    deleted once its task reports success. Word is asked to quit at the end,
    even if something raised.

    NOTE(review): the same ``word`` object is used from pool threads. COM
    objects are normally bound to the thread that created them, so this may
    fail or serialise inside Word - confirm on a real machine before relying
    on the speed-up.

    Args:
        folder_path: Directory to search for ``*.doc`` files (not recursive).
    """
    word = win32com.client.Dispatch("Word.Application")
    try:
        word.Visible = 0
        # Escape the folder part so characters such as "[" or "*" in the
        # directory name are matched literally instead of as glob syntax.
        pattern = os.path.join(glob.escape(folder_path), '*.doc')
        # Use a thread pool to run the conversions.
        with ThreadPoolExecutor() as executor:
            # The right-hand side was lost in the scraped text; it is
            # rebuilt from the doc_to_docx(word, doc_path) signature and
            # the glob import of this snippet.
            futures = [
                executor.submit(doc_to_docx, word, doc_path)
                for doc_path in glob.glob(pattern)
            ]
            for future in futures:
                old_path, _ = future.result()
                if old_path:
                    # Delete the original only after its conversion finished.
                    os.remove(old_path)
    finally:
        # Always shut Word down so no hidden instance is left running.
        word.Quit()
if __name__ == '__main__':
    # A hidden Tk root window backs the folder-selection dialog.
    hidden_root = Tk()
    hidden_root.withdraw()
    # Convert only when the dialog produced a non-empty folder path.
    chosen_folder = filedialog.askdirectory()
    if chosen_folder:
        convert_all_docs_in_folder(chosen_folder)
现在看看
import os
import win32com.client
from tkinter import filedialog, Tk
import concurrent.futures
def doc_to_docx(doc_path):
    """Convert one ``.doc`` file to ``.docx`` with its own Word object.

    A Word application object is obtained for this call and asked to quit
    afterwards. The original file is not deleted here; the caller does that
    when a path is returned. Any failure is printed and swallowed.

    NOTE(review): the caller in this snippet runs this function on a pool
    thread. COM usually has to be initialised on each thread that uses it -
    confirm that ``Dispatch`` works from a worker thread in your setup.

    Args:
        doc_path: Path of the ``.doc`` file to convert.

    Returns:
        The path of the new ``.docx`` file, or ``None`` on failure.
    """
    # splitext returns a (root, ext) pair; only the root takes the new suffix.
    docx_path = os.path.splitext(doc_path)[0] + '.docx'
    word = None
    try:
        word = win32com.client.Dispatch("Word.Application")
        word.Visible = 0
        doc = word.Documents.Open(doc_path)
        try:
            # 16 is WdSaveFormat.wdFormatDocumentDefault, i.e. .docx.
            doc.SaveAs(docx_path, FileFormat=16)
            print(f"Converted: {doc_path} to {docx_path}")
        finally:
            # Close the document whether or not saving worked.
            doc.Close()
        return docx_path
    except Exception as e:
        print(f"Failed to convert {doc_path} due to {str(e)}")
        return None
    finally:
        # Quit only if Word was actually started; touching an unbound
        # `word` here would hide the error that was just reported.
        if word is not None:
            word.Quit()
def convert_all_docs_in_folder(folder_path):
    """Convert each ``.doc`` entry of a folder through a thread pool.

    Every matching file is submitted to the pool and its result is awaited
    straight away, so files are processed one after another. The original
    file is deleted when the conversion returned a truthy path.

    Args:
        folder_path: Directory whose entries are inspected (not recursive).
    """
    with concurrent.futures.ThreadPoolExecutor() as pool:
        for entry in os.listdir(folder_path):
            # Only names ending in ".doc" (any letter case) are converted.
            if not entry.lower().endswith('.doc'):
                continue
            source = os.path.join(folder_path, entry)
            # Block until this file's conversion has finished.
            converted = pool.submit(doc_to_docx, source).result()
            if converted:
                os.remove(source)
if __name__ == '__main__':
    # A hidden Tk root window backs the folder-selection dialog.
    hidden_root = Tk()
    hidden_root.withdraw()
    # Convert only when the dialog produced a non-empty folder path.
    chosen_folder = filedialog.askdirectory()
    if chosen_folder:
        convert_all_docs_in_folder(chosen_folder)
上述代码使用了 concurrent.futures.ThreadPoolExecutor 来创建一个线程池,实现了对文件转换的并发处理。这样可以更充分地利用系统资源,提高文件转换的效率。
AI 谢谢分享,试一下
页:
[1]
2