本帖最后由 矢岛舞美 于 2024-11-4 11:06 编辑
某天工作中需要将大量 Word 文档中的内容提取到 Excel 表格里进行处理,所以便写了这个小工具。
import os
from docx import Document
import openpyxl
from tkinter import Tk, filedialog
def extract_text_from_docx(docx_path):
    """Read a .docx file and split it into a title and an HTML-ish body.

    The first paragraph's text is used as the title; every following
    paragraph is wrapped in ``<p>...</p>`` and the pieces are concatenated
    with no separator. A document without paragraphs yields two empty
    strings.

    Returns:
        A ``(title, content)`` tuple of strings.
    """
    paras = Document(docx_path).paragraphs
    if not paras:
        return '', ''
    heading, *rest = paras
    wrapped = []
    for para in rest:
        wrapped.append(f'<p>{para.text}</p>')
    return heading.text, ''.join(wrapped)
def create_excel(excel_path):
    """Create a new workbook containing only the header row and save it.

    The active sheet receives a single row with the two column headings,
    then the workbook is written to ``excel_path``.

    Returns:
        The same ``excel_path`` that was passed in.
    """
    header_row = ["标题", "内容"]
    workbook = openpyxl.Workbook()
    workbook.active.append(header_row)
    workbook.save(excel_path)
    return excel_path
def append_to_excel(data, excel_path):
    """Append ``(title, content)`` pairs as rows to an existing workbook.

    The workbook at ``excel_path`` is loaded, one row is added to its active
    sheet for each pair in ``data``, and the file is saved back in place.
    """
    workbook = openpyxl.load_workbook(excel_path)
    sheet = workbook.active
    for heading, body in data:
        row = [heading, body]
        sheet.append(row)
    workbook.save(excel_path)
def main():
    """Interactively export the text of every Word document in a folder to Excel.

    Asks for a source folder and a target ``.xlsx`` path through Tk dialogs,
    walks the folder recursively, extracts a ``(title, content)`` pair from
    each ``.docx`` file, and writes all rows to the workbook in one pass.
    Exits early (after printing a message) if either dialog is cancelled.
    """
    # Show the dialogs with the Tk root window hidden.
    root = Tk()
    root.withdraw()
    try:
        folder_path = filedialog.askdirectory(title="选择包含Word文档的文件夹")
        if not folder_path:
            print("未选择文件夹,程序退出。")
            return
        excel_path = filedialog.asksaveasfilename(
            defaultextension=".xlsx",
            filetypes=[("Excel files", "*.xlsx")],
            title="保存Excel文件",
        )
        if not excel_path:
            print("未选择保存路径,程序退出。")
            return
    finally:
        # Release the hidden root window once the dialogs are done.
        root.destroy()

    # Create the workbook with its header row.
    create_excel(excel_path)

    # Walk the folder recursively and gather every row first, so the
    # workbook is loaded and saved once instead of once per directory.
    data = []
    for root_dir, _, files in os.walk(folder_path):
        for filename in files:
            # Word leaves "~$name.docx" lock files next to open documents;
            # they are not real packages and cannot be parsed.
            if filename.startswith("~$"):
                continue
            # Match the extension case-insensitively (".DOCX" is common).
            if not filename.lower().endswith(".docx"):
                continue
            docx_path = os.path.join(root_dir, filename)
            try:
                data.append(extract_text_from_docx(docx_path))
            except Exception as exc:
                # Batch boundary: report the unreadable file and carry on so
                # one bad document does not discard the rest.
                print(f"跳过无法读取的文件 {docx_path}: {exc}")

    # Append all collected rows to the Excel file.
    append_to_excel(data, excel_path)
    print(f"数据已成功保存到 {excel_path}")


# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
优化版本:解决提取后的内容带有 <p> 标签的问题(改为用换行符分隔各段落)。
import os
from docx import Document
import openpyxl
from tkinter import Tk, filedialog
def extract_text_from_docx(docx_path):
    """Read a .docx file and split it into a title and a plain-text body.

    The first paragraph's text becomes the title; the text of every
    remaining paragraph is joined with newline characters to form the
    content. A document without paragraphs yields two empty strings.

    Returns:
        A ``(title, content)`` tuple of strings.
    """
    paras = Document(docx_path).paragraphs
    if paras:
        heading = paras[0].text
    else:
        heading = ''
    body_lines = []
    for para in paras[1:]:
        body_lines.append(para.text)
    return heading, '\n'.join(body_lines)
def create_excel(excel_path):
    """Create a new workbook containing only the header row and save it.

    The active sheet receives a single row with the two column headings,
    then the workbook is written to ``excel_path``.
    """
    header_row = ["标题", "内容"]
    workbook = openpyxl.Workbook()
    workbook.active.append(header_row)
    workbook.save(excel_path)
def append_to_excel(data, excel_path):
    """Append ``(title, content)`` pairs as rows to an existing workbook.

    Loads the workbook stored at ``excel_path``, adds one row to the active
    sheet per pair in ``data``, then saves the workbook over the same file.
    """
    book = openpyxl.load_workbook(excel_path)
    active_sheet = book.active
    for doc_title, doc_content in data:
        active_sheet.append([doc_title, doc_content])
    book.save(excel_path)
def main():
    """Interactively export the text of every Word document in a folder to Excel.

    Asks for a source folder and a target ``.xlsx`` path through Tk dialogs,
    walks the folder recursively, extracts a ``(title, content)`` pair from
    each ``.docx`` file, and writes all rows to the workbook in one pass.
    Exits early (after printing a message) if either dialog is cancelled.
    """
    # Show the dialogs with the Tk root window hidden.
    root = Tk()
    root.withdraw()
    try:
        folder_path = filedialog.askdirectory(title="选择包含Word文档的文件夹")
        if not folder_path:
            print("未选择文件夹,程序退出。")
            return
        excel_path = filedialog.asksaveasfilename(
            defaultextension=".xlsx",
            filetypes=[("Excel files", "*.xlsx")],
            title="保存Excel文件",
        )
        if not excel_path:
            print("未选择保存路径,程序退出。")
            return
    finally:
        # Release the hidden root window once the dialogs are done.
        root.destroy()

    # Create the workbook with its header row.
    create_excel(excel_path)

    # Walk the folder recursively and gather every row first, so the
    # workbook is loaded and saved once instead of once per directory.
    data = []
    for root_dir, _, files in os.walk(folder_path):
        for filename in files:
            # Word leaves "~$name.docx" lock files next to open documents;
            # they are not real packages and cannot be parsed.
            if filename.startswith("~$"):
                continue
            # Match the extension case-insensitively (".DOCX" is common).
            if not filename.lower().endswith(".docx"):
                continue
            docx_path = os.path.join(root_dir, filename)
            try:
                data.append(extract_text_from_docx(docx_path))
            except Exception as exc:
                # Batch boundary: report the unreadable file and carry on so
                # one bad document does not discard the rest.
                print(f"跳过无法读取的文件 {docx_path}: {exc}")

    # Append all collected rows to the Excel file.
    append_to_excel(data, excel_path)
    print(f"数据已成功保存到 {excel_path}")


# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|