吾爱破解 - 52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 540|回复: 12
上一主题 下一主题
收起左侧

[Python 原创] 批量提取word文档标题

[复制链接]
跳转到指定楼层
楼主
Eks6666 发表于 2025-4-3 09:41 回帖奖励
[Python] 纯文本查看 复制代码
001
002
003
004
005
006
007
008
009
010
011
012
013
014
015
016
017
018
019
020
021
022
023
024
025
026
027
028
029
030
031
032
033
034
035
036
037
038
039
040
041
042
043
044
045
046
047
048
049
050
051
052
053
054
055
056
057
058
059
060
061
062
063
064
065
066
067
068
069
070
071
072
073
074
075
076
077
078
079
080
081
082
083
084
085
086
087
088
089
090
091
092
093
094
095
096
097
098
099
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
import os
import sys
from collections import defaultdict
from docx import Document
from PyQt5.QtWidgets import (QApplication, QMainWindow, QVBoxLayout, QHBoxLayout, QWidget,
                            QLabel, QPushButton, QTextEdit, QFileDialog, QSpinBox,
                            QGroupBox, QProgressBar)
from PyQt5.QtCore import Qt
from PyQt5.QtGui import QFont, QIcon
 
class TitleExtractorApp(QMainWindow):
    """Main window of a tool that batch-extracts heading text from .docx files.

    The user picks a folder and a maximum heading level; every ``.docx`` file
    found under that folder (recursively) is opened with python-docx and the
    text of paragraphs whose style name looks like ``Heading N`` is collected,
    grouped per file and per level. Results can be shown in the window and
    saved to a UTF-8 text file.

    Attributes set by ``init_ui``:
        selected_folder: path chosen by the user, ``""`` until one is chosen.
        extracted_data: mapping of document path (relative to the selected
            folder) to ``{level: [heading text, ...]}``.
    """

    def __init__(self):
        """Configure window title, icon, geometry and stylesheet, then build the UI."""
        super().__init__()
        self.setWindowTitle("Word文档标题提取工具")
        # Supply an icon file next to the script, or delete this line.
        self.setWindowIcon(QIcon('icon.png'))
        self.setGeometry(100, 100, 800, 600)

        # Application-wide look for the main window and its child widgets.
        self.setStyleSheet("""
            QMainWindow {
                background-color: #f5f5f5;
            }
            QGroupBox {
                border: 1px solid #ccc;
                border-radius: 5px;
                margin-top: 10px;
                padding-top: 15px;
                font-size: 14px;
            }
            QGroupBox::title {
                subcontrol-origin: margin;
                left: 10px;
                padding: 0 3px;
            }
            QPushButton {
                background-color: #4CAF50;
                border: none;
                color: white;
                padding: 8px 16px;
                text-align: center;
                text-decoration: none;
                font-size: 14px;
                margin: 4px 2px;
                border-radius: 4px;
            }
            QPushButton:hover {
                background-color: #45a049;
            }
            QPushButton:pressed {
                background-color: #3e8e41;
            }
            QTextEdit {
                border: 1px solid #ccc;
                border-radius: 4px;
                padding: 8px;
                font-family: 'Segoe UI', Arial, sans-serif;
            }
            QSpinBox {
                padding: 5px;
                font-size: 14px;
            }
        """)

        self.init_ui()

    def init_ui(self):
        """Create all widgets, wire button signals and initialise state."""
        # Top-level container and layout.
        main_widget = QWidget()
        main_layout = QVBoxLayout()

        # Caption shown at the top of the window.
        title_label = QLabel("Word文档标题提取工具")
        title_label.setFont(QFont('Arial', 16, QFont.Bold))
        title_label.setAlignment(Qt.AlignCenter)
        title_label.setStyleSheet("color: #333; margin-bottom: 20px;")

        # Settings group: folder picker on the left, level spinner on the right.
        settings_group = QGroupBox("提取设置")
        settings_layout = QHBoxLayout()

        # Folder selection.
        folder_layout = QVBoxLayout()
        self.folder_label = QLabel("未选择文件夹")
        self.folder_label.setStyleSheet("color: #666;")
        browse_button = QPushButton("选择文件夹")
        browse_button.clicked.connect(self.select_folder)
        folder_layout.addWidget(QLabel("文档文件夹:"))
        folder_layout.addWidget(self.folder_label)
        folder_layout.addWidget(browse_button)

        # Maximum heading level to extract (1-6, default 3).
        level_layout = QVBoxLayout()
        level_label = QLabel("最大标题级别:")
        self.level_spin = QSpinBox()
        self.level_spin.setRange(1, 6)
        self.level_spin.setValue(3)
        level_layout.addWidget(level_label)
        level_layout.addWidget(self.level_spin)

        # Progress bar, expressed as a percentage of files handled.
        self.progress_bar = QProgressBar()
        self.progress_bar.setRange(0, 100)
        self.progress_bar.setValue(0)
        self.progress_bar.setTextVisible(False)

        # Assemble the settings group with a 70/30 stretch split.
        settings_layout.addLayout(folder_layout, 70)
        settings_layout.addLayout(level_layout, 30)
        settings_group.setLayout(settings_layout)

        # Action buttons.
        button_layout = QHBoxLayout()
        extract_button = QPushButton("提取标题")
        extract_button.clicked.connect(self.extract_titles)
        extract_button.setStyleSheet("background-color: #2196F3;")

        save_button = QPushButton("保存结果")
        save_button.clicked.connect(self.save_results)
        save_button.setStyleSheet("background-color: #FF9800;")

        clear_button = QPushButton("清空结果")
        clear_button.clicked.connect(self.clear_results)
        clear_button.setStyleSheet("background-color: #f44336;")

        button_layout.addWidget(extract_button)
        button_layout.addWidget(save_button)
        button_layout.addWidget(clear_button)

        # Read-only result view.
        result_group = QGroupBox("提取结果")
        result_layout = QVBoxLayout()
        self.result_text = QTextEdit()
        self.result_text.setReadOnly(True)
        result_layout.addWidget(self.result_text)
        result_group.setLayout(result_layout)

        # Assemble the main layout top to bottom.
        main_layout.addWidget(title_label)
        main_layout.addWidget(settings_group)
        main_layout.addWidget(self.progress_bar)
        main_layout.addLayout(button_layout)
        main_layout.addWidget(result_group)

        main_widget.setLayout(main_layout)
        self.setCentralWidget(main_widget)

        # Status bar.
        self.statusBar().showMessage("准备就绪")

        # Initial state.
        self.selected_folder = ""
        self.extracted_data = {}

    def select_folder(self):
        """Ask the user for a folder and remember it if one was chosen."""
        folder = QFileDialog.getExistingDirectory(self, "选择包含Word文档的文件夹")
        if folder:
            self.selected_folder = folder
            self.folder_label.setText(folder)
            self.statusBar().showMessage(f"已选择文件夹: {folder}")

    @staticmethod
    def _find_docx_files(folder):
        """Return the paths of all .docx files under *folder*, recursively.

        The extension match is case-insensitive. Files whose name starts with
        ``~$`` are skipped: Word creates such owner/lock files next to an open
        document and they are not parseable documents.
        """
        paths = []
        for root, _, files in os.walk(folder):
            for filename in files:
                if filename.startswith('~$'):
                    continue
                if filename.lower().endswith('.docx'):
                    paths.append(os.path.join(root, filename))
        return paths

    @staticmethod
    def _extract_headings(file_path, max_level):
        """Open one document and return ``{level: [heading text, ...]}``.

        Only paragraphs whose style name starts with ``Heading`` and whose
        second whitespace-separated token parses as an int ``<= max_level``
        are kept. Any exception raised while opening the file propagates to
        the caller.
        """
        doc = Document(file_path)
        file_data = defaultdict(list)

        for p in doc.paragraphs:
            # Guard against a paragraph with no resolvable style or an
            # unnamed style so one odd paragraph cannot abort the whole file.
            style_name = getattr(p.style, 'name', None) or ""
            if not style_name.startswith('Heading'):
                continue
            try:
                level = int(style_name.split()[1])
            except (IndexError, ValueError):
                # e.g. a style named just "Heading" or "Heading Foo".
                continue
            if level <= max_level:
                file_data[level].append(p.text)

        return file_data

    def extract_titles(self):
        """Extract headings from every .docx under the selected folder.

        Fills ``self.extracted_data`` (keyed by the document path relative to
        the selected folder, so same-named files in different subfolders do
        not overwrite each other), updates the progress bar after each file
        whether it succeeded or failed, then shows the results followed by
        any per-file error messages.
        """
        if not self.selected_folder:
            self.statusBar().showMessage("请先选择文件夹!", 3000)
            return

        max_level = self.level_spin.value()
        self.result_text.clear()
        self.extracted_data = {}

        # Walk the tree once; the list gives both the total and the work queue.
        docx_paths = self._find_docx_files(self.selected_folder)
        total_files = len(docx_paths)
        if total_files == 0:
            self.statusBar().showMessage("所选文件夹中没有找到Word文档!", 3000)
            return

        self.progress_bar.setValue(0)
        processed_files = 0  # number of files parsed successfully
        error_messages = []

        for index, file_path in enumerate(docx_paths, 1):
            filename = os.path.basename(file_path)
            try:
                file_data = self._extract_headings(file_path, max_level)
            except Exception as e:
                # Best-effort batch: a broken document must not stop the run.
                # Messages are buffered because display_results() clears the
                # result view before rendering.
                error_messages.append(f"处理文件 {filename} 时出错: {str(e)}\n")
            else:
                if file_data:
                    key = os.path.relpath(file_path, self.selected_folder)
                    self.extracted_data[key] = file_data
                processed_files += 1

            # Advance on failures too, otherwise the bar stalls below 100%.
            self.progress_bar.setValue(int((index / total_files) * 100))

        # Show results, then the buffered errors underneath them.
        self.display_results()
        for message in error_messages:
            self.result_text.append(message)

        self.progress_bar.setValue(100)
        self.statusBar().showMessage(f"提取完成!共处理 {processed_files} 个文件", 5000)

    def display_results(self):
        """Replace the result view's content with ``self.extracted_data``."""
        self.result_text.clear()
        if not self.extracted_data:
            self.result_text.append("没有提取到任何标题数据")
            return

        for filename, levels in self.extracted_data.items():
            self.result_text.append(f"=== {filename} ===")
            for level in sorted(levels):
                self.result_text.append(f"\n[标题 {level}]")
                for i, title in enumerate(levels[level], 1):
                    self.result_text.append(f"{i}. {title}")
            self.result_text.append("\n")

    def save_results(self):
        """Write ``self.extracted_data`` to a user-chosen UTF-8 text file.

        Does nothing but show a status message when there is no data; any
        exception while writing is reported in the status bar.
        """
        if not self.extracted_data:
            self.statusBar().showMessage("没有可保存的数据!", 3000)
            return

        file_path, _ = QFileDialog.getSaveFileName(self, "保存结果", "", "文本文件 (*.txt)")
        if file_path:
            try:
                with open(file_path, 'w', encoding='utf-8') as f:
                    for filename, levels in self.extracted_data.items():
                        f.write(f"=== {filename} ===\n")
                        for level in sorted(levels):
                            f.write(f"\n[标题 {level}]\n")
                            for i, title in enumerate(levels[level], 1):
                                f.write(f"{i}. {title}\n")
                        f.write("\n")
                self.statusBar().showMessage(f"结果已保存到: {file_path}", 5000)
            except Exception as e:
                self.statusBar().showMessage(f"保存失败: {str(e)}", 5000)

    def clear_results(self):
        """Empty the result view, drop extracted data and reset the progress bar."""
        self.result_text.clear()
        self.extracted_data = {}
        self.progress_bar.setValue(0)
        self.statusBar().showMessage("已清空结果", 3000)
 
if __name__ == "__main__":
    # Script entry point: create the Qt application, show the main window
    # and hand control to the event loop until the window is closed.
    application = QApplication(sys.argv)

    # Application-wide default font.
    default_font = QFont()
    default_font.setFamily("Segoe UI")
    default_font.setPointSize(10)
    application.setFont(default_font)

    main_window = TitleExtractorApp()
    main_window.show()

    # Propagate the event loop's return code as the process exit status.
    exit_code = application.exec_()
    sys.exit(exit_code)

免费评分

参与人数 1吾爱币 +7 热心值 +1 收起 理由
苏紫方璇 + 7 + 1 欢迎分析讨论交流,吾爱破解论坛有你更精彩!

查看全部评分

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

推荐
策士 发表于 2025-4-4 20:05
怎么打包?我的输入法打不出来汉字了,咋办
沙发
ronalp 发表于 2025-4-3 09:51
3#
gts5122 发表于 2025-4-3 10:30
4#
realcsnake 发表于 2025-4-3 11:03
应该是批量提取很多文件的标题
5#
xiaoruan1980 发表于 2025-4-3 11:21
这段代码目的是提取word文档内容里的标题吗?
6#
stormdzh 发表于 2025-4-3 11:37
楼主给个示例看看?
7#
Mblacker 发表于 2025-4-3 11:37
感谢分享
8#
alucard0992 发表于 2025-4-3 12:28
楼主举个例子看看?不明白,谢谢
9#
xingdh 发表于 2025-4-3 13:14
看代码的格式,很像是AI写的,不像原创。
10#
pyjiujiu 发表于 2025-4-3 14:58
理解一下,应该是提取 设置的不同层级的标题,结果就是 类似大纲一样的
您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则

返回列表

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - LCG - LSG ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2025-4-7 05:02

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表