分享一段自己练手的代码:根据关键字查找内容包含该关键字的 Word 文档。代码如下,欢迎拍砖。
import os
from docx import Document
from win32com.client import Dispatch
# Accumulator filled by getFiles(); read later by the search step.
listFiles = []


def getFiles(path):
    """Recursively collect the path of every file under *path*.

    Each regular file found is appended to the module-level ``listFiles``
    list as ``os.path.join(parent, name)``; sub-directories are descended
    into depth-first, in the order ``os.listdir`` reports them.

    Args:
        path: Directory to scan.

    Raises:
        OSError: If *path* (or a sub-directory) cannot be listed.
    """
    for child in os.listdir(path):
        # os.path.join picks the right separator and copes with a trailing
        # one on *path*, unlike manual '\\' concatenation.
        full_path = os.path.join(path, child)
        if os.path.isfile(full_path):
            listFiles.append(full_path)
        elif os.path.isdir(full_path):
            getFiles(full_path)
        # Anything else (e.g. a dangling symlink) is neither a file nor a
        # directory; skip it instead of letting os.listdir() fail on it.
# Collect the paths of all files under the search root into listFiles.
getFiles(r'd:\test')
# Keyword to search for, read interactively from the user.
strQueryText = input('输入搜索关键字:')
# Paths of the files whose text contains the keyword (appended to by toQuery()).
listQueryResult = []
def _read_docx_text(path):
    """Return the text of a .docx file: body paragraphs plus table cells."""
    document = Document(path)
    parts = [paragraph.text for paragraph in document.paragraphs]
    # Document.paragraphs does not include text inside tables, so gather the
    # cell text as well; the .doc branch searches tables too.
    for table in document.tables:
        for row in table.rows:
            parts.extend(cell.text for cell in row.cells)
    # Join with a newline so a keyword cannot match across two unrelated
    # paragraphs that merely happen to be adjacent.
    return '\n'.join(parts)


def _read_doc_text(word, path):
    """Return the text of a legacy .doc file opened through a Word instance."""
    # NOTE(review): Word documents the Encoding argument as an MsoEncoding
    # constant; the string 'gbk' is kept from the original code -- confirm
    # that Word honours it.
    doc = word.Documents.Open(FileName=path, Encoding='gbk')
    try:
        # Range.Text already ends with Word's paragraph/cell end marks, so no
        # extra separator is needed when joining.
        parts = [para.Range.Text for para in doc.Paragraphs]
        for table in doc.Tables:
            for row in table.Rows:
                parts.extend(cell.Range.Text for cell in row.Cells)
    finally:
        # Always release the document, even if reading it failed.
        doc.Close()
    return ''.join(parts)


def toQuery():
    """Search every file in ``listFiles`` for ``strQueryText``.

    ``.docx`` files are read with python-docx and ``.doc`` files through
    Word's COM automation interface; the extension check is
    case-insensitive. Each file whose text contains the keyword is appended
    to the module-level ``listQueryResult`` list and its path is printed.
    Files with any other extension are reported and ignored.

    Word is started only when the first ``.doc`` file is encountered, is
    reused for all later ``.doc`` files, and is always quit when the search
    ends, including when an error interrupts it.
    """
    word = None
    try:
        for curFile in listFiles:
            # Word keeps "~$name" owner/lock files next to open documents;
            # they carry a Word extension but are not readable documents.
            if os.path.basename(curFile).startswith('~$'):
                continue
            extension = os.path.splitext(curFile)[1].lower()
            if extension == '.docx':
                strQueryContent = _read_docx_text(curFile)
            elif extension == '.doc':
                if word is None:
                    # NOTE: DispatchEx('Word.Application') would start a
                    # separate Word process instead of using Dispatch.
                    word = Dispatch('Word.Application')
                    word.Visible = 0
                    word.DisplayAlerts = 0
                # Word resolves relative names against its own working
                # directory, so hand it an absolute path.
                strQueryContent = _read_doc_text(word, os.path.abspath(curFile))
            else:
                print('[注意] 仅支持docx或doc文件,' + curFile + '被忽略。')
                continue
            if strQueryText in strQueryContent:
                listQueryResult.append(curFile)
                print(curFile)
    finally:
        if word is not None:
            # Quit must actually be called; a bare ``word.Quit`` attribute
            # access does not reliably shut Word down.
            word.Quit()
# Run the keyword search over every collected file.
toQuery()
# Report the outcome: either a "nothing found" notice or the matching paths.
if not listQueryResult:
    print('[结果] 无包含关键字的搜索结果')
else:
    print('[结果] 包含关键字的文件:')
    print(listQueryResult)
|