# [Python] (forum "view as plain text / copy code" banner, commented out so the file parses)
import jieba
import xlwt
import os
# Load the stop-word list: one word per line, trailing whitespace stripped.
# A dict with None values is used purely for fast `in` membership tests.
# The `with` block guarantees the file handle is closed once the list is read.
with open('cn_stopwords.txt', encoding='utf-8') as stopwords_file:
    stopwords = dict.fromkeys(line.rstrip() for line in stopwords_file)
# Scan a folder of .txt files, count keyword occurrences and save them to Excel.
def matchKeyWords(ThePath, keyWords, aim_path):
    """Count keyword occurrences in every ``.txt`` file of a folder and save an ``.xls`` sheet.

    One row is written per ``.txt`` file found directly in ``ThePath``:
    stock code, short name, year, total word count, then one column per
    keyword. File names are split on ``_`` and read as
    ``<code>_<year>_<name>...``; fields that are missing from a file name are
    left blank.

    The total word count is the number of jieba tokens that are not in the
    module-level ``stopwords`` collection. Keyword counts are plain substring
    counts (``str.count``) on the raw, unsegmented text.

    Args:
        ThePath: Folder containing the ``.txt`` files to analyse.
        keyWords: Sequence of keywords; each gets its own column, in order.
        aim_path: Path of the ``.xls`` workbook to write.
    """
    # Create the output folder up front so a missing directory cannot make
    # the final save fail after every file has already been processed.
    out_dir = os.path.dirname(aim_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('关键词词频统计', cell_overwrite_ok=True)

    # Header row: four fixed columns followed by one column per keyword.
    for col, title in enumerate(['代码', '简称', '年份', '总词数', *keyWords]):
        sheet.write(0, col, title)

    row = 1  # row 0 holds the header
    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    for file in sorted(os.listdir(ThePath)):
        stem, ext = os.path.splitext(file)
        if ext != ".txt":
            continue

        # Split the stem (not the full name) so the ".txt" extension never
        # ends up inside a field, and pad so short names yield blank fields
        # instead of raising IndexError.
        parts = stem.split("_")
        stock_code, year, stock_name = (parts + ["", ""])[:3]

        print(f'正在统计{file}')
        txt_path = os.path.join(ThePath, file)
        with open(txt_path, "r", encoding='utf-8', errors='ignore') as fp:
            text = fp.read()

        # Segment with jieba and count the tokens that are not stop words.
        total_words = sum(1 for word in jieba.cut(text) if word not in stopwords)

        sheet.write(row, 0, stock_code)
        sheet.write(row, 1, stock_name)
        sheet.write(row, 2, year)
        # Counts are written as text cells.
        sheet.write(row, 3, str(total_words))
        for col, word in enumerate(keyWords, start=4):
            sheet.write(row, col, str(text.count(word)))
        row += 1

    book.save(aim_path)
ThePath = r'G:\年报\年报TXT版'  # folder containing the annual-report .txt files
aim_path = r'G:\年报\词频统计'  # folder where the frequency statistics are stored
keywords = ['营业收入','估值','资产','股东','智能数据分析','智能机器人','机器学习','深度学习']  # keywords to count
# Build the workbook path with os.path.join: a literal backslash before a
# non-escape character inside a non-raw string is an invalid escape sequence.
matchKeyWords(ThePath, keywords, os.path.join(aim_path, '词频统计.xls'))