import re
import jieba

def seg_sentence(sentence):
    sentence = re.sub(r'[0-9.]+', '', sentence)  # strip digits and decimal points
    jieba.load_userdict('自建词表.txt')  # load the user-defined word list
    # jieba.suggest_freq((), tune=True)  # tune word frequency so a phrase segments as one token
    # jieba.add_word('知识集成')  # add user-defined words to supplement the jieba dictionary
    sentence_seged = jieba.cut(sentence.strip(), cut_all=False)  # default precise mode
    # sentence_seged = jieba.cut_for_search(sentence.strip(), HMM=True)  # search-engine mode
    # keywords = jieba.analyse.extract_tags(sentence, topK=30, withWeight=True, allowPOS=('n', 'v', 'nr', 'ns'))  # keyword mode
    # sentence_seged = [item[0] for item in keywords]
    stopwords = stopwordslist('停用词表.txt')  # stopword list path (helper defined earlier in the post)
    synwords = synwordslist('近义词表.txt')  # synonym list path (helper defined earlier in the post)
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords and len(word) > 1:  # drop stopwords and single characters
            if word != '\t':
                if word in synwords:  # replace a synonym with its canonical word
                    word = synwords[word]
                outstr += word + " "
    return outstr
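A minimal driver for this step (a sketch; '原始文本.txt' and '分词结果.txt' are assumed filenames, not from the original code) reads the raw corpus line by line, segments it, and saves the result for step 2:
# Sketch: segment a raw UTF-8 corpus (one document per line) and save the result.
# The two filenames below are illustrative assumptions.
with open('原始文本.txt', 'r', encoding='utf-8') as fin, \
     open('分词结果.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        fout.write(seg_sentence(line) + '\n')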
2. Load the segmented file and build the dictionary and vector space. Importing the segmented file:
def infile(filepath):
    # Read a TXT file of segmented text and return train (a list of token lists)
    '''
    # Disabled alternative: count word frequencies and collect low-frequency words
    all = []
    with open(filepath, 'r', encoding='utf-8') as f:
        all_1 = list(f.readlines())  # list of lines
    for i in all_1:  # one document per line
        i = i.strip()  # strip surrounding whitespace
        if i:
            all = all + i.split(' ')
    # count word frequencies in a dict
    dic = {}
    for key in all:
        dic[key] = dic.get(key, 0) + 1
    # print(dic)
    # collect the low-frequency words to filter out
    all_2 = []  # low-frequency word list
    for key, value in dic.items():
        if value <= 5:
            all_2.append(key)
    '''
    train = []
    with open(filepath, 'r', encoding='utf-8') as fp:
        for line in fp:
            if len(line) > 1:
                new_line = [w for w in line.strip().split(' ') if w]
                if len(new_line) > 1:
                    train.append(new_line)
    return train
Building the dictionary and vector space:
import os
from gensim import corpora, models

def deal(train):
    # Input train; return the dictionary, the texts, and the vectorized corpus
    id2word = corpora.Dictionary(train)  # create the dictionary
    texts = train  # the corpus as token lists
    corpus = [id2word.doc2bow(text) for text in texts]  # term-document frequency (bag of words)
    # re-weight the counts with TF-IDF
    tfidf = models.TfidfModel(corpus)
    corpus = tfidf[corpus]
    os.makedirs('tmp', exist_ok=True)  # make sure the output directory exists
    id2word.save('tmp/deerwester.dict')  # save the dictionary
    corpora.MmCorpus.serialize('tmp/deerwester.mm', corpus)  # save the corpus
    return id2word, texts, corpus
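Chaining the two helpers gives the inputs the LDA step needs; a minimal sketch, assuming the segmented output from step 1 was saved as '分词结果.txt' (an illustrative filename):
# Sketch: build the dictionary and TF-IDF corpus from the segmented file.
train = infile('分词结果.txt')  # illustrative filename
id2word, texts, corpus = deal(train)
print(len(texts), 'documents,', len(id2word), 'unique tokens')
first = next(iter(corpus))
print(first[:5])  # first few (token_id, tf-idf weight) pairs of document 0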
3. Run the LDA analysis. Here, the topic coherence score and the perplexity score are used to determine a reasonable number of topics.
from gensim.models import LdaModel, CoherenceModel

def run(corpus_1, id2word_1, num, texts):
    # Standard LDA
    lda_model = LdaModel(corpus=corpus_1,
                         id2word=id2word_1,
                         num_topics=num,   # num: the number of topics
                         passes=60,        # passes: the number of training passes
                         alpha=50 / num,   # symmetric document-topic prior (50/K heuristic)
                         eta=0.01,         # topic-word prior
                         random_state=42)
    # print the topics
    # topic_list = lda_model.print_topics()
    # for topic in topic_list:
    #     print(topic)
    # perplexity: gensim returns the per-word likelihood bound;
    # the corresponding perplexity is 2**(-bound), and lower perplexity is better
    perplex = lda_model.log_perplexity(corpus_1)
    # coherence
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word_1, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    # print('\nCoherence score: ', coherence_lda)  # higher is better
    return lda_model, coherence_lda, perplex
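To train and inspect a single model, run can be called directly; the topic number 10 below is only an illustrative choice:
# Sketch: train one model and print its topics (10 topics is illustrative).
lda_model, coherence, perplex = run(corpus, id2word, 10, texts)
for topic in lda_model.print_topics(num_words=10):
    print(topic)
print('coherence:', coherence, 'log perplexity bound:', perplex)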
def compute_coherence_values(dictionary, corpus, texts, start, limit, step):
    """
    Compute c_v coherence (and perplexity) for various numbers of topics.

    Parameters
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : list of tokenized input texts
    start : smallest number of topics to try
    limit : maximum number of topics (exclusive)
    step : step size between candidate topic numbers

    Returns
    -------
    model_list : list of LDA topic models
    coherence_values : coherence value for each model
    perplexs : perplexity value for each model
    """
    coherence_values = []
    perplexs = []
    model_list = []
    for num_topic in range(start, limit, step):
        # train one model per candidate topic number
        lda_model, coherence_lda, perplex = run(corpus, dictionary, num_topic, texts)
        # lda_model = LdaModel(corpus=corpus, num_topics=num_topic, id2word=dictionary, passes=50)
        model_list.append(lda_model)
        perplexs.append(perplex)  # perplexity
        # coherence
        # coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        # coherence_lda = coherence_model_lda.get_coherence()
        coherence_values.append(coherence_lda)
    return model_list, coherence_values, perplexs
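A minimal sketch for scanning candidate topic numbers and picking the model with the highest coherence; the 2-20 range and the matplotlib plot are illustrative, not part of the original code:
import matplotlib.pyplot as plt

# Sketch: scan candidate topic numbers (the 2..20 range is illustrative).
start, limit, step = 2, 21, 2
model_list, coherence_values, perplexs = compute_coherence_values(
    id2word, corpus, texts, start, limit, step)

x = list(range(start, limit, step))
plt.plot(x, coherence_values, marker='o', label='c_v coherence (higher is better)')
plt.xlabel('Number of topics')
plt.legend()
plt.show()

# Pick the model with the highest coherence score.
best = max(range(len(x)), key=lambda i: coherence_values[i])
print('Best number of topics:', x[best])
optimal_model = model_list[best]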