# [Python] View as plain text / Copy code  (forum code-viewer header, not part of the program)
import pandas as pd
# from jieba import posseg  # for Chinese word segmentation (currently unused)
def chinese_char_count(some_string):
    """Return the number of Chinese ideographs in ``some_string``.

    A character is counted when its code point falls in one of the CJK
    ideograph ranges listed below. A bare ``c >= '\\u4e00'`` test is not
    enough: it also counts full-width punctuation such as ``（`` and ``，``,
    Hangul syllables, and every other code point above U+4E00.

    Args:
        some_string: Any iterable of single-character strings (normally a
            ``str``).

    Returns:
        The count of ideographic characters; ``0`` for an empty string.
    """
    ideograph_ranges = (
        ('\u4e00', '\u9fff'),          # CJK Unified Ideographs
        ('\uf900', '\ufaff'),          # CJK Compatibility Ideographs
        ('\U00020000', '\U000323af'),  # supplementary-plane extensions
    )
    return sum(
        1
        for c in some_string
        if any(low <= c <= high for low, high in ideograph_ranges)
    )
def match_by_similarity(df, full_col, kw_col, len_threshold=3):
    """Fuzzy-match each row's keyword against every value of ``full_col``.

    Two strings "match" when the set of distinct characters they share
    contains at least ``len_threshold`` Chinese characters, as counted by
    ``chinese_char_count``. Character order and position are ignored. For
    each row, the first value of ``full_col`` (in row order) that matches the
    row's ``kw_col`` value is written to the ``'Matched'`` column; rows with
    no match receive the literal ``"没有匹配到"``.

    Args:
        df: DataFrame holding both columns. It is modified in place: a
            ``'Matched'`` column is added or overwritten.
        full_col: Name of the column with the candidate strings.
        kw_col: Name of the column with the keyword strings.
        len_threshold: Minimum number of distinct shared Chinese characters
            required for a match.

    Returns:
        The same ``df`` object, with the ``'Matched'`` column filled in.
    """
    # Build each candidate's character set once rather than once per keyword
    # row. Non-string cells (e.g. NaN from an empty spreadsheet cell) cannot
    # be split into characters, so they are never offered as candidates.
    candidates = [
        (full_name, set(full_name))
        for full_name in df[full_col]
        if isinstance(full_name, str)
    ]

    matched = []
    for keywords in df[kw_col]:
        # A non-string keyword shares no characters with anything.
        keyword_chars = set(keywords) if isinstance(keywords, str) else set()
        first_hit = next(
            (
                full_name
                for full_name, full_chars in candidates
                if chinese_char_count(''.join(full_chars & keyword_chars))
                >= len_threshold
            ),
            "没有匹配到",
        )
        matched.append(first_hit)

    # One positional column write; also correct when index labels repeat.
    df['Matched'] = matched
    return df
# Load the input workbook.
df = pd.read_excel('test3.xlsx')
# Run the fuzzy match: for each row's "市" value, look for a "详细地址" value
# that shares enough Chinese characters with it.
data = match_by_similarity(df, "详细地址", "市")
print(data)