# [Python] View as plain text / Copy code  (leftover header from the web page this listing was copied from)
import tkinter as tk
from tkinter import filedialog
from tkinter import messagebox
from functools import partial
from threading import Thread
import pandas as pd
import re
import requests
from lxml import etree
from openpyxl import load_workbook
from concurrent.futures import ThreadPoolExecutor
from openpyxl.styles import Font, NamedStyle
def get_word_info(word, timeout=10):
    """Scrape pronunciations and the paraphrase of ``word`` from Youdao.

    Requests ``https://www.youdao.com/w/eng/<word>``, parses the HTML and
    reads, beneath the element with id ``phrsListTab``, the text of the first
    and second pronunciation spans (treated here as British and American) and
    the concatenated text of the ``ul`` definition list.

    Args:
        word: The word to look up.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block a worker thread indefinitely.

    Returns:
        A ``(British_pronunciation, American_pronunciation, paraphrase)``
        tuple, or ``None`` when ``word`` is empty or the lookup fails for any
        reason (network error, a pronunciation span missing from the page,
        ...). Failures are printed together with the word.
    """
    from urllib.parse import quote

    if not word:
        # Nothing to look up; avoid requesting the bare dictionary URL.
        return None

    # Percent-encode the word so characters such as '?', '#', '/' or spaces
    # cannot change the meaning of the URL.
    url = f'https://www.youdao.com/w/eng/{quote(word, safe="")}'
    try:
        paraphrase = ""
        data = requests.get(url, timeout=timeout).text
        html = etree.HTML(data)
        # Indexing with [0] raises IndexError when a span is absent; that is
        # handled below and reported as "no result".
        British_pronunciation = html.xpath('//*[@id="phrsListTab"]/h2/div/span[1]/span/text()')[0]
        American_pronunciation = html.xpath('//*[@id="phrsListTab"]/h2/div/span[2]/span/text()')[0]
        li_elements = html.xpath('//*[@id="phrsListTab"]/div/ul')
        # If several lists match, the text of the last one is kept.
        for li in li_elements:
            paraphrase = ''.join(li.xpath('.//text()'))
        return British_pronunciation, American_pronunciation, paraphrase
    except Exception as e:
        # Deliberate best-effort: callers treat None as "not found" and may
        # retry with a different word form.
        print(e, word)
        return None
def _suffix_fallbacks(word):
    """Return candidate base forms to retry when ``word`` itself has no entry."""
    if word.endswith('ing'):
        stem = word[:-3]
        # "walking" -> "walk", "making" -> "make"
        candidates = [stem, stem + 'e'] if stem else []
    elif word.endswith('ed'):
        # "liked" -> "like", "walked" -> "walk"
        candidates = [word[:-1], word[:-2]]
    elif word.endswith('es'):
        # "likes" -> "like", "boxes" -> "box"
        candidates = [word[:-1], word[:-2]]
    elif word.endswith('s'):
        candidates = [word[:-1]]
    else:
        candidates = []
    return [candidate for candidate in candidates if candidate]


def process_text_file(file_path):
    """Split a text file into unique words and export them with lookups to Excel.

    Steps:
      1. Read ``file_path`` and split its content into tokens.
      2. Lower-case, trim stray punctuation/whitespace, de-duplicate and sort
         the tokens; drop empty tokens and tokens containing an apostrophe,
         a CJK character or a digit.
      3. Write the words to an ``.xlsx`` file next to the input (same path
         with the extension replaced) under the column ``Words``.
      4. Look every word up concurrently with ``get_word_info`` and fill
         columns 2-4 (pronunciations and paraphrase). When a lookup fails,
         likely base forms of the word are tried (see ``_suffix_fallbacks``).
      5. Save the workbook and show a completion message box.

    Args:
        file_path: Path of the text file to process.
    """
    import os

    # Read the text file (platform default encoding, as before).
    with open(file_path, 'r') as file:
        content = file.read()

    # Split the content into a list of words.
    words = re.split(r"\b[,.:?!()'\"\s\n\t\r]+?\b", content)

    # The split only cuts *between* word characters, so tokens at the start or
    # end of the text (or next to unlisted symbols) can keep punctuation or
    # whitespace attached. Trim it, lower-case, and de-duplicate.
    edge_chars = ",.:?!()'\" \t\r\n\f\v"
    unique_words = {word.strip(edge_chars).lower() for word in words}
    # An empty token would produce a blank row and a pointless request.
    unique_words.discard('')

    # Drop tokens containing apostrophes (contractions), CJK characters or digits.
    filtered_words = [
        word
        for word in sorted(unique_words)
        if "'" not in word
        and not re.search(r'[\u4e00-\u9fff]', word)
        and not re.search(r'\d', word)
    ]

    # Export to Excel. Only the extension is swapped: a plain
    # str.replace('.txt', ...) would also rewrite '.txt' inside directory
    # names and, for a path without '.txt', would overwrite the input file.
    df = pd.DataFrame(filtered_words, columns=['Words'])
    output_file = os.path.splitext(file_path)[0] + '.xlsx'
    df.to_excel(output_file, index=False)

    # Re-open the workbook to add the extra columns.
    workbook = load_workbook(output_file)
    worksheet = workbook.active

    # Bold header cells for the three lookup columns.
    bold_style = NamedStyle(name="bold_style")
    bold_style.font = Font(bold=True)
    headers = ("British_pronunciation", "American_pronunciation", "paraphrase")
    for column, header in enumerate(headers, start=2):
        header_cell = worksheet.cell(row=1, column=column, value=header)
        header_cell.style = bold_style

    # Issue all lookups through a thread pool, then collect them in order.
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(get_word_info, word) for word in filtered_words]

        # Row 1 is the header, so the i-th word lives in row i + 2. Pairing
        # the futures with the list they were created from keeps word and
        # result aligned without re-reading the worksheet.
        for row_index, (word, future) in enumerate(zip(filtered_words, futures), start=2):
            word_info = future.result()

            if not word_info:
                # No entry for the inflected form: retry with base forms.
                for candidate in _suffix_fallbacks(word):
                    word_info = get_word_info(candidate)
                    if word_info:
                        break

            if word_info:
                # word_info is (British, American, paraphrase) -> columns 2..4.
                for column, value in enumerate(word_info, start=2):
                    worksheet.cell(row=row_index, column=column).value = value

    # Save the modified Excel file.
    workbook.save(output_file)
    messagebox.showinfo('Success', 'Process completed successfully.')
def browse_file(file_entry):
    """Ask the user for a ``.txt`` file and show its path in ``file_entry``.

    Args:
        file_entry: Entry widget whose content is replaced by the chosen path.
            It is left untouched when no path is returned by the dialog.
    """
    chosen_path = filedialog.askopenfilename(filetypes=[('Text Files', '*.txt')])
    if not chosen_path:
        # No file was picked; keep whatever the entry currently holds.
        return

    # Replace the current entry text with the selected path.
    file_entry.delete(0, tk.END)
    file_entry.insert(tk.END, chosen_path)
def execute_function(file_entry):
    """Start processing the file named in ``file_entry`` on a worker thread.

    Shows an error box and returns when the entry is empty. Otherwise the
    Execute button is disabled while ``process_text_file`` runs in a
    background thread, and re-enabled once that thread has finished (whether
    it succeeded or raised), so another file can be processed without
    restarting the application.

    Args:
        file_entry: Entry widget holding the path of the text file.
    """
    file_path = file_entry.get()
    if not file_path:
        messagebox.showerror('Error', 'Please select a file.')
        return

    # Prevent a second run from being started while one is in progress.
    execute_button.config(state=tk.DISABLED)
    thread = Thread(target=process_text_file, args=(file_path,))
    thread.start()

    def restore_button_when_done():
        # Poll from the Tk event loop so the widget is only touched from the
        # GUI thread; the worker itself never re-enables the button.
        if thread.is_alive():
            execute_button.after(200, restore_button_when_done)
        else:
            execute_button.config(state=tk.NORMAL)

    execute_button.after(200, restore_button_when_done)
# Build the main application window.
window = tk.Tk()
window.title("英文文章切割为单词 V1.0")
window["bg"] = "sky blue"

# File selection widgets: caption, path entry and a browse button.
file_label = tk.Label(master=window, text="Select a text file:", bg="sky blue")
file_label.pack()

file_entry = tk.Entry(master=window, width=50)
file_entry.pack()

browse_button = tk.Button(
    master=window,
    text="Browse",
    command=partial(browse_file, file_entry),
)
browse_button.pack()

# Button that starts processing of the selected file.
execute_button = tk.Button(
    master=window,
    text="Execute",
    command=partial(execute_function, file_entry),
)
execute_button.pack()

# Hand control to the Tk event loop.
window.mainloop()