import csv
import json
import os
from collections import Counter
def count_words_in_txt(file_path):
    """Count how often each whitespace-separated word occurs in a text file.

    The file is read as UTF-8, one line at a time.  Words are compared
    exactly as written: no case folding and no punctuation stripping.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A plain ``dict`` mapping each word to its number of occurrences.
        It is empty when the file does not exist.  When decoding fails
        part-way through, a message is printed and whatever was counted
        before the failure is still returned.
    """
    counts = Counter()
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # split() without arguments already ignores leading/trailing
                # whitespace and yields nothing for blank lines.
                counts.update(line.split())
    except FileNotFoundError:
        print(f"文本文件 {file_path} 未找到。")
    except UnicodeDecodeError:
        print(f"文件 {file_path} 解码错误。")
    # Hand back a plain dict so callers that print or compare the result
    # see the same type as before.
    return dict(counts)
def process_csv(file_path, output_path):
    """Extract the first column of a CSV file, sort it, and write it out.

    Empty rows are skipped.  Every remaining row takes part in a plain
    string sort, including a header row if the file has one.  The output
    file contains one value per row.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to read.
        output_path: Path of the CSV file to write.

    Returns:
        None.  When the input is missing or is not valid UTF-8, a message
        is printed and no output file is written.  When the output cannot
        be created because its directory is missing, a message naming the
        output path is printed.
    """
    try:
        # newline='' lets the csv module handle line endings itself, so
        # quoted fields containing line breaks are parsed correctly.
        with open(file_path, 'r', encoding='utf-8', newline='') as file:
            data = sorted(row[0] for row in csv.reader(file) if row)
    except FileNotFoundError:
        print(f"CSV文件 {file_path} 未找到。")
        return
    except UnicodeDecodeError:
        print(f"CSV文件 {file_path} 解码错误。")
        return

    # Writing is handled separately so a failure here is reported against
    # the output path rather than being mistaken for a missing input file.
    try:
        with open(output_path, 'w', encoding='utf-8', newline='') as file:
            csv.writer(file).writerows([item] for item in data)
    except FileNotFoundError:
        print(f"输出路径 {output_path} 不存在,无法写入。")
def merge_json_info(file_paths, output_path):
    """Collect the top-level ``info`` value of several JSON files into one file.

    Args:
        file_paths: Iterable of JSON file paths, read in order; the
            collected values appear in the output in the same order.
        output_path: Path of the JSON file to write.  It receives a JSON
            array, indented by four spaces, with non-ASCII text kept as is.

    Returns:
        None.  Files that are missing, are not valid UTF-8, or contain
        invalid JSON are reported and skipped.  Documents whose top level
        is not an object, or that have no ``info`` key, contribute
        nothing.  The output file is written even when nothing was
        collected.
    """
    merged_info = []
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
        except FileNotFoundError:
            print(f"JSON文件 {file_path} 未找到。")
            continue
        except json.JSONDecodeError:
            print(f"JSON文件 {file_path} 解析错误。")
            continue
        except UnicodeDecodeError:
            print(f"JSON文件 {file_path} 解码错误。")
            continue

        # A valid JSON document may be a list, string or number; only an
        # object can carry an 'info' key, and indexing anything else with
        # a string key would raise TypeError.
        if isinstance(json_data, dict) and 'info' in json_data:
            merged_info.append(json_data['info'])

    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(merged_info, file, ensure_ascii=False, indent=4)
def main(directory):
    """Walk *directory* and process every file according to its extension.

    ``.txt`` files have their words counted and the counts are summed
    across all text files.  ``.csv`` files are passed to ``process_csv``
    with ``processed_csv.csv`` inside *directory* as the output path.
    ``.json`` files are collected and passed together to
    ``merge_json_info``, which writes ``merged_json.json`` inside
    *directory*.  The combined word counts are printed at the end.

    Args:
        directory: Path of the directory to scan, including its
            sub-directories.

    Returns:
        None.  When *directory* is not an existing directory, a message
        is printed and nothing else happens.
    """
    if not os.path.isdir(directory):
        # os.walk would silently yield nothing, and writing the merged
        # JSON into a directory that does not exist would then fail.
        print(f"目录 {directory} 不存在。")
        return

    csv_output_path = os.path.join(directory, 'processed_csv.csv')
    json_output_path = os.path.join(directory, 'merged_json.json')
    # Both outputs live inside the tree being walked; skip them so that a
    # second run does not consume the results of the first one.
    generated_paths = {csv_output_path, json_output_path}

    txt_word_count = {}
    json_file_paths = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            if file_path in generated_paths:
                continue
            if file.endswith('.txt'):
                for word, count in count_words_in_txt(file_path).items():
                    txt_word_count[word] = txt_word_count.get(word, 0) + count
            elif file.endswith('.csv'):
                # NOTE(review): every CSV is written to the same output
                # path, so only the last one processed survives. Confirm
                # whether per-file outputs or one combined file is wanted.
                process_csv(file_path, csv_output_path)
            elif file.endswith('.json'):
                json_file_paths.append(file_path)

    merge_json_info(json_file_paths, json_output_path)
    print("文本文件单词统计结果:", txt_word_count)
if __name__ == "__main__":
    # Script entry point: ask for the directory to scan, then process it.
    # Surrounding whitespace is dropped so that a pasted path with a
    # trailing space or newline still resolves.
    directory = input("请输入目录路径: ").strip()
    main(directory)
代码说明:
`count_words_in_txt` 函数:用于统计单个 `.txt` 文件中每个单词出现的次数,并处理文件未找到和编码错误的异常。
`process_csv` 函数:从 `.csv` 文件中提取第一列数据,排序后写入新的 `.csv` 文件,同时处理文件未找到的异常。
`merge_json_info` 函数:从多个 `.json` 文件中提取 `info` 字段内容并合并到一个新的 `.json` 文件,处理文件未找到和 JSON 解析错误的异常。
`main` 函数:遍历指定目录,对不同类型文件调用相应处理函数,并统计所有 `.txt` 文件单词总次数。
`if __name__ == "__main__":` 程序入口,通过用户输入指定目录路径并执行主函数。
性能优化考虑:
- 文件读取:使用 `with open` 语句确保文件正确打开和关闭,提高资源管理效率。
- 内存使用:在处理大文件时,尽量避免一次性将整个文件读入内存。例如在处理 `.csv` 文件时,逐行读取数据而不是一次性读取整个文件(但为了排序,第一列的值仍需全部保存在内存中)。