python实现简单的英文词频统计
1 __author__ = 'Oscar_Yang'2 # -*- coding= utf-8 -*-
3 #copyRight by OSCAR
4 """
5 本脚本实现,合并几个英文文本,并且统计词频。
6 脚本定义了几个函数:
7 1、文件读取函数readFile(读取文件,输出每个文件的词频);
8 2、元素为词频的字典的合并函数,并且实现相同词的词频相加,返回全部词频;
9 3、调试部分,利用了高阶函数:map,reduce;
10 4、最后实现格式化输出,输出结果如图片所示。
11 """
12 import functools
def readFile(file_name):
    """Count how often each word occurs in a UTF-8 text file.

    The file is split on whitespace; each token has the punctuation marks
    ``, . ! ? ; "`` removed from both ends and is lower-cased before counting.

    Args:
        file_name: Path of the text file to read (opened as UTF-8).

    Returns:
        A dict mapping each normalised word to its number of occurrences,
        in order of first appearance.
    """
    # Same character set the original stripped one character at a time.
    punctuation = ',.!?;"'
    tf = {}
    with open(file_name, 'r', encoding="utf-8") as f:
        # Iterate the file directly instead of materialising every line.
        for line in f:
            for token in line.split():
                word = token.strip(punctuation).lower()
                if not word:
                    # Token consisted only of punctuation (e.g. "...").
                    # The previous implementation indexed word[0] here and
                    # raised IndexError; such tokens are simply not words.
                    continue
                tf[word] = tf.get(word, 0) + 1
    return tf
58
def get_counts(words):
    """Count occurrences of each word in an iterable of strings.

    Each word is lower-cased and has all whitespace removed before it is
    counted, so ``"New York"`` and ``"newyork"`` share one entry.

    Args:
        words: Iterable of strings.

    Returns:
        A dict mapping each normalised word to its count. (The previous
        version built this dict but never returned it.)
    """
    tf = {}
    for word in words:
        # Normalise: lower-case, then drop any embedded whitespace.
        key = ''.join(word.lower().split())
        tf[key] = tf.get(key, 0) + 1
    return tf
69
70
def merge1(dic1, dic2):
    """Merge two frequency dicts, variant 1 (in place).

    Every entry of ``dic1`` is folded into ``dic2``: values of keys present
    in both are added together, keys missing from ``dic2`` are copied over.

    Args:
        dic1: Source dict; it is only read.
        dic2: Target dict; it is modified in place.

    Returns:
        The same ``dic2`` object, now holding the combined counts.
    """
    for key in dic1:
        amount = dic1[key]
        if key not in dic2:
            # First time this key is seen in the target: just copy it.
            dic2[key] = amount
            continue
        dic2[key] += amount
    return dic2
80
def merge2(dic1, dic2):
    """Merge two frequency dicts, variant 2 (via ``collections.Counter``).

    Neither argument is modified. Counter addition keeps only entries whose
    summed count is positive.

    Args:
        dic1: First mapping of word -> count.
        dic2: Second mapping of word -> count.

    Returns:
        A new ``Counter`` holding the per-key sums.
    """
    import collections

    combined = collections.Counter(dic1)
    return combined + collections.Counter(dic2)
86
def top_counts(word_list, n=10):
    """Return the ``n`` most frequent words together with their counts.

    Args:
        word_list: Mapping of word -> count (a dict despite the name).
        n: How many entries to return; defaults to 10.

    Returns:
        A list of ``(count, word)`` tuples sorted in descending order.
        Because whole tuples are compared, equal counts are ordered by
        word, also descending.
    """
    ranked = [(freq, term) for term, freq in word_list.items()]
    ranked.sort(reverse=True)
    return ranked[:n]
92
# Demo / manual test: merge the word counts of several articles and print
# the ten most frequent words.
if __name__ == '__main__':
    file_list = [r'E:\graduate\Python\python那些事\articles\article_000.txt',
                 r'E:\graduate\Python\python那些事\articles\article_001.txt',
                 r'E:\graduate\Python\python那些事\articles\article_002.txt',
                 r'E:\graduate\Python\python那些事\articles\article_003.txt',
                 r'E:\graduate\Python\python那些事\articles\article_004.txt',
                 r'E:\graduate\Python\python那些事\articles\article_005.txt']

    # One frequency dict per file (lazy map), folded into a single total.
    per_file_counts = map(readFile, file_list)
    word_list = functools.reduce(merge2, per_file_counts)

    # Store the result under its own name. The original assigned it to
    # ``top_counts``, which replaced the function with a list and made any
    # later call to top_counts() fail.
    ranking = top_counts(word_list)

    print("最常用的单词排行榜:")
    # top_counts already limits the result to 10 entries by default, so no
    # extra slice is needed. Each entry is a (count, word) tuple.
    for count, word in ranking:
        print("{0:10}{1}".format(word, count))
2016-10-15
运行结果:
以上是 python实现简单的英文词频统计 的全部内容, 来源链接: utcz.com/z/387839.html