Python实现简单的英文词频统计

python

  1 __author__ = 'Oscar_Yang'

2 # -*- coding= utf-8 -*-

3 #copyRight by OSCAR

4 """

5 本脚本实现,合并几个英文文本,并且统计词频。

6 脚本定义了几个函数:

7 1、文件读取函数readFile(读取文件,输出每个文件的词频);

8 2、元素为词频的字典的合并函数,并且实现相同词的词频相加,返回全部词频;

9 3、调试部分,利用了高阶函数:map,reduce;

10 4、最后实现格式化输出,输出结果如图片所示。

11 """

12 import functools

# Read one text file and return a dictionary of word frequencies.
def readFile(file_name):
    """Count how often each word occurs in a UTF-8 text file.

    The file is split on whitespace. Each token has the punctuation marks
    ``, . ! ? ; "`` trimmed from both ends and is lower-cased before it is
    counted. Tokens that consist only of those punctuation marks (for
    example ``...`` or a lone ``!``) are skipped.

    Args:
        file_name: Path of the text file to read; opened with UTF-8 encoding.

    Returns:
        A dict mapping each lower-cased word to its number of occurrences,
        in order of first appearance.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Characters trimmed from the start and end of every token.
    punctuation = ',.!?;"'

    tf = {}
    with open(file_name, 'r', encoding="utf-8") as f:
        for line in f:
            for token in line.split():
                # str.strip removes every leading/trailing character found in
                # `punctuation`, which is what the character-by-character
                # trimming loops did, without indexing into an empty string.
                word = token.strip(punctuation).lower()
                if not word:
                    # The token was nothing but punctuation; it is not a word.
                    continue
                tf[word] = tf.get(word, 0) + 1
    return tf

58

def get_counts(words):
    """Build a word-frequency dictionary from an iterable of words.

    Each word is lower-cased and has all whitespace removed before it is
    counted, so ``"A"`` and ``"a"`` share one entry, as do ``"b c"`` and
    ``"bc"``.

    Args:
        words: Iterable of strings.

    Returns:
        A dict mapping each normalised word to its number of occurrences.
    """
    tf = {}
    for word in words:
        # split() + join() drops every whitespace character inside the word.
        word = ''.join(word.lower().split())
        tf[word] = tf.get(word, 0) + 1
    # Return the tally; without this the function always produced None.
    return tf

69

70

# Dictionary merge, approach 1: fold dic1 into dic2 in place.
def merge1(dic1, dic2):
    """Add the entries of ``dic1`` into ``dic2`` and return ``dic2``.

    Keys present in both dictionaries have their values added together;
    keys only in ``dic1`` are copied over. ``dic2`` is modified in place and
    is the object returned; ``dic1`` is left untouched.
    """
    for key, count in dic1.items():
        if key not in dic2:
            # First time this key is seen in the target: copy the value.
            dic2[key] = count
            continue
        dic2[key] += count
    return dic2

80

# Dictionary merge, approach 2: let collections.Counter do the addition.
def merge2(dic1, dic2):
    """Return a new ``Counter`` holding the summed counts of both mappings.

    Neither argument is modified. Because the result comes from ``Counter``
    addition, entries whose summed count is zero or negative are left out.
    """
    from collections import Counter

    left = Counter(dic1)
    right = Counter(dic2)
    return left + right

86

# Get the n most frequent words together with their counts.
def top_counts(word_list, n=10):
    """Return the ``n`` highest-count entries as ``(count, word)`` tuples.

    Args:
        word_list: Mapping of word -> count.
        n: Number of entries to keep (default 10).

    Returns:
        A list of ``(count, word)`` tuples sorted in descending order; ties
        on count are ordered by word, also descending.
    """
    ranked = []
    for token, freq in word_list.items():
        # Put the count first so tuple comparison sorts by frequency.
        ranked.append((freq, token))
    ranked.sort(reverse=True)
    return ranked[:n]

92

# Test section: merge the word counts of several articles and print the top 10.
if __name__ == '__main__':
    file_list = [r'E:\graduate\Python\python那些事\articles\article_000.txt',
                 r'E:\graduate\Python\python那些事\articles\article_001.txt',
                 r'E:\graduate\Python\python那些事\articles\article_002.txt',
                 r'E:\graduate\Python\python那些事\articles\article_003.txt',
                 r'E:\graduate\Python\python那些事\articles\article_004.txt',
                 r'E:\graduate\Python\python那些事\articles\article_005.txt']

    # map() yields one frequency dict per file; reduce() folds them pairwise
    # with merge2 into a single overall tally.
    per_file_counts = map(readFile, file_list)
    word_list = functools.reduce(merge2, per_file_counts)

    # Store the result under its own name so the top_counts function is not
    # rebound to a list and stays callable afterwards.
    ranking = top_counts(word_list)

    print("最常用的单词排行榜:")
    # top_counts already limits the result to 10 entries by default.
    for count, word in ranking:
        print("{0:10}{1}".format(word, count))

2016-10-15

运行结果:

以上是 python实现简单的英文词频统计 的全部内容, 来源链接: utcz.com/z/387839.html

回到顶部