机器学习实战之朴素贝叶斯进行文档分类(Python 代码版)

python

    贝叶斯是研究概率论的数学家,学术界还有一个以他命名的贝叶斯学派,看起来很厉害。关于贝叶斯方法网上有很多资料,想必读者基本都了解,我这里只简单概括一下:贝叶斯分类就是以先验概率为基础的一种分类方法,核心公式是条件概率(贝叶斯公式)。举个通俗的例子:根据以往的观察,鲤鱼中尾巴是红色的占 90%,鲫鱼中尾巴是红色的不到 1%。现在新来了一条小鱼,它是鲤鱼还是鲫鱼呢?我看了一下它的尾巴,发现是红色的;根据过去积累的经验概率,它是鲤鱼的可能性更大,所以我判断它是鲤鱼。

  这当然只是最简单的例子,实践中的问题要复杂得多。比如特征不止尾巴红不红,还有嘴巴大不大、鱼肥不肥、身子是长还是宽等等,而且往往不是靠一个特征就能分辨出来,需要综合分析。贝叶斯觉得逐一考虑特征之间的关系太麻烦,于是先假定各个特征相互独立(这就是“朴素”的含义)。如果一条鱼是红尾巴、大嘴巴、很肥、长身子,就这样计算在“它是鲤鱼”的条件下出现这些特征的概率:鲤鱼中红尾巴 0.9 × 鲤鱼中大嘴巴 0.3 × 鲤鱼中肥鱼 0.6 × 鲤鱼中长身子 0.4 = 0.27 × 0.24 = 0.0648。再乘以鲤鱼的先验概率,并与按同样方法算出的鲫鱼的结果比较,哪个大就判为哪一类。

  闲话少说,上代码分析。我的代码要分类的不再是鱼,而是文档。

  

from numpy import *

def loadDataSet():
    """Return a small hand-labelled corpus used to demo the classifier.

    Returns:
        tuple: ``(postingList, classVec)``. ``postingList`` is a list of six
        tokenized posts (each a list of words). ``classVec`` holds one label
        per post: 1 marks an abusive post, 0 a normal one. Posts 2, 4 and 6
        are the ones labelled 1.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        # NOTE(review): 'T' looks like a typo for 'I' -- confirm before
        # changing, since it alters the vocabulary.
        ['my', 'dalmation', 'is', 'so', 'cute', 'T', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec

def createVocabList(dataSet):
    """Build the vocabulary: every distinct word found in any document.

    Args:
        dataSet: iterable of documents, each an iterable of word tokens.

    Returns:
        list: the unique words across all documents. The order comes from a
        set, so it is arbitrary.
    """
    # union() with no arguments yields an empty set, so an empty dataSet
    # simply produces an empty vocabulary.
    vocabSet = set().union(*dataSet)
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of word tokens making up the document.

    Returns:
        list: one int per vocabulary word, 1 if that word occurs in
        ``inputSet`` (regardless of how many times), otherwise 0.

    Words missing from the vocabulary are reported on stdout and skipped.
    """
    returnVec = [0] * len(vocabList)
    # Map each word to its first position once, instead of scanning the
    # vocabulary list twice for every input word.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            print('the word: %s is not in my Vocabulary!' % word)
        else:
            returnVec[position] = 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from word vectors and class labels.

    Args:
        trainMatrix: sequence of documents, one row per document and one
            column per vocabulary word (e.g. the output of setOfWords2Vec).
        trainCategory: sequence of class labels, one per row; a label of 1
            selects class 1, anything else is counted as class 0.

    Returns:
        tuple: ``(p0Vect, p1Vect, pAbusive)`` where ``p0Vect`` / ``p1Vect``
        are arrays of log P(word | class) for class 0 / class 1 and
        ``pAbusive`` is the fraction of documents labelled 1.

    Raises:
        ValueError: if ``trainMatrix`` contains no documents.
    """
    numTrainDocs = len(trainMatrix)
    if numTrainDocs == 0:
        raise ValueError('trainMatrix must contain at least one document')
    # Vector length is the number of columns (words), not the number of rows.
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: start every word count at 1 (and the totals at 2)
    # so a word unseen in one class does not produce probability 0 / log(0).
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for wordVec, label in zip(trainMatrix, trainCategory):
        if label == 1:
            p1Num += wordVec
            p1Denom += sum(wordVec)
        else:
            p0Num += wordVec
            p0Denom += sum(wordVec)
    # Return log-probabilities: classifyNB adds them together with
    # log(pClass1), and summing logs avoids underflow from long products.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class whose score is higher for a word vector.

    Each class score is the sum of the element-wise product of
    ``vec2Classify`` with that class's vector, plus the log of the class
    prior (``pClass1`` for class 1, ``1.0 - pClass1`` for class 0).

    Returns:
        int: 1 when the class-1 score is strictly greater, otherwise 0
        (ties go to class 0).
    """
    scoreClass1 = sum(vec2Classify * p1Vec) + log(pClass1)
    scoreClass0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if scoreClass1 > scoreClass0 else 0
def testingNB():
    """Train on the loadDataSet corpus and print predictions for two posts.

    Builds the vocabulary and the presence-vector training matrix, fits the
    model with trainNB0, then classifies two hard-coded test posts and
    prints each post with its predicted class. Returns nothing.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))

    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as :', classifyNB(thisDoc, p0V, p1V, pAb))
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as per-word occurrence counts over the vocabulary.

    Position i of the result is the number of times ``vocabList[i]`` appears
    in ``inputSet``. Words that are not in the vocabulary are ignored
    silently.
    """
    counts = [0] * len(vocabList)
    for token in inputSet:
        try:
            slot = vocabList.index(token)
        except ValueError:
            # Token is not part of the vocabulary: nothing to count.
            continue
        counts[slot] += 1
    return counts
def textParse(bigString):
    """Split raw text into lowercase word tokens longer than two characters.

    Args:
        bigString: the text to tokenize.

    Returns:
        list: lowercased tokens, in order of appearance, keeping only those
        with more than two characters.
    """
    import re
    # Split on runs of one or more non-word characters. The pattern must not
    # match the empty string: on Python 3.7+ re.split also splits on empty
    # matches, which would break the text into single characters.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest(emailDir='E:/数据挖掘/MLiA_SourceCode/machinelearninginaction/Ch04/email'):
    """Run one random hold-out evaluation of the spam classifier.

    Reads ``spam/1.txt`` .. ``spam/25.txt`` (label 1) and ``ham/1.txt`` ..
    ``ham/25.txt`` (label 0) under ``emailDir``, holds out 10 randomly chosen
    documents for testing, trains on the remaining ones and prints the error
    rate on the held-out documents.

    Args:
        emailDir: directory containing the ``spam`` and ``ham`` folders.
            Defaults to the original hard-coded location.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Same order as before: the spam message first, then the ham one.
        for folder, label in (('spam', 1), ('ham', 0)):
            # The with-block closes each file handle as soon as it is read.
            with open('%s/%s/%d.txt' % (emailDir, folder, i)) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    vocabList = createVocabList(docList)

    # Move 10 random document indices from the training pool to the test set.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [setOfWords2Vec(vocabList, docList[docIndex]) for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is :', float(errorCount) / len(testSet))
       

 

以上是 机器学习实战之朴素贝叶斯进行文档分类(Python 代码版) 的全部内容, 来源链接: utcz.com/z/386523.html

回到顶部