朴素贝叶斯分类器-垃圾邮件过滤
Posted xuxiaowen1990
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了朴素贝叶斯分类器-垃圾邮件过滤相关的知识,希望对你有一定的参考价值。
# -*- coding:utf-8 -*-
import re
import random

import numpy as np


def textParse(bigString):
    """Tokenize raw text into lowercase words.

    Only runs of three or more word characters (letters, digits,
    underscore) are kept; shorter runs are dropped.

    :param bigString: raw document text
    :return: list of lowercase tokens
    """
    return [token.lower() for token in re.findall(r"\w{3,}", bigString)]


def createVocabList(dataSet):
    """Build the vocabulary of all distinct tokens in a corpus.

    :param dataSet: iterable of documents, each an iterable of tokens
    :return: list of unique tokens, sorted so the word-to-column mapping
        is the same on every run
    """
    vocabSet = set()
    for document in dataSet:
        vocabSet.update(document)
    return sorted(vocabSet)


def setOfWords2Vec(vocalList, inputSet):
    """Convert a token list into a binary set-of-words vector.

    Position i of the result is 1 when vocalList[i] occurs in inputSet and
    0 otherwise. Tokens missing from the vocabulary are reported on stdout
    and otherwise ignored.

    :param vocalList: vocabulary list
    :param inputSet: tokens of one document
    :return: list of 0/1 ints with the same length as vocalList
    """
    # One dict lookup per token instead of a linear list.index() scan.
    # setdefault keeps the first position of a repeated vocabulary entry,
    # which is what list.index() would have returned.
    positions = {}
    for idx, word in enumerate(vocalList):
        positions.setdefault(word, idx)

    returnVec = [0] * len(vocalList)
    for word in inputSet:
        idx = positions.get(word)
        if idx is None:
            print("the word:%s is not in my Vocabulary!" % word)
        else:
            returnVec[idx] = 1
    return returnVec


def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from binary document vectors.

    Counts start at 1 and denominators at 2.0 so that no conditional
    probability is ever exactly zero.

    :param trainMatrix: 2-D array-like, one set-of-words vector per row
    :param trainCategory: 1-D array-like of labels; 1 marks the positive
        (spam) class, any other value is treated as class 0
    :return: tuple (p0Vec, p1Vec, pAbusive) where
        p0Vec is [p(w0|0), p(w1|0), ..., p(wn|0)],
        p1Vec is [p(w0|1), p(w1|1), ..., p(wn|1)],
        pAbusive is sum(labels) / number of documents
    """
    matrix = np.asarray(trainMatrix, dtype=float)
    labels = np.asarray(trainCategory)
    numTrainDocs = matrix.shape[0]
    pAbusive = np.sum(labels) / float(numTrainDocs)

    isClass1 = labels == 1
    class1Docs = matrix[isClass1]
    class0Docs = matrix[~isClass1]

    # Accumulate per-word occurrence counts from the document vectors
    # themselves (not from the labels), on top of the smoothing prior.
    p1Vec = (1.0 + class1Docs.sum(axis=0)) / (2.0 + class1Docs.sum())
    p0Vec = (1.0 + class0Docs.sum(axis=0)) / (2.0 + class0Docs.sum())
    return p0Vec, p1Vec, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one document vector by comparing class log-posteriors.

    The per-word conditional probabilities are converted to logs before
    being summed, so they combine consistently with the log prior and do
    not underflow for long documents.

    :param vec2Classify: document vector to classify
    :param p0Vec: conditional probabilities p(w|0), all strictly positive
    :param p1Vec: conditional probabilities p(w|1), all strictly positive
    :param pClass1: prior probability of class 1
    :return: 1 if the class-1 score is strictly larger, otherwise 0
    """
    vec = np.asarray(vec2Classify, dtype=float)
    p1 = np.sum(vec * np.log(p1Vec)) + np.log(pClass1)
    p0 = np.sum(vec * np.log(p0Vec)) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0


def _readEmailTokens(path):
    """Read one email file and return its tokens; the file is always closed."""
    with open(path, 'r', encoding='ISO-8859-1') as handle:
        return textParse(handle.read())


def spamTest():
    """Run one random hold-out evaluation of the spam classifier.

    Loads email/spam/1..25.txt (label 1) and email/ham/1..25.txt (label 0),
    holds out 10 randomly chosen documents for testing, trains on the
    remaining ones, and prints each misclassified document followed by the
    error rate.

    :return: None
    """
    docList = []
    classList = []
    for i in range(1, 26):
        docList.append(_readEmailTokens('email/spam/%d.txt' % i))
        classList.append(1)
        docList.append(_readEmailTokens('email/ham/%d.txt' % i))
        classList.append(0)

    vocalList = createVocabList(docList)

    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # randrange never returns the upper bound, so the index is always valid.
        randIndex = random.randrange(len(trainingSet))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [setOfWords2Vec(vocalList, docList[docIndex])
                for docIndex in trainingSet]
    trainingClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainingClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocalList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("分类错误的测试集:", docList[docIndex])
    print("错误率:%.2f%%" % (float(errorCount) / len(testSet) * 100))


if __name__ == '__main__':
    spamTest()
参考:《机器学习实战》和博客 https://blog.csdn.net/c406495762/article/details/77500679
以上是关于朴素贝叶斯分类器-垃圾邮件过滤的主要内容,如果未能解决你的问题,请参考以下文章