#朴素:考虑每个特征或者词,出项的可能性与它和其他单词相邻没有关系 #每个特征等权重 from numpy import * def loadDataSet(): postingList=[[‘my‘, ‘dog‘, ‘has‘, ‘flea‘, ‘problems‘, ‘help‘, ‘please‘], [‘maybe‘, ‘not‘, ‘take‘, ‘him‘, ‘to‘, ‘dog‘, ‘park‘, ‘stupid‘], [‘my‘, ‘dalmation‘, ‘is‘, ‘so‘, ‘cute‘, ‘I‘, ‘love‘, ‘him‘], [‘stop‘, ‘posting‘, ‘stupid‘, ‘worthless‘, ‘garbage‘], [‘mr‘, ‘licks‘, ‘ate‘, ‘my‘, ‘steak‘, ‘how‘, ‘to‘, ‘stop‘, ‘him‘], [‘quit‘, ‘buying‘, ‘worthless‘, ‘dog‘, ‘food‘, ‘stupid‘]] classVec = [0,1,0,1,0,1] #1 is abusive, 0 not return postingList,classVec #创建一个单词的集合 def createVocabList(dataSet): vocabSet = set([]) #创建空集合 for document in dataSet: vocabSet |= set(document) return list(vocabSet) #判断文档出现在词汇表中 def setOfWordsVec(vocabSet,inputSet): returnVec = [0]*len(vocabSet) for word in inputSet: if word in vocabSet: returnVec[vocabSet.index(word)] = 1 else: print ("the word: %s is not in the Vocabulary!" % word) return returnVec def main(): listOPosts,listClasses = loadDataSet() myVocabList = createVocabList(listOPosts) print (myVocabList) print(setOfWordsVec(myVocabList, listOPosts[0])) main()