Python NLTK 情绪计算不正确
Posted
技术标签:
【中文标题】Python NLTK 情绪计算不正确【英文标题】:Python NLTK not sentiment calculate correct 【发布时间】:2013-11-06 12:02:32 【问题描述】:我有一些正面和负面的句子。我想用 Python NLTK 非常简单地训练一个 NaiveBayesClassifier,用来判断其他句子的情绪。
我尝试使用了下面这篇文章中的代码,但我得到的结果始终是 positive(正面)。 http://www.sjwhitworth.com/sentiment-analysis-in-python-using-nltk/
我是 Python 新手,所以可能是我在复制代码时出了错。
import nltk
import math
import re
import sys
import os
import codecs
reload(sys)
sys.setdefaultencoding('utf-8')
from nltk.corpus import stopwords
# Directory used to locate the tweet files: the script's own directory,
# resolved against the current working directory.
__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))
postweet = __location__ + "/postweet.txt"
negtweet = __location__ + "/negtweet.txt"
customstopwords = ['band', 'they', 'them']

#Load positive tweets into a list
# NOTE: p and n are left open on purpose; the script closes them at its end.
p = open(postweet, 'r')
postxt = p.readlines()

#Load negative tweets into a list
n = open(negtweet, 'r')
negtxt = n.readlines()

#Create one 'negative' / 'positive' label per line of the matching tweet list.
neglist = ['negative'] * len(negtxt)
poslist = ['positive'] * len(postxt)

#Creates a list of tuples, with sentiment tagged.
# list() keeps the concatenation below valid even where zip() is lazy.
postagged = list(zip(postxt, poslist))
negtagged = list(zip(negtxt, neglist))

#Combines all of the tagged tweets to one large list.
taggedtweets = postagged + negtagged

#Create a list of lower-cased words in the tweet, within a tuple.
tweets = [
    ([token.lower() for token in text.split()], label)
    for (text, label) in taggedtweets
]
#Pull out all of the words in a list of tagged tweets, formatted in tuples.
def getwords(tweets):
    """Flatten tagged tweets into a single list of words.

    ``tweets`` is an iterable of ``(words, sentiment)`` pairs; the
    sentiment is ignored. Words are returned in input order and
    duplicates are kept.
    """
    return [token for (words, _sentiment) in tweets for token in words]
#Order a list of tweets by their frequency.
def getwordfeatures(listoftweets):
    """Return the distinct words of ``listoftweets`` via ``nltk.FreqDist``.

    ``listoftweets`` is a flat list of words (the script passes the
    result of ``getwords``). The return value is whatever
    ``FreqDist.keys()`` yields.

    NOTE(review): the surrounding comments expect the keys ordered by
    frequency; whether ``keys()`` guarantees that depends on the
    installed NLTK version - confirm.
    """
    #Print out wordfreq if you want to have a look at the individual counts of words.
    wordfreq = nltk.FreqDist(listoftweets)
    words = wordfreq.keys()
    return words
#Calls above functions - gives us list of the words in the tweets, ordered by freq.
# The vocabulary is only printed here; it is not stored in a variable.
print getwordfeatures(getwords(tweets))
# NOTE(review): wordlist starts out as an empty list, so the two stop word
# filters below iterate over nothing and wordlist stays empty.
# feature_extractor loops over wordlist to build its features.
wordlist = []
# Drop NLTK's English stop words, then the script's custom stop words.
wordlist = [i for i in wordlist if not i in stopwords.words('english')]
wordlist = [i for i in wordlist if not i in customstopwords]
def feature_extractor(doc):
    """Map a tokenised document to boolean ``contains(word)`` features.

    ``doc`` is an iterable of words. For every word in the module-level
    ``wordlist`` the returned dict has a key ``'contains(<word>)'`` whose
    value says whether that word occurs in ``doc``.
    """
    # Set membership avoids rescanning the document for every word.
    docwords = set(doc)
    features = {}
    for candidate in wordlist:
        features['contains(%s)' % candidate] = (candidate in docwords)
    return features
#Creates a training set - classifier learns distribution of true/falses in the input.
training_set = nltk.classify.apply_features(feature_extractor, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)
# The result of show_most_informative_features is passed to print, exactly
# as in the original post.
print(classifier.show_most_informative_features(n=30))

# Interactive loop: 'exit' quits, 'informfeatures' shows the feature table
# again, anything else is classified as a sentence.
try:
    while True:
        line = raw_input('ads')
        if line == 'exit':
            break
        elif line == 'informfeatures':
            print(classifier.show_most_informative_features(n=30))
            continue
        else:
            # Tokenise the same way as the training tweets: lower-case, split.
            tokens = line.lower().split()
            print('\nWe think that the sentiment was ' + classifier.classify(feature_extractor(tokens)) + ' in that sentence.\n')
finally:
    # Close the tweet files even if the loop ends with an exception.
    p.close()
    n.close()
这只是代码错误吗?或者是什么问题。
程序启动时,它应该执行 print classifier.show_most_informative_features(n=30) 并打印出结果
但我得到的结果是 Most Informative Features
None
不知道这能否提供一些线索。
谢谢
【问题讨论】:
【参考方案1】:致所有对使用 NLTK 进行情感分析感兴趣的人。这是完整的工作代码。感谢@NLPer
import nltk
import math
import re
import sys
import os
import codecs
reload(sys)
sys.setdefaultencoding('utf-8')
from nltk.corpus import stopwords
# Directory used to locate the tweet files: the script's own directory,
# resolved against the current working directory.
__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))
postweet = __location__ + "/postweet.txt"
negtweet = __location__ + "/negtweet.txt"
customstopwords = ['band', 'they', 'them']

#Load positive tweets into a list
# NOTE: p and n are left open on purpose; the script closes them at its end.
p = open(postweet, 'r')
postxt = p.readlines()

#Load negative tweets into a list
n = open(negtweet, 'r')
negtxt = n.readlines()

#Create one 'negative' / 'positive' label per line of the matching tweet list.
neglist = ['negative'] * len(negtxt)
poslist = ['positive'] * len(postxt)

#Creates a list of tuples, with sentiment tagged.
# list() keeps the concatenation below valid even where zip() is lazy.
postagged = list(zip(postxt, poslist))
negtagged = list(zip(negtxt, neglist))

#Combines all of the tagged tweets to one large list.
taggedtweets = postagged + negtagged

#Create a list of lower-cased words in the tweet, within a tuple.
tweets = [
    ([token.lower() for token in text.split()], label)
    for (text, label) in taggedtweets
]
#Pull out all of the words in a list of tagged tweets, formatted in tuples.
def getwords(tweets):
    """Flatten tagged tweets into a single list of words.

    ``tweets`` is an iterable of ``(words, sentiment)`` pairs; the
    sentiment is ignored. Words are returned in input order and
    duplicates are kept.
    """
    return [token for (words, _sentiment) in tweets for token in words]
#Order a list of tweets by their frequency.
def getwordfeatures(listoftweets):
    """Return the distinct words of ``listoftweets`` via ``nltk.FreqDist``.

    ``listoftweets`` is a flat list of words (the script passes the
    result of ``getwords``). The return value is whatever
    ``FreqDist.keys()`` yields.

    NOTE(review): the surrounding comments expect the keys ordered by
    frequency; whether ``keys()`` guarantees that depends on the
    installed NLTK version - confirm.
    """
    #Print out wordfreq if you want to have a look at the individual counts of words.
    wordfreq = nltk.FreqDist(listoftweets)
    words = wordfreq.keys()
    return words
#Calls above functions - gives us list of the words in the tweets, ordered by freq.
# Compute the vocabulary once and reuse it for both the printout and the filters.
wordlist = getwordfeatures(getwords(tweets))
print(wordlist)

# Load the English stop word list once rather than once per candidate word;
# a set makes each membership test cheap.
english_stopwords = set(stopwords.words('english'))
wordlist = [i for i in wordlist if i not in english_stopwords]
wordlist = [i for i in wordlist if i not in customstopwords]
def feature_extractor(doc):
    """Map a tokenised document to boolean ``contains(word)`` features.

    ``doc`` is an iterable of words. For every word in the module-level
    ``wordlist`` the returned dict has a key ``'contains(<word>)'`` whose
    value says whether that word occurs in ``doc``.
    """
    # Set membership avoids rescanning the document for every word.
    docwords = set(doc)
    features = {}
    for candidate in wordlist:
        features['contains(%s)' % candidate] = (candidate in docwords)
    return features
#Creates a training set - classifier learns distribution of true/falses in the input.
training_set = nltk.classify.apply_features(feature_extractor, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)
# show_most_informative_features writes the table itself; wrapping it in
# print only appended a stray "None" line, so it is called directly.
classifier.show_most_informative_features(n=30)

# Interactive loop: 'exit' quits, 'informfeatures' shows the feature table
# again, anything else is classified as a sentence.
try:
    while True:
        line = raw_input('ads')
        if line == 'exit':
            break
        elif line == 'informfeatures':
            classifier.show_most_informative_features(n=30)
            continue
        else:
            # Tokenise the same way as the training tweets: lower-case, split.
            tokens = line.lower().split()
            print('\nWe think that the sentiment was ' + classifier.classify(feature_extractor(tokens)) + ' in that sentence.\n')
finally:
    # Close the tweet files even if the loop ends with an exception.
    p.close()
    n.close()
【讨论】:
【参考方案2】:wordlist 为空。应该把 getwordfeatures(getwords(tweets)) 的结果赋值给它。
以下两行:
wordlist = [i for i in wordlist if not i in stopwords.words('english')]
和
wordlist = [i for i in wordlist if not i in customstopwords]
是二选一的关系;您可以分别试一试,看哪个停用词列表效果更好。
【讨论】:
以上是关于“Python NLTK 情绪计算不正确”的主要内容,如果未能解决你的问题,请参考以下文章
Python NLTK:SyntaxError:文件中的非 ASCII 字符“\xc3”(情绪分析-NLP)
Python NLTK pos_tag 未返回正确的词性标记