Python 的多项朴素贝叶斯从零开始
Posted
技术标签:
【中文标题】Python 的多项朴素贝叶斯从零开始【英文标题】:Multinomial Naive Bayes for Python from scratch 【发布时间】:2020-07-13 03:26:56 【问题描述】:如果没有 sklearn MultinomialNB 库,我无法从头开始找到并求解多项朴素贝叶斯。但是,当我将 MultinomialNB 分类器拟合到训练集时。但是有一点问题。问题来了
enter image description here
我制作词袋模型并拆分训练集和测试集。
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split #it was "sklearn.cross_validation" but now it changed
X = corpus
y = dataset.id_sentimen
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.30, random_state=0)
vect = TfidfVectorizer(analyzer='word', ngram_range=(1,2))
X_train = vect.fit_transform(X_train).toarray()
X_test = vect.transform(X_test)
这是来自 sklearn 的没有库的 MultinomialNB 代码
class MultinomialNB():
def fit(self, X_train, y_train, ls=0.01):
self.ls = ls
self.y_classes, y_counts = np.unique(y_train, return_counts=True)
self.x_classes = [np.unique(x) for x in X.T]
self.phi_y = 1.0 * y_counts/y_counts.sum()
self.phi_x = self.mean_X(X_train, y_train)
self.c_x = self.count_x(X_train, y_train)
return self
def mean_X(self, X_train, y_train):
return [[self.ls_mean_x(X_train, y_train, k, j) for j in range(len(self.x_classes))] for k in self.y_classes]
def ls_mean_x(self, X_train, y_train, k, j):
x_data = (X_train[:,j][y==k].reshape(-1,1) == self.x_classes[j])
return (x_data.sum(axis=0) + self.ls ) / (len(x_data) + (len(self.x_classes) * self.ls))
def get_mean_x(self, y_train, j):
return 1 + self.ls / (self.c_x[y][j] + (len(self.x_classes) * self.ls))
def count_x(self, X_train, y_train):
return [[len(X[:,j][y==k].reshape(-1,1) == self.x_classes[j])
for j in range(len(self.x_classes))]
for k in self.y_classes]
def predict(self, X_train):
return np.apply_along_axis(lambda x: self.compute_probs(x), 1, X_train)
def compute_probs(self, x):
probs = np.array([self.compute_prob(x, y) for y in range(len(self.y_classes))])
return self.y_classes[np.argmax(probs)]
def compute_prob(self, x, y):
Pxy = 1
for j in range(len(x)):
x_clas = self.x_classes[j]
if x[j] in x_clas:
i = list(x_clas).index(x[j])
p_x_j_y = self.phi_x[y][j][i] # p(xj|y)
Pxy *= p_x_j_y
else:
Pxy *= get_mean_x(y, j)
return Pxy * self.phi_y[y]
def evaluate(self, X_train, y_train):
return (self.predict(X_train) == y_train).mean()
我想让 MultinomialNB 适合我的训练集
# Fitting MultinomialNB Classifier to the training set
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
# Predicting test results
y_pred = classifier.predict(X_test)
ytest = np.array(y_test)
# f1_score(ytest, y_pred, average='weighted')
print(classification_report(ytest, y_pred))
print(confusion_matrix(ytest, y_pred))
谁能帮助解决代码和错误?
【问题讨论】:
欢迎来到 SO;在所有这些代码中 exactly 哪里会出现错误?请编辑您的帖子以包含完整的错误跟踪。 【参考方案1】:我从头开始构建自己的..也许可以帮助您构建自己的
将 numpy 导入为 np
类多项式NB:
def __init__(self, alpha=1):
self.alpha = alpha
def fit(self, X_train, y_train):
m, n = X_train.shape
self._classes = np.unique(y_train)
n_classes = len(self._classes)
# init: Prior & Likelihood
self._priors = np.zeros(n_classes)
self._likelihoods = np.zeros((n_classes, n))
# Get Prior and Likelihood
for idx, c in enumerate(self._classes):
X_train_c = X_train[c == y_train]
self._priors[idx] = X_train_c.shape[0] / m
self._likelihoods[idx, :] = ((X_train_c.sum(axis=0)) + self.alpha) / (np.sum(X_train_c.sum(axis=0) + self.alpha))
def predict(self, X_test):
return [self._predict(x_test) for x_test in X_test]
def _predict(self, x_test):
# Calculate posterior for each class
posteriors = []
for idx, c in enumerate(self._classes):
prior_c = np.log(self._priors[idx])
likelihoods_c = self.calc_likelihood(self._likelihoods[idx,:], x_test)
posteriors_c = np.sum(likelihoods_c) + prior_c
posteriors.append(posteriors_c)
return self._classes[np.argmax(posteriors)]
def calc_likelihood(self, cls_likeli, x_test):
return np.log(cls_likeli) * x_test
def score(self, X_test, y_test):
y_pred = self.predict(X_test)
return np.sum(y_pred == y_test)/len(y_test)
【讨论】:
您的代码期望的输入数据的形状是什么?以上是关于Python 的多项朴素贝叶斯从零开始的主要内容,如果未能解决你的问题,请参考以下文章
朴素贝叶斯:朴素贝叶斯定义朴素贝叶斯公式分解朴素贝叶斯分类流程高斯型朴素贝叶斯多项式朴素贝叶斯伯努利型朴素贝叶斯朴素贝叶斯预测概率校准朴素贝叶斯优缺点