从零开始的朴素贝叶斯分类器
Posted
技术标签:
【中文标题】从零开始的朴素贝叶斯分类器【英文标题】:Naive Bayes Classifier from scratch 【发布时间】:2021-11-12 20:52:42 【问题描述】:最近发现 GaussianNaiveBayes 分类器的代码如下。
import numpy as np
class GaussianNaiveBayes:
    """Gaussian naive Bayes classifier, as posted in the question.

    NOTE(review): the indentation of this listing was lost. It is restored
    here following the discussion of the question: ``fit`` belongs to the
    class while ``predict`` does not, which is why ``nb.predict(...)``
    raises ``AttributeError``. Placing ``_calculate_likelihood`` and
    ``_classify_sample`` at module level as well is an assumption.
    """

    def fit(self, X, y):
        """Estimate the per-class mean, variance and prior from the data.

        Args:
            X: 2-D array of samples; ``X.shape`` is read as
                ``(n_samples, n_features)``.
            y: Class label of every row of ``X``.
        """
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        # calculating the mean, variance and prior P(H) for each class
        for i, c in enumerate(self._classes):
            X_for_class_c = X[y == c]
            self._mean[i, :] = X_for_class_c.mean(axis=0)
            self._var[i, :] = X_for_class_c.var(axis=0)
            self._priors[i] = X_for_class_c.shape[0] / float(n_samples)


# Everything below is defined at module level, NOT on GaussianNaiveBayes.

# function for calculating the likelihood, P(E|H), of data X given the mean
# and variance
def _calculate_likelihood(self, class_idx, x):
    """Return the per-feature Gaussian density of ``x`` for one class."""
    mean = self._mean[class_idx]
    var = self._var[class_idx]
    num = np.exp(-(x - mean) ** 2 / (2 * var))
    denom = np.sqrt(2 * np.pi * var)
    return num / denom


# classifications by calculating the posterior probability, P(H|E), of the
# classes
def predict(self, X):
    """Return the predicted class of every sample in ``X`` as an array."""
    y_pred = [self._classify_sample(x) for x in X]
    return np.array(y_pred)


def _classify_sample(self, x):
    """Return the class with the highest log-posterior for sample ``x``."""
    posteriors = []

    # calculating posterior probability for each class
    for i, c in enumerate(self._classes):
        prior = np.log(self._priors[i])
        posterior = np.sum(np.log(self._calculate_likelihood(i, x)))
        posterior = prior + posterior
        posteriors.append(posterior)

    # return the class with highest posterior probability
    return self._classes[np.argmax(posteriors)]
我用下面的代码在 Iris 数据集上测试上述分类器,但收到错误“AttributeError: 'GaussianNaiveBayes' object has no attribute 'predict'”:
# Load Iris and wrap the features and labels in pandas DataFrames.
iris = datasets.load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.DataFrame(data=iris.target, columns=["Target"])

# Keep 20% of the rows for testing; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the classifier on the training split, then predict the test split.
nb = GaussianNaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)
请帮我指出错在哪里,谢谢。
【问题讨论】:
`predict` 方法没有缩进,所以它不属于 `GaussianNaiveBayes` 类(而 `fit` 属于)。
【参考方案1】:
您需要正确缩进代码;另外,当 `y` 是 DataFrame 时,下面这行对 `X` 取子集的代码将无法正常工作:
X_for_class_c = X[y==c]
同样,当 `X` 是 DataFrame 时,下面这行也无法正常工作(直接遍历 DataFrame 得到的是列名,而不是逐行的样本):
y_pred = [self._classify_sample(x) for x in X]
所以让我们正确缩进:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
class GaussianNaiveBayes:
    """Gaussian naive Bayes classifier implemented with NumPy.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated in ``fit``.

    Pass NumPy arrays. When ``y`` is a pandas DataFrame the mask
    ``X[y == c]`` does not select rows and the fitted statistics come out
    as NaN.
    """

    def fit(self, X, y):
        """Estimate the per-class mean, variance and prior from the data.

        Args:
            X: 2-D array of samples; ``X.shape`` is read as
                ``(n_samples, n_features)``.
            y: 1-D array with the class label of every row of ``X``.
        """
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        for i, c in enumerate(self._classes):
            X_for_class_c = X[y == c]
            self._mean[i, :] = X_for_class_c.mean(axis=0)
            self._var[i, :] = X_for_class_c.var(axis=0)
            self._priors[i] = X_for_class_c.shape[0] / float(n_samples)

        # A feature that is constant within a class has variance 0, which
        # would turn the density into 0/0. Floor the variances at a tiny
        # fraction of the largest one (or at 1e-9 when none is positive).
        # np.maximum propagates NaN, so NaN statistics stay visible.
        largest = self._var.max() if self._var.size else 0.0
        floor = 1e-9 * largest if largest > 0 else 1e-9
        self._var = np.maximum(self._var, floor)

    def _calculate_likelihood(self, class_idx, x):
        """Return the per-feature Gaussian density P(x | class)."""
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        num = np.exp(-(x - mean) ** 2 / (2 * var))
        denom = np.sqrt(2 * np.pi * var)
        return num / denom

    def _calculate_log_likelihood(self, class_idx, x):
        """Return the per-feature log of the Gaussian density.

        Computed directly instead of as ``log(exp(...))`` so that samples
        far from the class mean do not underflow to ``-inf``.
        """
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        return -0.5 * np.log(2 * np.pi * var) - (x - mean) ** 2 / (2 * var)

    def predict(self, X):
        """Return the predicted class of every sample in ``X`` as an array.

        ``X`` is iterated sample by sample, so it must yield rows.
        """
        y_pred = [self._classify_sample(x) for x in X]
        return np.array(y_pred)

    def _classify_sample(self, x):
        """Return the class with the highest log-posterior for sample ``x``."""
        posteriors = [
            np.log(self._priors[i])
            + np.sum(self._calculate_log_likelihood(i, x))
            for i in range(len(self._classes))
        ]
        return self._classes[np.argmax(posteriors)]
先用您的示例数据(DataFrame)运行 `fit`,可以看到拟合得到的均值全部为 nan:
# Same setup as in the question: features and labels as pandas DataFrames.
iris = datasets.load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.DataFrame(data=iris.target, columns=["Target"])

# Keep 20% of the rows for testing; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

nb = GaussianNaiveBayes()
nb.fit(X_train, y_train)

# Inspect the fitted per-class means.
nb._mean
array([[nan, nan, nan, nan],
[nan, nan, nan, nan],
[nan, nan, nan, nan]])
把输入改为 NumPy 数组之后,就可以正常预测了:
# Use the raw NumPy arrays of the Iris bunch instead of DataFrames.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Keep 20% of the rows for testing; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

nb = GaussianNaiveBayes()
nb.fit(X_train, y_train)

# Predict the class of every held-out sample.
nb.predict(X_test)
array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
0, 2, 2, 2, 2, 2, 0, 0])
【讨论】:
以上是关于从零开始的朴素贝叶斯分类器的主要内容,如果未能解决你的问题,请参考以下文章