1 sklearn简单例子
from sklearn import svm
# Minimal linear-SVM example: three 2-D training points with binary labels.
X = [[2, 0], [1, 1], [2, 3]]
y = [0, 0, 1]

# The pasted source used typographic quotes around 'linear', which is not
# valid Python; plain ASCII quotes are required.
clf = svm.SVC(kernel='linear')
clf.fit(X, y)
print(clf)

# get support vectors
print(clf.support_vectors_)

# get indices of support vectors
print(clf.support_)

# get number of support vectors for each class
print(clf.n_support_)
2 sklearn画出决定界限
print(__doc__)
import numpy as np
import pylab as pl
from sklearn import svm
# we create 40 separable points
np.random.seed(0)
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
Y = [0] * 20 + [1] * 20

# fit the model
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)

# get the separating hyperplane
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf.intercept_[0]) / w[1]

# plot the parallels to the separating hyperplane that pass through the
# support vectors (the first and the last entry of support_vectors_ are used)
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])

# Single-argument print calls behave the same on Python 2 and Python 3.
print("w:  %s" % (w,))
print("a:  %s" % (a,))
print("support_vectors_:  %s" % (clf.support_vectors_,))
print("clf.coef_:  %s" % (clf.coef_,))

# In scikit-learn coef_ attribute holds the vectors of the separating
# hyperplanes for linear models. It has shape (n_classes, n_features) if
# n_classes > 1 (multi-class one-vs-all) and (1, n_features) for binary
# classification.
#
# In this toy binary classification example, n_features == 2, hence
# w = coef_[0] is the vector orthogonal to the hyperplane (the hyperplane is
# fully defined by it + the intercept).
#
# To plot this hyperplane in the 2D case (any hyperplane of a 2D plane is a
# 1D line), we want to find a f as in y = f(x) = a.x + b. In this case a is
# the slope of the line and can be computed by a = -w[0] / w[1].

# plot the line, the points, and the nearest vectors to the plane
pl.plot(xx, yy, 'k-')
pl.plot(xx, yy_down, 'k--')
pl.plot(xx, yy_up, 'k--')
pl.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
           s=80, facecolors='none')
pl.scatter(X[:, 0], X[:, 1], c=Y, cmap=pl.cm.Paired)
pl.axis('tight')
pl.show()
5.2 支持向量机(SVM)算法(下)
1. SVM算法特性:
1.1 训练好的模型的算法复杂度是由支持向量的个数决定的,而不是由数据的维度决定的。所以SVM不太容易产生overfitting
1.2 SVM训练出来的模型完全依赖于支持向量(Support Vectors), 即使训练集里面所有非支持向量的点都被去除,重复训练过程,结果仍然会得到完全一样的模型。
1.3 一个SVM如果训练得出的支持向量个数比较小,SVM训练出的模型比较容易被泛化。
2. 线性不可分的情况 (linearly inseparable case)
2.1 数据集在空间中对应的向量不可被一个超平面区分开
2.2 两个步骤来解决:
2.2.1 利用一个非线性的映射把原数据集中的向量点转化到一个更高维度的空间中
2.2.2 在这个高维度的空间中找一个线性的超平面来根据线性可分的情况处理
2.3 如何利用非线性映射把原始数据转化到高维中?
2.3.1 例子:
3维输入向量:
转化到6维空间 Z 中去:
新的决策超平面:
其中W和Z是向量,这个超平面是线性的
解出W和b之后,并且带入回原方程:
2.3.2 思考问题:
2.3.2.1: 如何选择合理的非线性转化把数据转到高维度中?
2.3.2.2: 如何解决计算内积时算法复杂度非常高的问题?
2.3.3 使用核方法(kernel trick)
3. 核方法(kernel trick)
3.1 动机
在线性SVM中转化为最优化问题时求解的公式计算都是以内积(dot product)的形式出现的
,其中
是把训练集中的向量点转化到高维的非线性映射函数,因为内积的算法复杂
度非常大,所以我们利用核函数来取代计算非线性映射函数的内积
3.1 以下核函数和非线性映射函数的内积等同
3.2 常用的核函数(kernel functions)
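h度多项式核函数(polynomial kernel of degree h): $K(X_i, X_j) = (X_i \cdot X_j + 1)^h$
高斯径向基核函数(Gaussian radial basis function kernel): $K(X_i, X_j) = e^{-\|X_i - X_j\|^2 / 2\sigma^2}$
S型核函数(Sigmoid function kernel): $K(X_i, X_j) = \tanh(\kappa X_i \cdot X_j - \delta)$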
如何选择使用哪个kernel?
根据先验知识,比如图像分类,通常使用RBF,文字不使用RBF
尝试不同的kernel,根据结果准确度而定
3.3 核函数举例:
假设定义两个向量: x = (x1, x2, x3); y = (y1, y2, y3)
定义方程:f(x) = (x1x1, x1x2, x1x3, x2x1, x2x2, x2x3, x3x1, x3x2, x3x3)
K(x, y ) = (<x, y>)^2
假设x = (1, 2, 3); y = (4, 5, 6).
f(x) = (1, 2, 3, 2, 4, 6, 3, 6, 9)
f(y) = (16, 20, 24, 20, 25, 30, 24, 30, 36)
<f(x), f(y)> = 16 + 40 + 72 + 40 + 100+ 180 + 72 + 180 + 324 = 1024
K(x, y) = (4 + 10 + 18 ) ^2 = 32^2 = 1024
同样的结果,使用kernel方法计算容易很多
4. SVM扩展可解决多个类别分类问题
对于每个类,有一个当前类和其他类的二类分类器(one-vs-rest)
5.3 支持向量机(SVM)算法(下)应用
利用SVM进行人脸识别实例:
from __future__ import print_function
from time import time
import logging
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.datasets import fetch_lfw_people
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import RandomizedPCA
from sklearn.svm import SVC
print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

###############################################################################
# Download the data, if not already on disk and load it as numpy arrays
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# introspect the images arrays to find the shapes (for plotting)
n_samples, h, w = lfw_people.images.shape

# for machine learning we use the 2 data directly (as relative pixel
# positions info is ignored by this model)
X = lfw_people.data
n_features = X.shape[1]

# the label to predict is the id of the person
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)

###############################################################################
# Split into a training set and a test set: train_test_split holds out 25% of
# the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25)

###############################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150

print("Extracting the top %d eigenfaces from %d faces"
      % (n_components, X_train.shape[0]))
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
print("done in %0.3fs" % (time() - t0))

# Reshape each principal component back to image dimensions for plotting.
eigenfaces = pca.components_.reshape((n_components, h, w))

print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))

###############################################################################
# Train a SVM classification model
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
# NOTE(review): class_weight='auto' is the spelling accepted by the older
# scikit-learn releases this script's imports target; newer releases name
# this option 'balanced' -- confirm against the installed version.
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

###############################################################################
# Quantitative evaluation of the model quality on the test set
print("Predicting people's names on the test set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
###############################################################################
# Qualitative evaluation of the predictions using matplotlib
def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Helper function to plot a gallery of portraits.

    Draws the first ``n_row * n_col`` entries of ``images`` in a grid of
    grayscale subplots, each captioned with the matching entry of ``titles``.

    :param images: indexable collection of images; each entry is reshaped
        to ``(h, w)`` before display
    :param titles: indexable collection of captions, one per image
    :param h: image height used for the reshape
    :param w: image width used for the reshape
    :param n_row: number of grid rows
    :param n_col: number of grid columns
    """
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        # Empty tick tuples hide the axis ticks.
        plt.xticks(())
        plt.yticks(())
# plot the result of the prediction on a portion of the test set
def title(y_pred, y_test, target_names, i):
    """Build the two-line caption for test sample ``i``.

    Only the last whitespace-separated word of each name is shown.

    :param y_pred: predicted class indices
    :param y_test: true class indices
    :param target_names: class names indexed by class index
    :param i: index of the sample to caption
    :return: string of the form ``'predicted: <name>\\ntrue: <name>'``
    """
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue: %s' % (pred_name, true_name)
# Caption every test sample with its predicted and true name, then show the
# first grid of test portraits.
prediction_titles = list(
    title(y_pred, y_test, target_names, idx)
    for idx in range(y_pred.shape[0])
)
plot_gallery(X_test, prediction_titles, h, w)

# plot the gallery of the most significative eigenfaces
eigenface_titles = list(
    "eigenface %d" % idx for idx in range(eigenfaces.shape[0])
)
plot_gallery(eigenfaces, eigenface_titles, h, w)

plt.show()
6.1 神经网络算法(Neural Networks)(上)
1. 背景:
1.1 以人脑中的神经网络为启发,历史上出现过很多不同版本
1.2 最著名的算法是1980年的 backpropagation
2. 多层向前神经网络(Multilayer Feed-Forward Neural Network)
2.1 Backpropagation被使用在多层向前神经网络上
2.2 多层向前神经网络由以下部分组成:
输入层(input layer), 隐藏层 (hidden layers), 输出层 (output layer)
2.3 每层由单元(units)组成
2.4 输入层(input layer)是由训练集的实例特征向量传入
2.5 经过连接结点的权重(weight)传入下一层,一层的输出是下一层的输入
2.6 隐藏层的个数可以是任意的,输入层有一层,输出层有一层
2.7 每个单元(unit)也可以被称作神经结点,根据生物学来源定义
2.8 以上称为2层的神经网络(输入层不算)
2.8 一层中加权的求和,然后根据非线性方程转化输出
2.9 作为多层向前神经网络,理论上,如果有足够多的隐藏层(hidden layers) 和足够大的训练集, 可以模
拟出任何方程
3. 设计神经网络结构
3.1 使用神经网络训练数据之前,必须确定神经网络的层数,以及每层单元的个数
3.2 特征向量在被传入输入层时通常被先标准化(normalize)到0和1之间 (为了加速学习过程)
3.3 离散型变量可以被编码成每一个输入单元对应一个特征值可能赋的值
比如:特征值A可能取三个值(a0, a1, a2), 可以使用3个输入单元来代表A。
如果A=a0, 那么代表a0的单元值就取1, 其他取0;
如果A=a1, 那么代表a1的单元值就取1,其他取0,以此类推
3.4 神经网络即可以用来做分类(classification)问题,也可以解决回归(regression)问题
3.4.1 对于分类问题,如果是2类,可以用一个输出单元表示(0和1分别代表2类)
如果多于2类,每一个类别用一个输出单元表示
所以输出层的单元数量通常等于类别的数量
3.4.2 没有明确的规则来设计最好有多少个隐藏层
3.4.2.1 根据实验测试和误差,以及准确度来实验并改进
4. 交叉验证方法(Cross-Validation)
K-fold cross validation
5. Backpropagation算法
5.1 通过迭代性的来处理训练集中的实例
5.2 对比经过神经网络后输出层预测值(predicted value)与真实值(target value)之间的误差
5.3 反方向(从输出层=>隐藏层=>输入层)来以最小化误差(error)来更新每个连接的权重(weight)
5.4 算法详细介绍
输入:D:数据集,l 学习率(learning rate), 一个多层前向神经网络
输出:一个训练好的神经网络(a trained neural network)
5.4.1 初始化权重(weights)和偏向(bias): 随机初始化在-1到1之间,或者-0.5到0.5之间,每个单元有
一个偏向
5.4.2 对于每一个训练实例X,执行以下步骤:
5.4.2.1: 由输入层向前传送
5.4.2.2 根据误差(error)反向传送
对于输出层:
对于隐藏层:
权重更新:
偏向更新
5.4.3 终止条件
5.4.3.1 权重的更新低于某个阈值
5.4.3.2 预测的错误率低于某个阈值
5.4.3.3 达到预设一定的循环次数
6. Backpropagation 算法举例
对于输出层:
对于隐藏层:
权重更新:
偏向更新
6.2 神经网络算法(Neural Networks)应用(上)
1. 关于非线性转化方程(non-linear transformation function)
sigmoid函数(S 曲线)用来作为activation function:
1.1 双曲函数(tanh)
1.2 逻辑函数(logistic function)
2. 实现一个简单的神经网络算法
import numpy as np
def tanh(x):
    """Hyperbolic-tangent activation."""
    return np.tanh(x)


def tanh_deriv(x):
    """Derivative of tanh evaluated at the pre-activation value ``x``."""
    return 1.0 - np.tanh(x) * np.tanh(x)


def logistic(x):
    """Logistic (sigmoid) activation."""
    return 1 / (1 + np.exp(-x))


def logistic_derivative(x):
    """Derivative of the logistic function at the pre-activation value ``x``."""
    return logistic(x) * (1 - logistic(x))


class NeuralNetwork:
    """Small fully-connected feed-forward network trained by backpropagation.

    A constant bias input of 1 is appended to the input vector, and every
    hidden layer carries one extra unit, so the weight matrix feeding hidden
    layer ``i`` has shape ``(layers[i - 1] + 1, layers[i] + 1)`` and the one
    feeding the output layer has shape ``(layers[-2] + 1, layers[-1])``.
    """

    def __init__(self, layers, activation='tanh'):
        """
        :param layers: A list containing the number of units in each layer.
        Should be at least two values
        :param activation: The activation function to be used. Can be
        "logistic" or "tanh"
        :raises ValueError: if ``activation`` is not recognised or fewer
        than two layers are given
        """
        if activation == 'logistic':
            self.activation = logistic
            self.activation_deriv = logistic_derivative
        elif activation == 'tanh':
            self.activation = tanh
            self.activation_deriv = tanh_deriv
        else:
            # Previously an unknown name left the instance without an
            # activation and only failed later inside fit()/predict().
            raise ValueError("activation must be 'logistic' or 'tanh'")
        if len(layers) < 2:
            raise ValueError("layers must contain at least two values")

        # Weights are drawn uniformly from [-0.25, 0.25).
        self.weights = []
        for i in range(1, len(layers) - 1):
            self.weights.append(
                (2 * np.random.random((layers[i - 1] + 1, layers[i] + 1)) - 1) * 0.25)
        # Exactly one matrix feeds the output layer. Indexing from the end
        # keeps this correct for any depth, including a two-layer network
        # where the loop above does not run.
        self.weights.append(
            (2 * np.random.random((layers[-2] + 1, layers[-1])) - 1) * 0.25)

    def fit(self, X, y, learning_rate=0.2, epochs=10000):
        """Train with stochastic gradient descent, one random sample per epoch.

        :param X: training samples, one row per sample
        :param y: targets, indexable by sample number
        :param learning_rate: step size of each weight update
        :param epochs: number of single-sample updates to perform
        """
        X = np.atleast_2d(X)
        temp = np.ones([X.shape[0], X.shape[1] + 1])
        temp[:, 0:-1] = X  # adding the bias unit to the input layer
        X = temp
        y = np.array(y)

        for _ in range(epochs):
            i = np.random.randint(X.shape[0])
            a = [X[i]]  # activations, a[0] is the network input
            z = []      # pre-activations, z[l] produces a[l + 1]

            # Forward pass: keep the pre-activations because the activation
            # derivatives must be evaluated there, not on the outputs.
            for l in range(len(self.weights)):
                z.append(np.dot(a[l], self.weights[l]))
                a.append(self.activation(z[l]))

            error = y[i] - a[-1]  # error at the output layer
            deltas = [error * self.activation_deriv(z[-1])]

            # Backward pass, from the last hidden layer down to the first.
            for l in range(len(a) - 2, 0, -1):
                deltas.append(
                    deltas[-1].dot(self.weights[l].T) * self.activation_deriv(z[l - 1]))
            deltas.reverse()

            for j in range(len(self.weights)):
                layer = np.atleast_2d(a[j])
                delta = np.atleast_2d(deltas[j])
                self.weights[j] += learning_rate * layer.T.dot(delta)

    def predict(self, x):
        """Return the output-layer activations for one input vector ``x``."""
        x = np.array(x)
        temp = np.ones(x.shape[0] + 1)
        temp[0:-1] = x  # last entry stays 1: the bias input
        a = temp
        for l in range(0, len(self.weights)):
            a = self.activation(np.dot(a, self.weights[l]))
        return a
6.3 神经网络算法(Neural Networks)应用(下)
1. 简单非线性关系数据集测试(XOR):
X: Y
0 0 0
0 1 1
1 0 1
1 1 0
Code:
from NeuralNetwork import NeuralNetwork
import numpy as np
# Train a 2-2-1 tanh network on the XOR truth table and print its output for
# each of the four inputs.
nn = NeuralNetwork([2, 2, 1], 'tanh')
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 1, 1, 0])
nn.fit(X, y)
for i in [[0, 0], [0, 1], [1, 0], [1, 1]]:
    print(i, nn.predict(i))
2. 手写数字识别:
每个图片8x8
识别数字:0,1,2,3,4,5,6,7,8,9
Code:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelBinarizer
from NeuralNetwork import NeuralNetwork
from sklearn.cross_validation import train_test_split
digits = load_digits()
X = digits.data
y = digits.target
X -= X.min()  # normalize the values to bring them into the range 0-1
X /= X.max()

# 64 inputs, 100 hidden units, 10 outputs.
nn = NeuralNetwork([64, 100, 10], 'logistic')
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Binarize the integer labels so each sample's target is a vector.
labels_train = LabelBinarizer().fit_transform(y_train)
labels_test = LabelBinarizer().fit_transform(y_test)

print("start fitting")
nn.fit(X_train, labels_train, epochs=3000)

# The predicted digit is the index of the largest network output.
predictions = []
for i in range(X_test.shape[0]):
    o = nn.predict(X_test[i])
    predictions.append(np.argmax(o))

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
7.1 简单线性回归 (Simple Linear Regression)上
0. 前提介绍:
为什么需要统计量?
统计量:描述数据特征
0.1 集中趋势衡量
0.1.1均值(平均数,平均值)(mean)
{6, 2, 9, 1, 2}
(6 + 2 + 9 + 1 + 2) / 5 = 20 / 5 = 4
0.1.2中位数 (median): 将数据中的各个数值按照大小顺序排列,居于中间位置的变量
0.1.2.1. 给数据排序:1, 2, 2, 6, 9
0.1.2.2. 找出位置处于中间的变量:2
当n为奇数的时候:直接取位置处于中间的变量
当n为偶数的时候,取中间两个量的平均值
0.1.2众数 (mode):数据中出现次数最多的数
0.2
0.2.1. 离散程度衡量
0.2.1.1方差(variance)
{6, 2, 9, 1, 2}
(1) (6 - 4)^2 + (2 - 4) ^2 + (9 - 4)^2 + (1 - 4)^2 + (2 - 4)^2
= 4 + 4 + 25 + 9 + 4
= 46
(2) n - 1 = 5 - 1 = 4
(3) 46 / 4 = 11.5
0.2.1.2标准差 (standard deviation)
s = sqrt(11.5) = 3.39
1. 介绍:回归(regression) Y变量为连续数值型(continuous numerical variable)
如:房价,人数,降雨量
分类(Classification): Y变量为类别型(categorical variable)
如:颜色类别,电脑品牌,有无信誉
2. 简单线性回归(Simple Linear Regression)
2.1 很多做决定过过程通常是根据两个或者多个变量之间的关系
2.3 回归分析(regression analysis)用来建立方程模拟两个或者多个变量之间如何关联
2.4 被预测的变量叫做:因变量(dependent variable), y, 输出(output)
2.5 被用来进行预测的变量叫做: 自变量(independent variable), x, 输入(input)
3. 简单线性回归介绍
3.1 简单线性回归包含一个自变量(x)和一个因变量(y)
3.2 以上两个变量的关系用一条直线来模拟
3.3 如果包含两个以上的自变量,则称作多元回归分析(multiple regression)
4. 简单线性回归模型
4.1 被用来描述因变量(y)和自变量(X)以及偏差(error)之间关系的方程叫做回归模型
4.2 简单线性回归的模型是:y = β0 + β1x + ε
其中:β0、β1 是参数,ε 是偏差
5. 简单线性回归方程
E(y) = β0+β1x
这个方程对应的图像是一条直线,称作回归线
其中,β0是回归线的截距
β1是回归线的斜率
E(y)是在一个给定x值下y的期望值(均值)
6. 正向线性关系:
7. 负向线性关系:
8. 无关系
9. 估计的简单线性回归方程
ŷ=b0+b1x
这个方程叫做估计线性方程(estimated regression line)
其中,b0是估计线性方程的纵截距
b1是估计线性方程的斜率
ŷ是在自变量x等于一个给定值的时候,y的估计值
10. 线性回归分析流程:
11. 关于偏差ε的假定
11.1 是一个随机的变量,均值为0
11.2 ε的方差(variance)对于所有的自变量x是一样的
11.3 ε的值是独立的
11.4 ε满足正态分布
7.1 简单线性回归 (Simple Linear Regression)下
1. 简单线性回归模型举例:
汽车卖家做电视广告数量与卖出的汽车数量:
1.1 如何找出适合简单线性回归模型的最佳回归线?
使sum of squares最小
1.1.2 计算
分子 = (1-2)(14-20)+(3-2)(24-20)+(2-2)(18-20)+(1-2)(17-20)+(3-2)(27-20)
= 6 + 4 + 0 + 3 + 7
= 20
分母 = (1-2)^2 + (3-2)^2 + (2-2)^2 + (1-2)^2 + (3-2)^2
= 1 + 1 + 0 + 1 + 1
= 4
b1 = 20/4 =5
b0 = 20 - 5*2 = 20 - 10 = 10
1.2 预测:
假设有一周广告数量为6,预测的汽车销售量是多少?
x_given = 6
Y_hat = 5*6 + 10 = 40
1.3 Python实现:
import numpy as np
def fitSLR(x, y):
    """Fit a simple linear regression ``y = b0 + b1 * x`` by least squares.

    :param x: sequence of independent-variable values
    :param y: sequence of dependent-variable values, same length as ``x``
    :return: tuple ``(b0, b1)`` with the intercept and the slope
    :raises ValueError: if the inputs are empty, differ in length, or all
        ``x`` values are identical (the slope is then undefined)
    """
    n = len(x)
    if n == 0 or n != len(y):
        raise ValueError("x and y must be non-empty and of equal length")

    # The means are loop-invariant; compute them once.
    x_mean = np.mean(x)
    y_mean = np.mean(y)

    numerator = 0
    denominator = 0
    for i in range(0, n):
        numerator += (x[i] - x_mean) * (y[i] - y_mean)
        denominator += (x[i] - x_mean) ** 2
    if denominator == 0:
        raise ValueError("x has zero variance; the slope is undefined")

    b1 = numerator / float(denominator)
    # The least-squares line passes through (x_mean, y_mean), so the
    # intercept is y_mean - b1 * x_mean. The earlier y_mean / x_mean was
    # only right by coincidence on the sample data.
    b0 = y_mean - b1 * x_mean
    return b0, b1
def predict(x, b0, b1):
    """Evaluate the fitted line at ``x``.

    :param x: value of the independent variable
    :param b0: intercept
    :param b1: slope
    :return: ``b0 + x * b1``
    """
    return b0 + x * b1
# Sample data: number of TV ads per week (x) and cars sold (y).
x = [1, 3, 2, 1, 3]
y = [14, 24, 18, 17, 27]

b0, b1 = fitSLR(x, y)
print("intercept: %s  slope: %s" % (b0, b1))

# Predict from the named test input rather than a repeated literal.
x_test = 6
y_test = predict(x_test, b0, b1)
print("y_test: %s" % (y_test,))
7.3 多元回归分析(multiple regression)
1. 与简单线性回归区别(simple linear regression)
多个自变量(x)
2. 多元回归模型
y=β0+β1x1+β2x2+ ... +βpxp+ε
其中:β0,β1,β2... βp是参数
ε是误差值
3. 多元回归方程
E(y)=β0+β1x1+β2x2+ ... +βpxp
4. 估计多元回归方程:
y_hat=b0+b1x1+b2x2+ ... +bpxp
一个样本被用来计算β0,β1,β2... βp的点估计b0, b1, b2,..., bp
5. 估计流程 (与简单线性回归类似)
6. 估计方法
使sum of squares最小
运算与简单线性回归类似,涉及到线性代数和矩阵代数的运算
7. 例子
一家快递公司送货:X1: 运输里程 X2: 运输次数 Y:总运输时间
Driving
Assignment
|
X1=Miles
Traveled
|
X2=Number of Deliveries
|
Y= Travel Time (Hours)
|
1
|
100
|
4
|
9.3
|
2
|
50
|
3
|
4.8
|
3
|
100
|
4
|
8.9
|
4
|
100
|
2
|
6.5
|
5
|
50
|
2
|
4.2
|
6
|
80
|
2
|
6.2
|
7
|
75
|
3
|
7.4
|
8
|
65
|
4
|
6.0
|
9
|
90
|
3
|
7.6
|
10
|
90
|
2
|
6.1
|
Time = b0+ b1*Miles + b2 * Deliveries
Time = -0.869 + 0.0611 Miles + 0.923 Deliveries
8. 描述参数含义
b1: 平均每多运送一英里,运输时间延长0.0611 小时
b2: 平均每多一次运输,运输时间延长 0.923 小时
9. 预测
如果一个运输任务是跑102英里,运输6次,预计多少小时?
Time = -0.869 +0.0611 *102+ 0.923 * 6
= 10.9 (小时)
10. 如果自变量中有分类型变量(categorical data) , 如何处理?
英里数 |
次数 |
车型 |
时间 |
100 |
4 |
1 |
9.3 |
50 |
3 |
0 |
4.8 |
100 |
4 |
1 |
8.9 |
100 |
2 |
2 |
6.5 |
50 |
2 |
2 |
4.2 |
80 |
2 |
1 |
6.2 |
75 |
3 |
1 |
7.4 |
65 |
4 |
0 |
6 |
90 |
3 |
0 |
7.6 |
11. 关于误差的分布
误差ε是一个随机变量,均值为0
ε的方差对于所有的自变量来说相等
所有ε的值是独立的
ε满足正态分布,并且通过β0+β1x1+β2x2+ ... +βpxp反映y的期望值
7.4 多元回归分析(multiple regression)应用
1. 例子
一家快递公司送货:X1: 运输里程 X2: 运输次数 Y:总运输时间
Driving
Assignment
|
X1=Miles
Traveled
|
X2=Number of Deliveries
|
Y= Travel Time (Hours)
|
1
|
100
|
4
|
9.3
|
2
|
50
|
3
|
4.8
|
3
|
100
|
4
|
8.9
|
4
|
100
|
2
|
6.5
|
5
|
50
|
2
|
4.2
|
6
|
80
|
2
|
6.2
|
7
|
75
|
3
|
7.4
|
8
|
65
|
4
|
6.0
|
9
|
90
|
3
|
7.6
|
10
|
90
|
2
|
6.1
|
目的,求出b0, b1,.... bp:
y_hat=b0+b1x1+b2x2+ ... +bpxp
2. Python代码:
from numpy import genfromtxt
import numpy as np
from sklearn import datasets, linear_model
dataPath = r"D:\MaiziEdu\DeepLearningBasics_MachineLearning\Datasets\Delivery.csv"
deliveryData = genfromtxt(dataPath, delimiter=',')

print("data")
print(deliveryData)

# Every column but the last is a feature; the last column is the target.
X = deliveryData[:, :-1]
Y = deliveryData[:, -1]

print("X:")
print(X)
print("Y: ")
print(Y)

regr = linear_model.LinearRegression()
regr.fit(X, Y)

print("coefficients")
print(regr.coef_)
print("intercept: ")
print(regr.intercept_)

# predict() expects a 2-D array of shape (n_samples, n_features), so the
# single sample is wrapped in an outer list.
xPred = [[102, 6]]
yPred = regr.predict(xPred)
print("predicted y: ")
print(yPred)
7.5 非线性回归 logistic regression
1. 概率:
1.1 定义 概率 (Probability): 对一件事情发生的可能性的衡量
1.2 范围 0 <= P <= 1
1.3 计算方法:
1.3.1 根据个人置信
1.3.2 根据历史数据
1.3.3 根据模拟数据
1.4 条件概率:
2. Logistic Regression (逻辑回归)
2.1 例子
h(x) > 0.5
h(x) > 0.2
2.2 基本模型
测试数据为X(x0,x1,x2···xn)
要学习的参数为: Θ(θ0,θ1,θ2,···θn)
向量表示:
处理二值数据,引入Sigmoid函数时曲线平滑化
预测函数:
用概率表示:
正例(y=1):
反例(y=0):
2.3 Cost函数
线性回归:
找到合适的 θ0,θ1使上式最小
Logistic regression:
Cost函数:
目标:找到合适的 θ0,θ1使上式最小
2.4 解法:梯度下降(gradient descent)
更新法则:
学习率
同时对所有的θ进行更新
重复更新直到收敛
7.6 非线性回归应用:logistic regression application
Python 实现:
import numpy as np
import random
# m denotes the number of examples here, not the number of features
def gradientDescent(x, y, theta, alpha, m, numIterations):
    """Minimise the squared-error cost of ``x . theta`` against ``y``.

    Runs batch gradient descent for a fixed number of iterations and prints
    the cost at every iteration.

    :param x: design matrix, one row per example
    :param y: target vector
    :param theta: initial parameter vector
    :param alpha: learning rate
    :param m: number of examples (not the number of features)
    :param numIterations: number of update steps to perform
    :return: the parameter vector after the last update
    """
    xTrans = x.transpose()
    for i in range(0, numIterations):
        hypothesis = np.dot(x, theta)
        loss = hypothesis - y
        # avg cost per example (the 2 in 2*m doesn't really matter here.
        # But to be consistent with the gradient, I include it)
        cost = np.sum(loss ** 2) / (2 * m)
        print("Iteration %d | Cost: %f" % (i, cost))
        # avg gradient per example
        gradient = np.dot(xTrans, loss) / m
        # update
        theta = theta - alpha * gradient
    return theta
def genData(numPoints, bias, variance):
    """Generate noisy points along the line ``y = i + bias``.

    :param numPoints: number of points to generate
    :param bias: constant offset added to every target
    :param variance: scale applied to the uniform [0, 1) noise term
    :return: tuple ``(x, y)`` where ``x`` has shape ``(numPoints, 2)`` with a
        constant 1 in column 0 and the index ``i`` in column 1, and ``y`` has
        shape ``(numPoints,)``
    """
    x = np.zeros(shape=(numPoints, 2))
    y = np.zeros(shape=numPoints)
    # basically a straight line
    for i in range(0, numPoints):
        # bias feature
        x[i][0] = 1
        x[i][1] = i
        # our target variable
        y[i] = (i + bias) + random.uniform(0, 1) * variance
    return x, y
# Hyper-parameters of the run.
alpha = 0.0005
numIterations = 100000

# gen 100 points with a bias of 25 and 10 variance as a bit of noise
x, y = genData(100, 25, 10)
m, n = np.shape(x)

# Start from an all-ones parameter vector and fit it in place.
theta = gradientDescent(x, y, np.ones(n), alpha, m, numIterations)
print(theta)
7.7 回归中的相关度和R平方值
1. 皮尔逊相关系数 (Pearson Correlation Coefficient):
1.1 衡量两个值线性相关强度的量
1.2 取值范围 [-1, 1]:
正向相关: >0, 负向相关:<0, 无相关性:=0
1.3
2. 计算方法举例:
X |
Y |
1 |
10 |
3 |
12 |
8 |
24 |
7 |
21 |
9 |
34 |
|
|
3. 其他例子:
4. R平方值:
4.1定义:决定系数,反应因变量的全部变异能通过回归关系被自变量解释的比例。
4.2 描述:如R平方为0.8,则表示回归关系可以解释因变量80%的变异。换句话说,如果我们能控制自变量不变,则因变量的变异程度会减少80%
4.3: 简单线性回归:R^2 = r * r
多元线性回归:
5. R平方也有其局限性:R平方随着自变量的增加会变大,R平方和样本量是有关系的。因此,我们要对R平方进行修正。修正的方法:
7.8 回归中的相关度和R平方值应用
Python实现:
import numpy as np
from astropy.units import Ybarn
import math
def computeCorrelation(X, Y):
    """Return the Pearson correlation coefficient of ``X`` and ``Y``.

    :param X: sequence of numbers
    :param Y: sequence of numbers; at least as long as ``X``
    :return: sum of products of deviations divided by the square root of
        the product of the two sums of squared deviations
    """
    xBar = np.mean(X)
    yBar = np.mean(Y)
    # Sum of cross-products of deviations and the two sums of squared
    # deviations. The earlier SSR/SST names suggested regression sums of
    # squares, which these are not.
    covSum = 0
    varX = 0
    varY = 0
    for i in range(0, len(X)):
        diffXXBar = X[i] - xBar
        diffYYBar = Y[i] - yBar
        covSum += (diffXXBar * diffYYBar)
        varX += diffXXBar ** 2
        varY += diffYYBar ** 2

    norm = math.sqrt(varX * varY)
    return covSum / norm
# Sample data from the worked example in the notes.
testX = [1, 3, 8, 7, 9]
testY = [10, 12, 24, 21, 34]

print(computeCorrelation(testX, testY))
8.1 聚类(Clustering) K-means算法
1. 归类:
聚类(clustering) 属于非监督学习 (unsupervised learning)
无类别标记(class label)
2. 举例:
3. K-means 算法:
3.1 Clustering 中的经典算法,数据挖掘十大经典算法之一
3.2 算法接受参数 k ;然后将事先输入的n个数据对象划分为 k个聚类以便使得所获得的聚类满足:同一
聚类中的对象相似度较高;而不同聚类中的对象相似度较小。
3.3 算法思想:
以空间中k个点为中心进行聚类,对最靠近他们的对象归类。通过迭代的方法,逐次更新各聚类中心
的值,直至得到最好的聚类结果
3.4 算法描述:
(1)适当选择c个类的初始中心;
(2)在第k次迭代中,对任意一个样本,求其到c各中心的距离,将该样本归到距离最短的中心所在
的类;
(3)利用均值等方法更新该类的中心值;
(4)对于所有的c个聚类中心,如果利用(2)(3)的迭代法更新后,值保持不变,则迭代结束,
否则继续迭代。
3.5 算法流程:
输入:k, data[n];
(1) 选择k个初始中心点,例如c[0]=data[0],…c[k-1]=data[k-1];
(2) 对于data[0]….data[n], 分别与c[0]…c[k-1]比较,假定与c[i]差值最少,就标记为i;
(3) 对于所有标记为i点,重新计算c[i]={ 所有标记为i的data[j]之和}/标记为i的个数;
(4) 重复(2)(3),直到所有c[i]值的变化小于给定阈值。
4. 举例:
停止
优点:速度快,简单
缺点:最终结果跟初始点选择相关,容易陷入局部最优,需知道k值
8.2 聚类(Clustering) K-means算法应用
import numpy as np
# Function: K Means
# -------------
# K-Means is an algorithm that takes in a dataset and a constant
# k and returns k centroids (which define clusters of data in the
# dataset which are similar to one another).
def kmeans(X, k, maxIt):
    """Cluster the rows of ``X`` into ``k`` groups with k-means.

    :param X: 2-D numpy array, one data point per row
    :param k: number of clusters
    :param maxIt: iteration limit handed to ``shouldStop``
    :return: array of shape ``(numPoints, numDim + 1)``: the data with the
        assigned cluster label (1..k) in the last column
    :raises ValueError: if ``k`` is not between 1 and the number of points
    """
    numPoints, numDim = X.shape
    if k < 1 or k > numPoints:
        raise ValueError("k must be between 1 and the number of data points")

    # Working copy of the data with one extra trailing column for the label.
    dataSet = np.zeros((numPoints, numDim + 1))
    dataSet[:, :-1] = X

    # Seed the centroids with the first k data points. The earlier code
    # always took the first two rows, which only worked for k == 2. The
    # copy keeps centroid labels from being written into dataSet.
    centroids = dataSet[0:k, :].copy()
    # Label the initial centroids 1..k
    centroids[:, -1] = np.arange(1, k + 1)

    # Initialize book keeping vars.
    iterations = 0
    oldCentroids = None

    # Run the main k-means algorithm
    while not shouldStop(oldCentroids, centroids, iterations, maxIt):
        print("iteration: \n%s" % (iterations,))
        print("dataSet: \n%s" % (dataSet,))
        print("centroids: \n%s" % (centroids,))
        # Save old centroids for convergence test. Book keeping.
        oldCentroids = np.copy(centroids)
        iterations += 1

        # Assign labels to each datapoint based on centroids
        updateLabels(dataSet, centroids)

        # Assign centroids based on datapoint labels
        centroids = getCentroids(dataSet, k)

    # The labels are the last column of the returned array.
    return dataSet
# Function: Should Stop
# -------------
# Returns True or False if k-means is done. K-means terminates either
# because it has run a maximum number of iterations OR the centroids
# stop changing.
def shouldStop(oldCentroids, centroids, iterations, maxIt):
    """Decide whether the k-means loop is finished.

    :param oldCentroids: centroids from the previous iteration, or ``None``
        before the first iteration
    :param centroids: current centroids
    :param iterations: number of iterations completed so far
    :param maxIt: maximum number of iterations to run
    :return: True once ``maxIt`` iterations have run or the centroids are
        unchanged from the previous iteration, otherwise False
    """
    # ">=" rather than ">": with ">" the loop ran maxIt + 1 iterations.
    if iterations >= maxIt:
        return True
    if oldCentroids is None:
        return False
    return np.array_equal(oldCentroids, centroids)
# Function: Get Labels
# -------------
# Update a label for each piece of data in the dataset.
def updateLabels(dataSet, centroids):
    """Relabel every row of ``dataSet`` in place with its nearest centroid.

    :param dataSet: 2-D array whose last column holds the label; all other
        columns are the point's coordinates
    :param centroids: 2-D array in the same layout, one centroid per row
    """
    # For each element in the dataset, chose the closest centroid.
    # Make that centroid the element's label.
    numPoints, numDim = dataSet.shape
    for i in range(0, numPoints):
        dataSet[i, -1] = getLabelFromClosestCentroid(dataSet[i, :-1], centroids)
def getLabelFromClosestCentroid(dataSetRow, centroids):
    """Return the label of the centroid nearest to ``dataSetRow``.

    Distance is the Euclidean norm; on a tie the earlier centroid wins.
    The minimum distance found is printed.

    :param dataSetRow: coordinates of one point (without a label column)
    :param centroids: 2-D array, one centroid per row, label in last column
    :return: the label stored in the last column of the nearest centroid
    """
    label = centroids[0, -1]
    minDist = np.linalg.norm(dataSetRow - centroids[0, :-1])
    for i in range(1, centroids.shape[0]):
        dist = np.linalg.norm(dataSetRow - centroids[i, :-1])
        if dist < minDist:
            minDist = dist
            label = centroids[i, -1]
    print("minDist: %s" % (minDist,))
    return label
# Function: Get Centroids
# -------------
# Returns k random centroids, each of dimension n.
def getCentroids(dataSet, k):
    """Recompute the ``k`` centroids from the current labels.

    :param dataSet: 2-D array whose last column holds labels 1..k and whose
        other columns are coordinates
    :param k: number of clusters
    :return: array of shape ``(k, dataSet.shape[1])``; row ``i - 1`` holds
        the centroid for label ``i`` with the label in the last column
    """
    # Each centroid is the arithmetic mean of the points that carry its
    # label. If no point carries the label, the centroid is re-initialised
    # to a randomly chosen data point instead of becoming NaN.
    result = np.zeros((k, dataSet.shape[1]))
    for i in range(1, k + 1):
        oneCluster = dataSet[dataSet[:, -1] == i, :-1]
        if oneCluster.shape[0] == 0:
            pick = np.random.randint(dataSet.shape[0])
            result[i - 1, :-1] = dataSet[pick, :-1]
        else:
            result[i - 1, :-1] = np.mean(oneCluster, axis=0)
        result[i - 1, -1] = i
    return result
# Four 2-D sample points forming two visually separate pairs.
x1 = np.array([1, 1])
x2 = np.array([2, 1])
x3 = np.array([4, 3])
x4 = np.array([5, 4])
testX = np.vstack((x1, x2, x3, x4))

result = kmeans(testX, 2, 10)
print("final result:")
print(result)
8.3 聚类(Clustering) hierarchical clustering 层次聚类
假设有N个待聚类的样本,对于层次聚类来说,步骤:
1、(初始化)把每个样本归为一类,计算每两个类之间的距离,也就是样本与样本之间的相似度;
2、寻找各个类之间最近的两个类,把他们归为一类(这样类的总数就少了一个);
3、重新计算新生成的这个类与各个旧类之间的相似度;
4、重复2和3直到所有样本点都归为一类,结束
整个聚类过程其实是建立了一棵树,在建立的过程中,可以通过在第二步上设置一个阈值,当最近的两个类的距离大于这个阈值,则认为迭代可以终止。另外关键的一步就是第三步,如何判断两个类之间的相似度有不少种方法。这里介绍一下三种:
SingleLinkage:又叫做 nearest-neighbor ,就是取两个类中距离最近的两个样本的距离作为这两个集合的距离,也就是说,最近两个样本之间的距离越小,这两个类之间的相似度就越大。容易造成一种叫做 Chaining 的效果,两个 cluster 明明从“大局”上离得比较远,但是由于其中个别的点距离比较近就被合并了,并且这样合并之后 Chaining 效应会进一步扩大,最后会得到比较松散的 cluster 。
CompleteLinkage:这个则完全是 Single Linkage 的反面极端,取两个集合中距离最远的两个点的距离作为两个集合的距离。其效果也是刚好相反的,限制非常大,两个 cluster 即使已经很接近了,但是只要有不配合的点存在,就顽固到底,老死不相合并,也是不太好的办法。这两种相似度的定义方法的共同问题就是指考虑了某个有特点的数据,而没有考虑类内数据的整体特点。
Average-linkage:这种方法就是把两个集合中的点两两的距离全部放在一起求一个平均值,相对也能得到合适一点的结果。
average-linkage的一个变种就是取两两距离的中值,与取均值相比更加能够解除个别偏离样本对结果的干扰。
8.4 聚类(Clustering) hierarchical clustering 层次聚类应用
from numpy import *
"""
Code for hierarchical clustering, modified from
Programming Collective Intelligence by Toby Segaran
(O‘Reilly Media 2007, page 33).
"""
class cluster_node:
    """One node of the hierarchical-clustering tree."""

    def __init__(self, vec, left=None, right=None, distance=0.0, id=None, count=1):
        """
        :param vec: feature vector stored on the node
        :param left: left child node, or None
        :param right: right child node, or None
        :param distance: distance value stored on the node
        :param id: identifier stored on the node
        :param count: only used for weighted average
        """
        self.left = left
        self.right = right
        self.vec = vec
        self.id = id
        self.distance = distance
        self.count = count  # only used for weighted average
def L2dist(v1, v2):
    """Return the Euclidean (L2) distance between arrays ``v1`` and ``v2``."""
    return sqrt(sum((v1 - v2) ** 2))
def L1dist(v1, v2):
    """Return the Manhattan (L1) distance between arrays ``v1`` and ``v2``."""
    return sum(abs(v1 - v2))
# def Chi2dist(v1,v2):
# return sqrt(sum((v1-v2)**2))
def hcluster(features, distance=L2dist):
    """Build a cluster tree by repeatedly merging the two closest clusters.

    :param features: sequence of feature vectors, one per row
    :param distance: function taking two vectors and returning a distance
    :return: the root ``cluster_node`` of the tree
    :raises ValueError: if ``features`` is empty
    """
    # cluster the rows of the "features" matrix
    if len(features) == 0:
        raise ValueError("features must contain at least one row")

    distances = {}
    currentclustid = -1

    # clusters are initially just the individual rows
    clust = [cluster_node(array(features[i]), id=i) for i in range(len(features))]

    while len(clust) > 1:
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)

        # loop through every pair looking for the smallest distance
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                # distances is the cache of distance calculations
                if (clust[i].id, clust[j].id) not in distances:
                    distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)

                d = distances[(clust[i].id, clust[j].id)]

                if d < closest:
                    closest = d
                    lowestpair = (i, j)

        # calculate the average of the two clusters
        first = clust[lowestpair[0]]
        second = clust[lowestpair[1]]
        mergevec = [(first.vec[n] + second.vec[n]) / 2.0
                    for n in range(len(clust[0].vec))]

        # create the new cluster
        newcluster = cluster_node(array(mergevec), left=first,
                                  right=second,
                                  distance=closest, id=currentclustid)

        # cluster ids that weren't in the original set are negative
        currentclustid -= 1
        # Delete the higher index first so the lower index stays valid.
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)

    return clust[0]
def extract_clusters(clust, dist):
    """Return the highest sub-trees whose ``distance`` is below ``dist``.

    :param clust: root node of a tree; nodes expose ``distance``, ``left``
        and ``right``
    :param dist: distance threshold
    :return: list of nodes; ``[clust]`` itself when its distance is already
        below the threshold
    """
    # extract list of sub-tree clusters from hcluster tree with distance<dist
    if clust.distance < dist:
        # we have found a cluster subtree
        return [clust]

    # check the right and left branches
    cl = []
    cr = []
    if clust.left is not None:
        cl = extract_clusters(clust.left, dist=dist)
    if clust.right is not None:
        cr = extract_clusters(clust.right, dist=dist)
    return cl + cr
def get_cluster_elements(clust):
    """Return the ids of all leaves below ``clust``, left branch first.

    :param clust: node exposing ``id``, ``left`` and ``right``; a
        non-negative id marks a leaf
    :return: list of leaf ids
    """
    # return ids for elements in a cluster sub-tree
    if clust.id >= 0:
        # positive id means that this is a leaf
        return [clust.id]

    # check the right and left branches
    cl = []
    cr = []
    if clust.left is not None:
        cl = get_cluster_elements(clust.left)
    if clust.right is not None:
        cr = get_cluster_elements(clust.right)
    return cl + cr
def printclust(clust, labels=None, n=0):
    """Print the tree rooted at ``clust`` as an indented outline.

    Branch nodes (negative id) print as ``-``; leaves print their id, or
    ``labels[id]`` when ``labels`` is given. Each level is indented by two
    spaces.

    :param clust: node exposing ``id``, ``left`` and ``right``
    :param labels: optional sequence mapping leaf ids to display labels
    :param n: current depth, used for indentation
    """
    # indent to make a hierarchy layout; the prefix is built as a string so
    # one single-argument print works on both Python 2 and Python 3
    indent = '  ' * n
    if clust.id < 0:
        # negative id means that this is branch
        print('%s-' % indent)
    else:
        # positive id means that this is an endpoint
        if labels is None:
            print('%s%s' % (indent, clust.id))
        else:
            print('%s%s' % (indent, labels[clust.id]))

    # now print the right and left branches
    if clust.left is not None:
        printclust(clust.left, labels=labels, n=n + 1)
    if clust.right is not None:
        printclust(clust.right, labels=labels, n=n + 1)
def getheight(clust):
    """Return the number of endpoints (leaves) below ``clust``.

    :param clust: node exposing ``left`` and ``right``
    :return: 1 for an endpoint, otherwise the sum over both branches
    """
    # Is this an endpoint? Then the height is just 1
    if clust.left is None and clust.right is None:
        return 1

    # Otherwise the height is the sum of the heights of
    # each branch
    return getheight(clust.left) + getheight(clust.right)
def getdepth(clust):
    """Return the accumulated merge distance from ``clust`` to its deepest leaf.

    :param clust: node exposing ``left``, ``right`` and ``distance``
    :return: 0 for an endpoint, otherwise the larger branch depth plus the
        node's own ``distance``
    """
    # The distance of an endpoint is 0.0
    if clust.left is None and clust.right is None:
        return 0

    # The distance of a branch is the greater of its two sides
    # plus its own distance
    return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance