Logistic回归模型(C++代码实现)
Posted 晴堂
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Logistic回归模型(C++代码实现)相关的知识,希望对你有一定的参考价值。
Logistic回归主要针对输入的数据是多个,输出则是有限的数值型,多为2个分类。
涉及到以下方面:
1. 输出y = w0+w1*x1+w2*x2+..... (x1,x2,...是样本的属性值,为连续型的变量,w0,w1,w2,...为所要求的参数,y为有限的数值型变量,表示样本所属类别)。
2. logistic模型: 1/(1+exp(-z)),其中z= w0+w1*x1+w2*x2+..... 。
3.算法实现:
w初始化为1;
alph = 0.1; //设置步长,需根据情况逐步调整
i = 0;
while( i<样本数量)
zi = w0+w1*xi1+w2*xi2+.....
h = 1/(1+exp(-zi));
error = yi-h;
while(...)
wj = wj+alph *error*xij; // j表示第j个属性
end
end
以上算法过程在样本量比较小的时候可以实现,在样本量非常大的时候,需要考虑采用随机梯度下降法,即随机从总的样本的选出小的样本集来用于迭代过程(可以百度相关资料)。
本文主要采用了梯度下降法完成了参数值优化过程。以下程序主要将第3节中描述的算法实现,主要包含 main.h 和 main.cpp 两个文件。
测试结果发现预测的准确率可以到80%左右。但感觉这和参数的调整有很大关系,样本量还是太小(总样本量198,训练集:140,测试集:58),这里比较简便,不包含校准数据集,另外结果存在一些欠拟合的现象。
main.h
/*************
Logistic Regression( logistic 回归 ) using stochastic gradient descent
CopyRight 2016/8/21 xukaiwen
All Rights Reserved
**************/
#ifndef MAIN_H
#define MAIN_H
#include "stdio.h"
#include "stdlib.h"
#include "iostream"
#include "string"
#include "string.h"
#include <sstream>
#include <memory.h>
#include "math.h"
using namespace std;
// ---- Experiment configuration and data-set dimensions (wpbc.data) ----
// BUG FIX: the original macro had a trailing ';' which would expand to "10;"
// and break any expression that used it.
#define maxClassLabelNum 10

int curLabelNum = 0;         // current number of class labels (unused in this file)
const double alph = 0.3;     // fixed step size for the gradient-descent update
const int attriNum = 33;     // number of attributes per sample
const int sampleNum = 198;   // total number of samples in wpbc.data
int trainNum = 140;          // samples [0, trainNum) train; [trainNum, sampleNum) test

// One data sample: attriNum continuous attribute values plus a binary label.
struct DataSample
{
    double attriValue[attriNum]; // scaled to [0,1] by Normalize(); missing values become 0
    bool classLabel;             // true: 'R'; false: 'N' (see ReadData)
};
// Convert a C string to a double via a string stream.
// Returns 0.0 when the token does not parse as a number (the original left
// the result uninitialized in that case). Callers guarantee the token is
// numeric or '?', and '?' is filtered out before this is called.
double StringTodouble(char *src)
{
    double value = 0.0;
    std::stringstream converter;
    converter << src;
    converter >> value;
    return value;
}
// Read up to sampleNum comma-separated samples from `file` into `data`.
// Each line: id, class label ('R'/'N'), then attriNum attribute values.
// A '?' attribute marks a missing value and is stored as the sentinel -1000
// (resolved later by Normalize()).
// Returns 1 on success, -1 if the file cannot be opened.
int ReadData(DataSample *data, char *file)
{
    FILE *pFile = fopen(file, "rt");
    if (pFile == NULL)
    {
        printf("the data file is not existing: %s\n", file);
        return -1;
    }

    char buf[1024];
    const char delim[] = ","; // field separator
    int row = 0;              // current sample index
    while (!feof(pFile) && row < sampleNum)
    {
        buf[0] = '\0';
        if (fgets(buf, sizeof(buf), pFile) == NULL)
            break; // EOF or read error with no data
        size_t len = strlen(buf);
        if (len > 0 && buf[len - 1] == '\n')
            buf[len - 1] = '\0'; // strip trailing newline
        if (buf[0] == '\0')
            continue; // skip blank lines

        // Column 0 is an unused id, column 1 is the class label,
        // columns 2..attriNum+1 hold the attribute values.
        for (int column = 0; column < (attriNum + 2); ++column)
        {
            char *token = strtok(column == 0 ? buf : NULL, delim);
            if (token == NULL)
                break; // malformed line: fewer fields than expected
            if (column == 0)
                continue; // skip the id column
            if (column == 1)
                data[row].classLabel = (token[0] == 'R'); // R:1; N:0
            else if (token[0] != '?')                     // '?' means a missing value
                data[row].attriValue[column - 2] = StringTodouble(token);
            else
                data[row].attriValue[column - 2] = -1000; // missing-value sentinel
        }
        ++row;
    }
    fclose(pFile); // BUG FIX: the original leaked the FILE handle
    return 1;
}
// Min-max normalize every attribute to [0,1]: x -> (x - min) / (max - min).
// Missing values (sentinel -1000) are excluded from the min/max scan and
// replaced by 0 afterwards.
void Normalize(DataSample *data)
{
    double atrriMinValue[attriNum];
    double atrriMaxValue[attriNum];

    // Seed min/max from the first sample (assumed to have no missing values),
    // then scan the remaining samples, skipping the missing-value sentinel.
    for (int i = 0; i < attriNum; ++i)
    {
        atrriMinValue[i] = data[0].attriValue[i];
        atrriMaxValue[i] = data[0].attriValue[i];
    }
    for (int row = 1; row < sampleNum; ++row)
    {
        for (int column = 0; column < attriNum; ++column)
        {
            double v = data[row].attriValue[column];
            if (v + 1000 <= 0.0001)
                continue; // missing value: ignore in the min/max scan
            if (v > atrriMaxValue[column])
                atrriMaxValue[column] = v;
            if (v < atrriMinValue[column])
                atrriMinValue[column] = v;
        }
    }

    // BUG FIX: the original loop started at row 1, leaving sample 0 unnormalized.
    for (int row = 0; row < sampleNum; ++row)
    {
        for (int column = 0; column < attriNum; ++column)
        {
            if (data[row].attriValue[column] + 1000 > 0.0001)
            {
                double range = atrriMaxValue[column] - atrriMinValue[column];
                // Guard against a constant column (max == min) to avoid 0/0.
                data[row].attriValue[column] = (range > 0.0)
                    ? (data[row].attriValue[column] - atrriMinValue[column]) / range
                    : 0.0;
            }
            else
            {
                data[row].attriValue[column] = 0; // missing value -> 0
            }
        }
    }
}
//use newton gradient descent algorithm to get the w
//logistic model: 1/(1+exp(-z))
//class label
// Train logistic-regression weights with one pass of stochastic gradient
// descent over the first trainNum samples.
// Model: h = sigmoid(w.x + b); update rule: w_j += alph * (y - h) * x_j.
// logisW holds attriNum weights followed by the bias at index attriNum;
// all are initialized to 1.0 here, so callers need not pre-fill the array.
void Logistic(DataSample *data, double *logisW)
{
    for (int i = 0; i < (attriNum + 1); ++i)
        logisW[i] = 1.0; // initialize all weights and the bias

    Normalize(data);

    for (int row = 0; row < trainNum; ++row)
    {
        // z = w.x + b
        double z = 0.0;
        for (int column = 0; column < attriNum; ++column)
            z += data[row].attriValue[column] * logisW[column];
        z += logisW[attriNum]; // bias term (its input is fixed at 1)

        double h = 1 / (1 + exp(-z)); // logistic response
        double error = data[row].classLabel - h;

        for (int column = 0; column < attriNum; ++column)
            logisW[column] += error * alph * data[row].attriValue[column];
        // BUG FIX: the original used '=' here, overwriting the bias every
        // iteration instead of accumulating its gradient updates.
        logisW[attriNum] += error * alph;
    }
}
// Classify one sample with the trained weights and compare the prediction
// against the sample's true label.
// Returns 1 (true) when the prediction matches sample.classLabel, else 0.
bool Predict(DataSample sample, double *logisW)
{
    // z = w.x + b
    double z = 0.0;
    for (int column = 0; column < attriNum; ++column)
        z += sample.attriValue[column] * logisW[column];
    z += logisW[attriNum];

    // BUG FIX: the original compared the raw linear score against 0.5.
    // For a logistic model the decision rule is sigmoid(z) > 0.5, which is
    // equivalent to z > 0.
    bool label = (z > 0);
    return label == sample.classLabel;
}
#endif
main.cpp
/*************
Logistic Regression( logistic 回归 ) using stochastic gradient descent
the Data:from UCI datalib named "wpbc.data"(that is about cancer )
CopyRight 2016/8/21 xukaiwen
All Rights Reserved
**************/
#include "main.h"
int main()
char *file = "C:\\\\Users\\\\Administrator\\\\Desktop\\\\machine_learnning\\\\wpbc.data";
DataSample *data = new DataSample[sampleNum];
double *logisW = new double[attriNum+1];
if( -1!=ReadData( data,file ) )
Logistic( data,logisW );
for(int i=0;i<(attriNum+1);++i)
printf("%f\\t",logisW[i]);
printf("\\n\\n");
int correct = 0;
int sum = 0;
for(int i=trainNum;i<sampleNum; ++i)
++sum;
bool eva = Predict(data[i],logisW);
if(eva)
++correct;
double rp = double(correct)/sum;
printf("the right correction: %f\\n",rp);
delete []data;
delete []logisW;
return 0;
以上是关于Logistic回归模型(C++代码实现)的主要内容,如果未能解决你的问题,请参考以下文章
python逻辑回归(logistic regression LR) 底层代码实现 BGD梯度下降算法 softmax多分类