Logistic回归模型(C++代码实现)

Posted 晴堂

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Logistic回归模型(C++代码实现)相关的知识,希望对你有一定的参考价值。

Logistic回归主要针对输入的数据是多个,输出则是有限的数值型,多为2个分类。

涉及到以下方面:

1. 输出y = w0+w1*x1+w2*x2+..... (x1,x2,...是样本的属性值,为连续型的变量,w0,w1,w2,...为所要求的参数,y为有限的数值型变量,表示样本所属类别)。

2. logistic模型: 1/(1+exp(-z)),其中z= w0+w1*x1+w2*x2+..... 。

3.算法实现:

    w初始化为1;

    alph = 0.1; //设置步长,需根据情况逐步调整

    i = 0;

    while( i<样本数量)

          zi = w0+w1*xi1+w2*xi2+..... 

         h = 1/(1+exp(-zi));

         error = yi-h;

         while(...)

               wj = wj+alph *error*xij; // j表示第j个属性

          end

    end

以上算法过程在样本量比较小的时候可以实现,在样本量非常大的时候,需要考虑采用随机梯度下降法,即随机从总的样本中选出较小的样本集用于每轮迭代过程(可以查阅随机梯度下降的相关资料)。

本文主要采用了梯度下降法完成了参数值优化过程。以下程序主要实现第3节中描述的算法,包含 main.h 和 main.cpp 两个文件。

测试结果发现预测的准确率可以到80%左右。但感觉这和参数的调整有很大关系,样本量还是太小(总样本量198,训练集:150,测试集:48),这里比较简便,不包含校准数据集,另外结果存在一些欠拟合的现象。


main.h

/*************
Logistic Regression( logistic 回归 )using newton gradient descent

CopyRight 2016/8/21 xukaiwen
All Rights Reserved

**************/

#ifndef MAIN_H
#define MAIN_H

#include "stdio.h"
#include "stdlib.h"
#include "iostream"
#include "string"
#include "string.h"
#include <sstream>
#include <memory.h>

#include "math.h"

using namespace std;

// Maximum number of distinct class labels supported.
// BUG FIX: the original had a trailing ';' on the #define, which would
// inject a stray semicolon into any expression using the macro.
#define maxClassLabelNum 10
int curLabelNum = 0;


const double alph = 0.3;    // fixed step size (learning rate) for gradient descent
const int attriNum = 33;    // number of attributes per sample (wpbc.data)
const int sampleNum = 198;  // total number of samples
int trainNum = 140;         // samples [0, trainNum) train; the rest test

// One record from wpbc.data: attriNum numeric attributes plus a binary label.
struct DataSample
{
	double attriValue[attriNum]; // raw values; missing marked -1000, later normalized to [0,1]
	bool classLabel;             // 1 = 'R' (recur), 0 = 'N' (non-recur)
};

// Convert a numeric string to double.
// Uses strtod instead of the original stringstream round-trip: far lighter
// weight and no <sstream> dependency. Non-numeric input yields 0.0; callers
// pre-screen missing values ('?') before calling.
// Parameter is const-qualified (backward compatible: char* converts freely).
double StringTodouble(const char *src)
{
	return strtod(src, NULL);
}



// Read up to sampleNum comma-separated records from `file` into `data`.
// Line layout: id, class label ('R'/'N'), then attriNum numeric attributes.
// Missing attribute values ('?') are stored as the sentinel -1000.
// Returns 1 on success, -1 if the file cannot be opened.
int ReadData( DataSample* data, char *file)
{
	FILE *pFile = fopen(file, "rt");
	if (pFile == NULL)
	{
		printf("the data file is not existing: %s\n", file);
		return -1;
	}

	char buf[1024];
	const char delim[] = ","; // data delimiter
	int row = 0;              // data line

	// fgets-driven loop: avoids the classic while(!feof(...)) pattern that
	// processes a stale buffer once after the final line.
	while (row < sampleNum && fgets(buf, sizeof(buf), pFile) != NULL)
	{
		size_t len = strlen(buf);
		if (len > 0 && buf[len - 1] == '\n')
			buf[len - 1] = '\0';

		// Column 0 is an unused id, column 1 is the class label,
		// columns 2..attriNum+1 are the attribute values.
		char *tmpdata = NULL;
		for (int column = 0; column < (attriNum + 2); ++column)
		{
			tmpdata = (column == 0) ? strtok(buf, delim) : strtok(NULL, delim);
			if (tmpdata == NULL) // BUG FIX: short/malformed line no longer crashes
				break;

			if (column == 0)
			{
				continue; // skip the id
			}
			else if (column == 1)
			{
				data[row].classLabel = (tmpdata[0] == 'R') ? 1 : 0; // R:1; N:0
			}
			else
			{
				if (tmpdata[0] != '?') // '?' means the attribute value is missing
					data[row].attriValue[column - 2] = StringTodouble(tmpdata);
				else
					data[row].attriValue[column - 2] = -1000;
			}
		}
		++row;
	}

	fclose(pFile); // BUG FIX: the original leaked the FILE handle
	return 1;
}

// Min-max normalize every attribute column to [0,1]: (x - min) / (max - min).
// Missing values (sentinel -1000) are excluded from the min/max scan and are
// then mapped to 0. Assumes sample 0 has no missing values (seeds min/max).
void Normalize( DataSample* data )
{
	double atrriMinValue[attriNum];
	double atrriMaxValue[attriNum]; // for normalization (x-xmin)/(xmax-xmin)

	// Seed min/max from the first sample.
	for (int i = 0; i < attriNum; ++i)
	{
		atrriMinValue[i] = data[0].attriValue[i];
		atrriMaxValue[i] = data[0].attriValue[i];
	}

	for (int row = 1; row < sampleNum; ++row)
		for (int column = 0; column < attriNum; ++column)
		{
			// (value + 1000) > 0.0001 filters out the -1000 "missing" sentinel
			if (data[row].attriValue[column] > atrriMaxValue[column] && (data[row].attriValue[column] + 1000) > 0.0001)
				atrriMaxValue[column] = data[row].attriValue[column];

			if (data[row].attriValue[column] < atrriMinValue[column] && (data[row].attriValue[column] + 1000) > 0.0001)
				atrriMinValue[column] = data[row].attriValue[column];
		}

	// BUG FIX: this loop previously started at row = 1, leaving sample 0
	// un-normalized (raw magnitudes mixed with [0,1] values during training).
	for (int row = 0; row < sampleNum; ++row)
		for (int column = 0; column < attriNum; ++column)
		{
			double range = atrriMaxValue[column] - atrriMinValue[column];
			if ((data[row].attriValue[column] + 1000) > 0.0001)
				// Guard a constant column (range == 0) against divide-by-zero.
				data[row].attriValue[column] = (range > 0) ? (data[row].attriValue[column] - atrriMinValue[column]) / range : 0;
			else
				data[row].attriValue[column] = 0; // missing value -> 0
		}
}


//use newton gradient descent algorithm to get the w
//logistic model: 1/(1+exp(-z))
//class label
// Train logistic-regression weights with one stochastic-gradient-descent pass
// over the first trainNum samples (data is min-max normalized first).
// logisW must hold attriNum+1 doubles; index attriNum is the bias term.
// Model: h = sigmoid(w.x + b); update: w_j += alph * (y - h) * x_j.
void Logistic( DataSample* data, double *logisW )
{
	// Initialize every weight (including the bias) to 1.0.
	// Note: memset cannot be used here — it fills bytes, not doubles.
	for (int i = 0; i < (attriNum + 1); ++i)
		logisW[i] = 1.0;

	Normalize(data);

	for (int row = 0; row < trainNum; ++row)
	{
		// Linear score z = w.x + bias.
		double z = 0.0;
		for (int column = 0; column < attriNum; ++column)
			z += data[row].attriValue[column] * logisW[column];
		z += logisW[attriNum];

		double h = 1 / (1 + exp(-z)); // sigmoid
		double error = data[row].classLabel - h;

		for (int column = 0; column < attriNum; ++column)
			logisW[column] += error * alph * data[row].attriValue[column];
		// BUG FIX: the bias was assigned (=) instead of accumulated (+=),
		// discarding its initial value and every previous update.
		logisW[attriNum] += error * alph;
	}
}


// Classify one sample with the trained weights and compare against its true
// label. Returns 1 if the prediction matches, 0 otherwise.
bool Predict( DataSample sample, double *logisW )
{
	double z = 0.0;
	for (int column = 0; column < attriNum; ++column)
		z += sample.attriValue[column] * logisW[column];
	z += logisW[attriNum]; // bias

	// BUG FIX: the original compared the raw linear score against 0.5.
	// The logistic decision rule is sigmoid(z) > 0.5 (equivalently z > 0);
	// thresholding z itself at 0.5 misclassifies scores in (0, 0.5].
	double h = 1 / (1 + exp(-z));
	bool label = (h > 0.5) ? 1 : 0;

	return (label == sample.classLabel) ? 1 : 0;
}



#endif
main.cpp

/*************
Logistic Regression( logistic 回归 )using newton gradient descent

the Data:from UCI datalib named "wpbc.data"(that is about cancer )

CopyRight 2016/8/21 xukaiwen
All Rights Reserved

**************/

#include "main.h"

int main()

	char *file = "C:\\\\Users\\\\Administrator\\\\Desktop\\\\machine_learnning\\\\wpbc.data";
	DataSample *data = new DataSample[sampleNum];
	double *logisW = new double[attriNum+1];
	
	if( -1!=ReadData( data,file ) )
	
		Logistic( data,logisW );
	

	for(int i=0;i<(attriNum+1);++i)
	
		printf("%f\\t",logisW[i]);
	
	printf("\\n\\n");

	int correct = 0;
	int sum = 0;
	for(int i=trainNum;i<sampleNum; ++i)
	
		++sum;
		bool eva = Predict(data[i],logisW);
		if(eva)
			++correct;
	

	double rp = double(correct)/sum;
	printf("the right correction: %f\\n",rp);

	delete []data;
	delete []logisW;

	return 0;



以上是关于Logistic回归模型(C++代码实现)的主要内容,如果未能解决你的问题,请参考以下文章

Logistic回归模型和Python实现

Pytorch实现Logistic回归二分类

python逻辑回归(logistic regression LR) 底层代码实现 BGD梯度下降算法 softmax多分类

R语言广义线性模型Logistic回归案例代码

机器学习 —— 基础整理:线性回归;二项Logistic回归;Softmax回归;广义线性模型

逻辑回归模型(Logistic Regression)及Python实现