SAS 数据分析实例之数据描述预处理和抽样

Posted Sinsa_SI

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了SAS 数据分析实例之数据描述预处理和抽样相关的知识,希望对你有一定的参考价值。

1 描述统计

1.1 描述表属性

/* Describe SASHELP.CARS: restrict the ODS output of the next step to the
   attributes, variables, enginehost and directory tables, and save the
   CONTENTS details to WORK.TableAttributes so they can be listed below. */
ods noproctitle;
ods select attributes variables enginehost directory;

proc datasets lib=SASHELP;
    contents data=SASHELP.CARS
             order=collate
             out=WORK.TableAttributes (label="Contents Details for SASHELP.CARS");
quit;

/* List the CONTENTS output data set written by the step above. */
proc print data=WORK.TableAttributes;
run;

1.2 描述数据特征

1.2.1 分析分类变量

/* One-way frequency tables (with frequency plots) for the categorical
   variables of SASHELP.CARS; MISSING is requested on the TABLES statement. */
title "分类变量的频数";

proc freq data=SASHELP.CARS;
    tables Make Model Type Origin DriveTrain
           / missing plots=(freqplot);
run;

1.2.2 分析数值变量

/* Descriptive statistics (n, nmiss, min, mean, median, max, std) for the
   numeric variables of SASHELP.CARS. */
title "数值变量的描述性统计量";

proc means data=SASHELP.CARS
           n nmiss min mean median max std;
    var MSRP Invoice EngineSize Cylinders Horsepower
        MPG_City MPG_Highway Weight Wheelbase Length;
run;

/* Clear the title, then draw one histogram per numeric variable.
   NOPRINT is specified, so only the HISTOGRAM output is requested here. */
title;

proc univariate data=SASHELP.CARS noprint;
    histogram MSRP Invoice EngineSize Cylinders Horsepower
              MPG_City MPG_Highway Weight Wheelbase Length;
run;

1.3 描述缺失数据

/* Display formats used by the missing-data reports:
   - _nmissprint  maps every numeric value in low-high to "非缺失";
   - $_cmissprint maps a blank string to blank and anything else to "非缺失". */
ods noproctitle;

proc format;
    value _nmissprint
        low-high = "非缺失";
    value $_cmissprint
        " "   = " "
        other = "非缺失";
run;

/* Frequency of missing vs. non-missing values per variable. The formats
   collapse the values into the categories defined by _nmissprint. and
   $_cmissprint.; cumulative statistics are suppressed with NOCUM. */
proc freq data=SASHELP.CARS;
    title3 "缺失数据频数";
    title4 h=2 "图例: .、A、B,其他 = 缺失";

    format Make Model Type Origin DriveTrain $_cmissprint.;
    format MSRP Invoice EngineSize Cylinders Horsepower
           MPG_City MPG_Highway Weight Wheelbase Length _nmissprint.;

    tables Make Model Type Origin DriveTrain
           MSRP Invoice EngineSize Cylinders Horsepower
           MPG_City MPG_Highway Weight Wheelbase Length
           / nocum missing;
run;

/* Cross all fifteen variables (formatted by the missing-data formats) and
   write the resulting combinations with their counts to
   Work._MissingData_. NOPRINT: nothing is displayed by this step. */
proc freq data=SASHELP.CARS noprint;
    format Make Model Type Origin DriveTrain $_cmissprint.;
    format MSRP Invoice EngineSize Cylinders Horsepower
           MPG_City MPG_Highway Weight Wheelbase Length _nmissprint.;

    tables Make * Model * Type * Origin * DriveTrain
           * MSRP * Invoice * EngineSize * Cylinders * Horsepower
           * MPG_City * MPG_Highway * Weight * Wheelbase * Length
           / out=Work._MissingData_ missing;
run;

/* Print the missing-data patterns stored in Work._MissingData_, using the
   same display formats and labelled count/percent columns. */
proc print data=Work._MissingData_ label noobs;
    title3 "跨变量的缺失数据模式";
    title4 h=2 "图例: .、A、B,其他 = 缺失";

    label count="频数"
          percent="百分比";

    format Make Model Type Origin DriveTrain $_cmissprint.;
    format MSRP Invoice EngineSize Cylinders Horsepower
           MPG_City MPG_Highway Weight Wheelbase Length _nmissprint.;
run;

/* Reset the report titles set above. */
title3;

/* Remove the temporary pattern table. */
proc delete data=Work._MissingData_;
run;

2 数据预处理

2.1 列出数据

/* List SASHELP.BASEBALL by team: the data is first sorted by Team into a
   temporary table because the PRINT step uses BY Team. */
title1 "列出数据 - SASHELP.BASEBALL";

proc sort data=SASHELP.BASEBALL
          out=WORK.SORTTEMP;
    by Team;
run;

/* Print Position within each Team, with a Salary sum and the
   observation count (N option). */
proc print data=WORK.SORTTEMP n label;
    by Team;
    var Position;
    sum Salary;
run;

/* Drop the temporary sorted copy and clear the title. */
proc delete data=work.SORTTEMP;
run;

title1;

2.2 过滤数据

/* Keep the center fielders (Position = "CF") whose Salary is below 100.
   SAS orders missing numeric values below every number, so a bare
   "Salary LT 100" would also keep players whose Salary is missing;
   those rows are excluded explicitly. */
proc sql noprint;
    create table WORK.filter as
        select *
        from SASHELP.BASEBALL
        where Salary is not missing
          and Salary LT 100
          and Position EQ "CF";
quit;

2.3 排序数据

/* Copy SASHELP.BASEBALL to WORK.sortDS ordered by Salary, highest first.
   NOEQUALS is specified on the PROC SORT statement. */
proc sort data=SASHELP.BASEBALL
          out=WORK.sortDS
          noequals;
    by descending Salary;
run;

2.4 排名数据

/* Rank players by Salary in descending order; the rank is stored in the
   new variable rank_Salary of WORK.Rank. */
proc rank data=SASHELP.BASEBALL
          out=WORK.Rank
          descending;
    var Salary;
    ranks rank_Salary;
run;

2.5 转换数据

/* Add two transformed columns to a copy of SASHELP.BASEBALL:
     log_Salary = natural log of Salary
     inv_CrHits = reciprocal of CrHits
   Both transformations are guarded so that inputs outside their domain
   (non-positive or missing Salary, zero or missing CrHits) yield a missing
   result without raising invalid-argument or division-by-zero conditions
   in the log. */
data WORK.transform;
    set SASHELP.BASEBALL;

    /* LOG is only defined for positive arguments. */
    if Salary > 0 then log_Salary = log(Salary);
    else log_Salary = .;

    /* Avoid dividing by zero or by a missing value. */
    if not missing(CrHits) and CrHits ne 0 then inv_CrHits = 1 / CrHits;
    else inv_CrHits = .;
run;

2.6 标准化数据

/* Standardize Salary, nHits and nAtBat with METHOD=STD and write the
   result to WORK.Stdize. Standardized variables get the prefix
   "Standardized_" (SPREFIX=).
   NOTE(review): OPREFIX is given without a value, presumably to keep the
   original variables under their own names - confirm against PROC STDIZE
   documentation. */
proc stdize data=SASHELP.BASEBALL
            out=WORK.Stdize
            method=std
            nomiss
            oprefix
            sprefix=Standardized_;
    var Salary nHits nAtBat;
run;

2.7 重新编码值

/* Recode individual Salary values into _recodeVar_:
     100 -> 99999, 200 -> 88888, anything else is copied unchanged. */
data WORK.recodedValues;
    set SASHELP.BASEBALL;

    if Salary = 100 then _recodeVar_ = 99999;
    else if Salary = 200 then _recodeVar_ = 88888;
    else _recodeVar_ = Salary;
run;

2.8 重新编码范围

/* Recode a Salary range into _recodeVar_: every Salary in [-1, 100]
   becomes 100; all other values are copied unchanged. */
data WORK.recodedRanges;
    set SASHELP.BASEBALL;

    if -1 <= Salary <= 100 then _recodeVar_ = 100;
    else _recodeVar_ = Salary;
run;

3 随机抽样

3.1 简单随机抽样

3.1.1 无放回不重复抽样

/* Simple random sampling without replacement (METHOD=SRS): draw 30% of
   SASHELP.BASEBALL into WORK.RandomSample with a fixed seed so the draw
   is reproducible.
   No STRATA statement is used here: this is the un-stratified example,
   and STRATA would additionally require the input to be sorted by the
   strata variable, which SASHELP.BASEBALL is not. */
proc surveyselect data=SASHELP.BASEBALL
                  out=WORK.RandomSample
                  method=srs
                  samprate=0.3
                  seed=2019;
run;

3.1.2 有放回重复抽样

/* Simple random sampling with replacement (METHOD=URS): 30 draws from
   SASHELP.BASEBALL into WORK.RandomSample with a fixed seed. OUTHITS
   writes one output row per selection, so a unit drawn several times
   appears several times.
   No STRATA statement is used here: this is the un-stratified example,
   and STRATA would additionally require the input to be sorted by the
   strata variable, which SASHELP.BASEBALL is not. */
proc surveyselect data=SASHELP.BASEBALL
                  out=WORK.RandomSample
                  outhits
                  method=urs
                  sampsize=30
                  seed=2019;
run;

3.2 分层随机抽样

3.2.1 无放回不重复抽样

/* Stratified sampling without replacement.
   Step 1: sort a copy of the data by Position, the variable named on the
   STRATA statement below. */
proc sort data=SASHELP.BASEBALL
          out=WORK.SORTTempTableSorted;
    by Position;
run;

/* Step 2: METHOD=SRS at a 30% rate with a fixed seed, stratified by
   Position with ALLOC=PROP; the sample goes to WORK.RandomSample.
   NOTE(review): ALLOC=PROP is combined with SAMPRATE= here - confirm that
   this combination is accepted by the installed SAS/STAT release. */
proc surveyselect data=WORK.SORTTempTableSorted
                  out=WORK.RandomSample
                  method=srs
                  samprate=0.3
                  seed=2019;
    strata Position / alloc=prop;
run;

3.2.2 有放回重复抽样

/* Stratified sampling with replacement.
   Step 1: sort a copy of the data by Position, the variable named on the
   STRATA statement below. */
proc sort data=SASHELP.BASEBALL
          out=WORK.SORTTempTableSorted;
    by Position;
run;

/* Step 2: METHOD=URS with a total sample size of 30 and a fixed seed,
   stratified by Position with ALLOC=PROP. OUTHITS is specified, and the
   sample goes to WORK.RandomSample. */
proc surveyselect data=WORK.SORTTempTableSorted
                  out=WORK.RandomSample
                  outhits
                  method=urs
                  sampsize=30
                  seed=2019;
    strata Position / alloc=prop;
run;

以上是关于SAS 数据分析实例之数据描述预处理和抽样的主要内容,如果未能解决你的问题,请参考以下文章:

6-机器学习-样本类别分布不均衡处理之过抽样和欠抽样

sas如何计算相邻观测的差值

数据采集及预处理

你,值得拥有的那些数据挖掘工具

SAS转型,再造云原生数据分析之王

过抽样 欠抽样