SAS 数据分析实例之数据描述、预处理和抽样
Posted by Sinsa_SI
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了SAS 数据分析实例之数据描述预处理和抽样相关的知识,希望对你有一定的参考价值。
1 描述统计
1.1 描述表属性
/* Describe SASHELP.CARS: show only the attributes, variables, engine/host
   and directory output objects, and save the CONTENTS details to
   WORK.TableAttributes. */
ods noproctitle;
ods select attributes variables enginehost directory;

proc datasets lib=SASHELP;
    contents
        data  = SASHELP.CARS
        order = collate
        out   = WORK.TableAttributes (label="Contents Details for SASHELP.CARS");
quit;

/* No DATA= is given, so PROC PRINT lists the most recently created data set
   (expected to be WORK.TableAttributes from the step above). */
proc print;
run;
1.2 描述数据特征
1.2.1 分析分类变量
/* One-way frequency tables (with frequency plots) for the character
   variables of SASHELP.CARS; MISSING makes missing values a counted level. */
title "分类变量的频数";

proc freq data=SASHELP.CARS;
    tables Make Model Type Origin DriveTrain
           / missing plots=(freqplot);
run;
1.2.2 分析数值变量
/* Summary statistics for the numeric variables of SASHELP.CARS. */
title "数值变量的描述性统计量";

proc means data=SASHELP.CARS n nmiss min mean median max std;
    var MSRP Invoice
        EngineSize Cylinders Horsepower
        MPG_City MPG_Highway
        Weight Wheelbase Length;
run;

/* Clear the title before drawing the histograms. */
title;

/* NOPRINT suppresses the tabular output; the HISTOGRAM statement requests
   one histogram per listed variable. */
proc univariate data=SASHELP.CARS noprint;
    histogram MSRP Invoice
              EngineSize Cylinders Horsepower
              MPG_City MPG_Highway
              Weight Wheelbase Length;
run;
1.3 描述缺失数据
/* Formats used to collapse values into "missing" vs. "not missing":
     $_cmissprint. - a blank character value stays blank, anything else
                     is shown with the non-missing label;
     _nmissprint.  - every value in the LOW-HIGH range is shown with the
                     non-missing label (numeric missing values fall outside
                     that range and keep their own display). */
ods noproctitle;

proc format;
    value $_cmissprint
        " "   = " "
        other = "非缺失";

    value _nmissprint
        low-high = "非缺失";
run;
/* Per-variable frequency of missing vs. non-missing values.
   The formats group the raw values, so each table has at most one row per
   missing category plus one non-missing row. */
proc freq data=SASHELP.CARS;
    title3 "缺失数据频数";
    title4 h=2 "图例: .、A、B,其他 = 缺失";

    format
        Make Model Type Origin DriveTrain
            $_cmissprint.
        MSRP Invoice EngineSize Cylinders Horsepower
        MPG_City MPG_Highway Weight Wheelbase Length
            _nmissprint.;

    tables
        Make Model Type Origin DriveTrain
        MSRP Invoice EngineSize Cylinders Horsepower
        MPG_City MPG_Highway Weight Wheelbase Length
        / nocum missing;
run;
/* Build the cross-variable missing-data pattern: a single multi-way table
   over all variables, grouped by the missing/non-missing formats, written
   to Work._MissingData_ instead of being printed (NOPRINT). */
proc freq data=SASHELP.CARS noprint;
    format
        Make Model Type Origin DriveTrain
            $_cmissprint.
        MSRP Invoice EngineSize Cylinders Horsepower
        MPG_City MPG_Highway Weight Wheelbase Length
            _nmissprint.;

    tables
        Make * Model * Type * Origin * DriveTrain
        * MSRP * Invoice * EngineSize * Cylinders * Horsepower
        * MPG_City * MPG_Highway * Weight * Wheelbase * Length
        / out=Work._MissingData_ missing;
run;
/* Report the missing-data patterns stored in Work._MissingData_, using the
   same missing/non-missing formats and labelled count/percent columns. */
proc print data=Work._MissingData_ label noobs;
    title3 "跨变量的缺失数据模式";
    title4 h=2 "图例: .、A、B,其他 = 缺失";

    label
        count   = "频数"
        percent = "百分比";

    format
        Make Model Type Origin DriveTrain
            $_cmissprint.
        MSRP Invoice EngineSize Cylinders Horsepower
        MPG_City MPG_Highway Weight Wheelbase Length
            _nmissprint.;
run;

/* Reset title3 (and the higher-numbered title4 with it). */
title3;

/* Remove the temporary pattern table. */
proc delete data=Work._MissingData_;
run;
2 数据预处理
2.1 列出数据
/* List Position per Team from SASHELP.BASEBALL with a Salary total.
   BY-group processing needs the rows ordered by Team, so a sorted
   temporary copy is created first and dropped afterwards. */
title1 "列出数据 - SASHELP.BASEBALL";

proc sort data=SASHELP.BASEBALL
          out=WORK.SORTTEMP;
    by Team;
run;

proc print data=WORK.SORTTEMP n label;
    by Team;
    var Position;
    sum Salary;
run;

/* Clean up the temporary sorted copy. */
proc delete data=work.SORTTEMP;
run;

title1;
2.2 过滤数据
/* Filter SASHELP.BASEBALL into WORK.filter: rows with Position equal to "CF"
   and a known Salary below 100.

   SAS orders missing numeric values below every number, so a bare
   "Salary LT 100" would also keep rows whose Salary is missing. The explicit
   IS NOT MISSING test restricts the result to rows with a real salary. */
proc sql noprint;
    create table WORK.filter as
        select *
        from SASHELP.BASEBALL
        where Salary is not missing
          and Salary LT 100
          and Position EQ "CF";
quit;
2.3 排序数据
/* Copy SASHELP.BASEBALL to WORK.sortDS ordered by Salary, highest first.
   NOEQUALS: the relative order of rows with equal Salary need not be kept. */
proc sort noequals
          data=SASHELP.BASEBALL
          out=WORK.sortDS;
    by descending Salary;
run;
2.4 排名数据
/* Rank players by Salary from largest to smallest (DESCENDING) and store
   the rank in the new variable rank_Salary of WORK.Rank. */
proc rank data=SASHELP.BASEBALL
          out=WORK.Rank
          descending;
    var   Salary;
    ranks rank_Salary;
run;
2.5 转换数据
/* Add two derived columns to a copy of SASHELP.BASEBALL:
     log_Salary - natural log of Salary
     inv_CrHits - reciprocal of CrHits

   Both transformations are guarded so that inputs outside their domain
   (missing or non-positive Salary, missing or zero CrHits) produce a plain
   missing value instead of "invalid argument" / "division by zero" notes
   and an _ERROR_ flag in the log. Valid inputs give the same results as the
   unguarded expressions. */
data WORK.transform;
    set SASHELP.BASEBALL;

    /* LOG is only defined for strictly positive arguments. */
    if Salary > 0 then log_Salary = log(Salary);
    else log_Salary = .;

    /* Avoid dividing by zero or by a missing value. */
    if not missing(CrHits) and CrHits ne 0 then inv_CrHits = 1 / CrHits;
    else inv_CrHits = .;
run;
2.6 标准化数据
/* Standardize Salary, nHits and nAtBat with METHOD=STD and write the result
   to WORK.Stdize. SPREFIX= gives the standardized columns the prefix
   Standardized_; OPREFIX is specified without a value. */
proc stdize data=SASHELP.BASEBALL
            out=WORK.Stdize
            method=std
            nomiss
            oprefix
            sprefix=Standardized_;
    var Salary nHits nAtBat;
run;
2.7 重新编码值
/* Recode individual Salary values into _recodeVar_:
     100 -> 99999, 200 -> 88888, anything else is copied unchanged. */
data WORK.recodedValues;
    set SASHELP.BASEBALL;

    if Salary = 100 then _recodeVar_ = 99999;
    else if Salary = 200 then _recodeVar_ = 88888;
    else _recodeVar_ = Salary;
run;
2.8 重新编码范围
/* Recode a Salary range into _recodeVar_: every value in [-1, 100] becomes
   100; values outside the range (including missing) are copied unchanged. */
data WORK.recodedRanges;
    set SASHELP.BASEBALL;

    if -1 <= Salary <= 100 then _recodeVar_ = 100;
    else _recodeVar_ = Salary;
run;
3 随机抽样
3.1 简单随机抽样
3.1.1 无放回不重复抽样
/* Simple random sample without replacement (METHOD=SRS): draw 30% of the
   rows of SASHELP.BASEBALL into WORK.RandomSample, reproducibly (SEED=2019).

   No STRATA statement is used here: a STRATA statement would turn this into
   stratified sampling and requires the input to be sorted by the strata
   variable, which SASHELP.BASEBALL is not guaranteed to be. */
proc surveyselect data=SASHELP.BASEBALL out=WORK.RandomSample
        method=srs samprate=0.3 seed=2019;
run;
3.1.2 有放回重复抽样
/* Simple random sample with replacement (METHOD=URS): 30 draws from
   SASHELP.BASEBALL into WORK.RandomSample, reproducibly (SEED=2019).
   OUTHITS writes one output row per draw, so a row selected several times
   appears several times.

   No STRATA statement is used here: a STRATA statement would turn this into
   stratified sampling and requires the input to be sorted by the strata
   variable, which SASHELP.BASEBALL is not guaranteed to be. */
proc surveyselect data=SASHELP.BASEBALL out=WORK.RandomSample
        outhits method=urs sampsize=30 seed=2019;
run;
3.2 分层随机抽样
3.2.1 无放回不重复抽样
/* Stratified sampling without replacement.
   Step 1: order the data by the stratum variable Position. */
proc sort data=SASHELP.BASEBALL
          out=WORK.SORTTempTableSorted;
    by Position;
run;

/* Step 2: draw a 30% simple random sample (METHOD=SRS) with Position as the
   stratum variable and proportional allocation (ALLOC=PROP). */
proc surveyselect
        data=WORK.SORTTempTableSorted
        out=WORK.RandomSample
        seed=2019
        method=srs
        samprate=0.3;
    strata Position / alloc=prop;
run;
3.2.2 有放回重复抽样
/* Stratified sampling with replacement.
   Step 1: order the data by the stratum variable Position. */
proc sort data=SASHELP.BASEBALL
          out=WORK.SORTTempTableSorted;
    by Position;
run;

/* Step 2: draw a total of 30 units with replacement (METHOD=URS), allocated
   proportionally across the Position strata (ALLOC=PROP). OUTHITS is
   specified so repeated selections are written as separate rows. */
proc surveyselect
        data=WORK.SORTTempTableSorted
        out=WORK.RandomSample
        seed=2019
        method=urs
        sampsize=30
        outhits;
    strata Position / alloc=prop;
run;
以上是关于SAS 数据分析实例之数据描述预处理和抽样的主要内容,如果未能解决你的问题,请参考以下文章: