EZ Study

Actuarial Biology Chemistry Economics Calculators Confucius Engineer
Physics
C.S.

Automation for Data Exploring in data mining

Question: Data exploring is the first step in data mining and modeling. How do
we get a rough picture of the data? How many character/numercial variables? What's the distribution of those variables? Any strong correlation between some variables?

Answer: Here is a SAS macro to get the distribution/correlation of all variables.

%macro explore(data);
proc contents noprint data=&data.
out=vars(keep=name type length varnum format label) varnum; run;

proc sql; select count(*) into: num_of_num_vars from vars where type=1;

proc sql noprint;
select name into: num_var_1-:num_var_%eval(&num_of_num_vars.)

from vars where type=1; quit;

proc sql; select count(*) into: num_of_cha_vars from vars where type=2;

proc sql noprint;
select name into: cha_var_1-:cha_var_%eval(&num_of_cha_vars.)
from vars where type=2; quit;

data testa; length var \$80. var_case \$200.;
format percent percent8.2;
var=''; var_case=''; count=.; percent=.; run;

data testb; length var \$200.;
var=''; Non_missing_cases=.; missing_cases=.;
Average=.; lower_95=.; upper_95=.;
min_1=.;min_2=.;min_3=.;min_4=.;min_5=.;
max_1=.;max_2=.;max_3=.;max_4=.;max_5=.; run;

data testc; length cha_var1 \$80.
cha_var1_cases \$200. cha_var2 \$80. cha_var2_cases \$200.;
format percent percent8.2; cha_var1=''; cha_var2='';
cha_var1_cases=''; cha_var2_cases=''; count=.; percent=.; run;

/***************************************************/
%macro cha_explore(data);
proc sql noprint; drop table output_freq; quit;
%do i=1 %to &num_of_cha_vars.;
proc sql noprint;
create table freq1 as select distinct &&cha_var_&i., count(*) as count
from &data. group by &&cha_var_&i. order by count desc ;quit;

proc sql; select count(*) into: cases from freq1; quit;

proc sql noprint; create table freq2a(where=(percent>0.01))
as select *,count/sum(count) as percent format percent8.1
from freq1 order by count desc ;quit;

data freq2(rename=(&&cha_var_&i.=var_case));
set freq2a; var="&&cha_var_&i."; run;

proc append base=Output_freq data=testa force; run;
proc append base=Output_freq data=freq2 force; run;
%end; %mend;
%cha_explore(&data.);

%macro corr(data);
proc sql noprint; drop table Output_corr; quit;
%do i=1 %to (&num_of_cha_vars.-1);
%do j=(&i.+1) %to (&num_of_cha_vars.);

%let var1=&&cha_var_&i.; %let var2=&&cha_var_&j.;
proc sql; create table corr1 as select distinct "&var1." as Cha_var1,
&var1. as Cha_var1_cases, "&var2." as Cha_var2,
&var2. as Cha_var2_cases, count(&var1.) as count
from &data. group by Cha_var1_cases, Cha_var2_cases
order by count desc ;quit;
proc sql noprint; select count(*) into: cases from corr1; quit;
proc sql noprint; create table corr2(where=(count>1))
as select *,count/sum(count) as percent format percent8.1
from corr1 order by count desc ;quit;

proc append base=Output_corr data=testc force; run;
proc append base=Output_corr data=corr2 force; run;
%end; %end; %mend;
%corr(&data.);

%macro num_explore(data);
proc sql noprint; drop table Output_mean; quit;
%do i=1 %to &num_of_num_vars.;
proc means data=&data. noprint; var &&num_var_&i. ;
output out=mean1(drop=_type_ rename=(_freq_=Non_missing_cases))
nmiss=missing_cases mean=Average lclm=lower_95 uclm=upper_95
min=min_case max=max_case
idgroup( max(&&num_var_&i.) out[5] (&&num_var_&i.)=max )
idgroup( min(&&num_var_&i.) out[5] (&&num_var_&i.)=min )
/autolabel autoname
; run;

data mean2 ; retain var ; set mean1; var="&&num_var_&i."; run;

proc append base=Output_mean data=testb force; run;
proc append base=Output_mean data=mean2 force; run;
%end; %mend;
%num_explore(&data.);

%mend;

Copy the above code into your sas, and run the macro %explore(your dataset) to get all the output datesets(Output_Freq, Output_Mean, Output_Corr) you needed.