EZ Study

Actuarial Biology Chemistry Economics Calculators Confucius Engineer
Physics
C.S.

Automation for Data Exploring in data mining

Question: Data exploring is the first step in data mining and modeling. How do
we get a rough picture of the data? How many character/numercial variables? What's the distribution of those variables? Any strong correlation between some variables?

Answer: Here is a SAS macro to get the distribution/correlation of all variables.

%macro explore(data);
        proc contents noprint data=&data.
        out=vars(keep=name type length varnum format label) varnum; run;

        proc sql; select count(*) into: num_of_num_vars from vars where type=1;

        proc sql noprint;
        select name into: num_var_1-:num_var_%eval(&num_of_num_vars.)

        from vars where type=1; quit;

        proc sql; select count(*) into: num_of_cha_vars from vars where type=2;

        proc sql noprint;
        select name into: cha_var_1-:cha_var_%eval(&num_of_cha_vars.)
        from vars where type=2; quit;

        data testa; length var $80. var_case $200.;
        format percent percent8.2;
        var=''; var_case=''; count=.; percent=.; run;

        data testb; length var $200.;
        var=''; Non_missing_cases=.; missing_cases=.;
        Average=.; lower_95=.; upper_95=.;
        min_1=.;min_2=.;min_3=.;min_4=.;min_5=.;
        max_1=.;max_2=.;max_3=.;max_4=.;max_5=.; run;

        data testc; length cha_var1 $80.
        cha_var1_cases $200. cha_var2 $80. cha_var2_cases $200.;
        format percent percent8.2; cha_var1=''; cha_var2='';
        cha_var1_cases=''; cha_var2_cases=''; count=.; percent=.; run;

        /***************************************************/
        %macro cha_explore(data);
        proc sql noprint; drop table output_freq; quit;
        %do i=1 %to &num_of_cha_vars.;
        proc sql noprint;
        create table freq1 as select distinct &&cha_var_&i., count(*) as count
        from &data. group by &&cha_var_&i. order by count desc ;quit;

        proc sql; select count(*) into: cases from freq1; quit;

        proc sql noprint; create table freq2a(where=(percent>0.01))
        as select *,count/sum(count) as percent format percent8.1
        from freq1 order by count desc ;quit;

        data freq2(rename=(&&cha_var_&i.=var_case));
        set freq2a; var="&&cha_var_&i."; run;

        proc append base=Output_freq data=testa force; run;
        proc append base=Output_freq data=freq2 force; run;
        %end; %mend;
        %cha_explore(&data.);

        %macro corr(data);
        proc sql noprint; drop table Output_corr; quit;
        %do i=1 %to (&num_of_cha_vars.-1);
        %do j=(&i.+1) %to (&num_of_cha_vars.);

        %let var1=&&cha_var_&i.; %let var2=&&cha_var_&j.;
        proc sql; create table corr1 as select distinct "&var1." as Cha_var1,
        &var1. as Cha_var1_cases, "&var2." as Cha_var2,
        &var2. as Cha_var2_cases, count(&var1.) as count
        from &data. group by Cha_var1_cases, Cha_var2_cases
        order by count desc ;quit;
        proc sql noprint; select count(*) into: cases from corr1; quit;
        proc sql noprint; create table corr2(where=(count>1))
        as select *,count/sum(count) as percent format percent8.1
        from corr1 order by count desc ;quit;

        proc append base=Output_corr data=testc force; run;
        proc append base=Output_corr data=corr2 force; run;
        %end; %end; %mend;
        %corr(&data.);

        %macro num_explore(data);
        proc sql noprint; drop table Output_mean; quit;
        %do i=1 %to &num_of_num_vars.;
        proc means data=&data. noprint; var &&num_var_&i. ;
        output out=mean1(drop=_type_ rename=(_freq_=Non_missing_cases))
        nmiss=missing_cases mean=Average lclm=lower_95 uclm=upper_95
        min=min_case max=max_case
        idgroup( max(&&num_var_&i.) out[5] (&&num_var_&i.)=max )
        idgroup( min(&&num_var_&i.) out[5] (&&num_var_&i.)=min )
        /autolabel autoname
; run;

        data mean2 ; retain var ; set mean1; var="&&num_var_&i."; run;

        proc append base=Output_mean data=testb force; run;
        proc append base=Output_mean data=mean2 force; run;
        %end; %mend;
        %num_explore(&data.);

%mend;

Copy the above code into your sas, and run the macro %explore(your dataset) to get all the output datesets(Output_Freq, Output_Mean, Output_Corr) you needed.

Related links:
Continue to: How to send email in SAS part-I   SAS tutorial home
Back to: How to format Excel file by SAS DDE?   Statistics tutorial home