BookmarkSubscribeRSS Feed
sai_prasad
Calcite | Level 5
options mlogic mprint symbolgen;

%let libpath=EDA;
%let rdata=rollupfinalEDA;

proc contents data=&libpath..&rdata out=cont1(keep=NAME TYPE LENGTH VARNUM);
run;

proc sort data=cont1; by VARNUM;
run;

ods output nlevels=checkfreq;

proc freq data=&libpath..&rdata nlevels;
tables _all_ /noprint; run;

ods output close;

data checkfreq;
set checkfreq;
varcnt+1;run;

data &libpath..contentfile;
merge cont1 (rename=(VARNUM=varcnt) rename=(Name=TableVar)) checkfreq;
by varcnt;
run;

proc print data=&libpath..contentfile;
run;



proc sql noprint;
select count(*) into: NOBS
from &libpath..contentfile
;
quit;


%macro stm();
%let dsId=%Sysfunc(open(&libpath..contentfile));
%let cvar=%Sysfunc(varnum(&dsId.,TableVar));
%let I=1;
create table stat_missing as
select
%do %while (%Sysfunc(fetch(&dsId.)) = 0);
%let var = %Sysfunc(getvarc(&dsId.,&cvar));
%if &I ne &NOBS %then coalesce(sum(missing(&var.)),0) as &var ,;
%else coalesce(sum(missing(&var.)),0) as &var;

%let I= %eval(&I + 1);

%end;

from &libpath..&rdata;
%let RC=%sysfunc(close(&DsId));

%mend;


%macro stc();
%let dsId=%Sysfunc(open(&libpath..contentfile));
%let cvar=%Sysfunc(varnum(&dsId.,TableVar));
%let I=1;

create table stat_nonmissing as
select
%do %while (%Sysfunc(fetch(&dsId.))=0);
%let var=%Sysfunc(getvarc(&dsId.,&cvar));
%if &I ne &NOBS &then count(&var.) as &var,;
%else count(&var.) as &var;
%let I=%eval(&I+1);
%end;

from &libpath..&rdata;

%let RC=%sysfunc(close(&DsId));

%mend;

proc sql;
%stm();
%stc();
quit();

proc transpose data=stat_missing out=stat_trans_missing Name=Tablevar prefix=Missing;
proc transpose data=stat_nonmissing out=stat_trans_nonmissing Name=Tablevar prefix=Non_Missing;

proc sort data=&libpath..contentfile; by Tablevar;
proc sort data=stat_trans_missing; by Tablevar;
proc sort data=stat_trans_nonmissing; by Tablevar;

data &libpath..final_content;
merge &libpath..contentfile stat_trans_missing stat_trans_nonmissing;
by tablevar;

proc sort data=&libpath..final_content; by varcnt;
run;

proc sql noprint;

select Tablevar into : varmore separated by ''
from &libpath..contentfile where TYPE=1;

select count(TableVar) into :vmcnt
from &libpath..contentfile where TYPE=1;
quit;

%let vmcnt=&vmcnt;


proc sql noprint;

select TableVar into :v1-:v&vmcnt
from &libpath..contentfile where TYPE=1;

select count(varcnt) into :varcount
from &libpath..contentfile where TYPE=2;

%let varcount=&varcount;

select tablevar into : x1-:x&varcount
from &libpath..contentfile where TYPE=2;

select count(*) into :obscnt
from &libpath..&rdata.;
quit;

%macro stkorig;

%do i=1 %to &vmcnt;

data v&i;
length NAME $32.;
set &libpath..&rdata(keep=&&v&i rename=(&&v&i=origvalue));
NAME="&&v&i";
format NAME $32.;
attrib _all_ label='';
run;

proc univariate data=v&i noprint;
var origvalue;
output out=vstats&i min=Minimum max=Maximum mean=Mean std=Standard_Deviation skewness=Skewness kurtosis=Kurtosis
pctlpts=0.01 0.05 0.1 0.5 1 5 10 25 50 75 90 95 99 99.5 99.9 99.95 99.99
pctlpre=pct_;
run;

data vstats&i;
set vstats&i;
length TableVar $32.;
TableVar="&&v&i";
run;

%end;

%mend;

%stkorig;


data stackorig;
set vstats1-vstats&vmcnt; run;

proc sort data=&libpath..final_content; by Tablevar; run;
proc sort data=stackorig;by Tablevar; run;
data &libpath..stackfinal;
merge &libpath..final_content stackorig;
by Tablevar;run;

%macro outshell;

%do i=1 %to &varcount;

proc sql noprint;

create table eda&i as
(select "&&x&i" as tablevar length=64 format=$64.,
&&x&i as tier length =64 format=$64.,
count(*) as Count

from &libpath..&rdata
group by "&&x&i", &&x&i
)
order by &&x&i;

quit;
%end;

%mend outshell;


%outshell;

data &libpath..edaall; set eda1-eda&varcount; run;
proc sort data=&libpath..stackfinal; by Tablevar; run;
proc sort data=&libpath..edaall; by Tablevar; run;

data &libpath..edaall_final;
merge &libpath..stackfinal &libpath..edaall;
by Tablevar;
run;

proc sort data=&libpath..edaall_final; by TYPE; run;

data &libpath..edaall_final;
set &libpath..edaall_final;
Total_Records=Missing1+Non_Missing1;
Missing_Percentages=Missing1/Total_Records;
Non_Missing_Percentages=Non_Missing1/Total_Records;
if count=. then count=Non_Missing1;
run;


data &libpath..edaall_final (drop=length varcnt);
Attrib tablevar Label="";
Attrib tier Label="tier ";
Attrib Count Label="Count";
Attrib TYPE Label="TYPE ";
Attrib NLevels Label="NLevels ";
Attrib Missing1 Label="Missing1 ";
Attrib Non_Missing1 Label="Non_Missing1 ";
Attrib Total_Records Label="Total_Records ";
Attrib Missing_Percentages Label="Missing_Percentages ";
Attrib Non_Missing_Percentages Label="Non_Missing_Percentages";
Attrib Minimum Label="Minimum";
Attrib Maximum Label="Maximum";
Attrib Mean Label="Mean ";
Attrib Standard_Deviation Label="Standard_Deviation";
Attrib Skewness Label="Skewness ";
Attrib Kurtosis Label="Kurtosis ";
Attrib pct_0_01 Label="Percentile at 0.01%";
Attrib pct_0_05 Label="Percentile at 0.05%";
Attrib pct_0_1 Label="Percentile at 0.1%";
Attrib pct_0_5 Label="Percentile at 0.5%";
Attrib pct_1 Label="Percentile at 1%";
Attrib pct_5 Label="Percentile at 5%";
Attrib pct_10 Label="Percentile at 10%";
Attrib pct_25 Label="Percentile at 25%";
Attrib pct_50 Label="Percentile at 50%";
Attrib pct_75 Label="Percentile at 75%";
Attrib pct_90 Label="Percentile at 90%";
Attrib pct_95 Label="Percentile at 95%";
Attrib pct_99 Label="Percentile at 99%";
Attrib pct_99_5 Label="Percentile at 99.5 %";
Attrib pct_99_9 Label="Percentile at 99.9 %";
Attrib pct_99_95 Label="Percentile at 99.95%";
Attrib pct_99_99 Label="Percentile at 99.99%";

set &libpath..edaall_final;
run;

proc format;
value TypeF
1="Numeric"
2="Categorical";
run;

data &libpath..edaall_final;
set &libpath..edaall_final;
format Type TypeF.
format Missing_Percentages Non_Missing_Percentages percentn10.2;
run;

ods csv file="Univariate_Report.csv";
proc print data=&libpath..edaall_final;
format Type TypeF.;
format Missing_Percentages Non_Missing_Percentages percentn10.2;
run;
ods csv close;











6 REPLIES 6
novinosrin
Tourmaline | Level 20

Hi @sai_prasad   What is the question or perhaps a problem you are facing?

Reeza
Super User
SAS has a characterize data task that does the EDA for most basic data sets. This is available in EG and SAS Studio but not Base AFAIK.
sai_prasad
Calcite | Level 5

Hi, I am looking for a sas macros which will automatically does EDA like univariate analysis,Bivariate Analysis and Frequency plots for all the variables in the dataset .Please kindly suggest if you have any such available macros

sai_prasad
Calcite | Level 5


%let path = D:\EDA\nnnn;
options symbolgen mprint;

%MACRO LEVELS(VAR=);
proc sql;
INSERT INTO dummytable
select "&var." as variable,count(distinct &var) as levels from &dsn.;
quit;
%mend Levels;

%macro freq_char(ds=,var=);
ods exclude all;
Proc freq data=&ds ;
table &var / missing nocum out=&var;
run;
ods exclude none;
proc export data=&var
outfile="&path\categorical_freq.xlsx" dbms=xlsx replace;
sheet="&var.";
run;
%mend freq_char;


%macro variable_analysis(dsn=);

proc contents data=&dsn out=variables;
run;

data variables(keep=name type);
set variables;
run;

data temp;
input variable $100. nlevels;
run;

PROC SQL;
CREATE TABLE DUMMYTABLE LIKE temp;
QUIT;

Data _null_;
Set variables;
Call Execute('%levels(var='||name||')');
Run;

proc sql;
create table var_analysis as
Select a.*,b.nlevels,max(b.nlevels) as total_levels from variables as a left join dummytable as b on a.name=b.variable;
quit;

data var_analysis;
set var_analysis;
level_percent=nlevels/total_levels;
if type=2 then variable_type="categorical";
else if type=1 and level_percent<0.01 then variable_type="categorical";
else variable_type="continous";
run;

ods output summary=nsummary;
proc means data=&dsn n nmiss min max mean std p1 p5 p10 p25 p50 p90 p95 p99 qrange skewness kurtosis stackods;
var _numeric_;
run;

data var_continious;
set var_analysis;
where variable_type="continous";
run;

proc sql;
create table cont_ana as select a.name,a.nlevels,b.* from var_continious as a left join nsummary as b on a.name=b.variable;
quit;

proc export data=var_analysis
outfile="&path\VariableAnalysis.xlsx" dbms=xlsx replace;
sheet="Variable_analysis";
run;

proc export data=nsummary
outfile="&path\VariableAnalysis.xlsx" dbms=xlsx replace;
sheet="Numeric_summary";
run;

proc export data=cont_ana
outfile="&path\VariableAnalysis.xlsx" dbms=xlsx replace;
sheet="Continious_summary";
run;

data var_cat;
set var_analysis;
where variable_type="categorical" and nlevels<500000;
run;

data _null_;
set var_cat;
call execute('%freq_char(ds=&dsn,var='||name||')');
run;

ods exclude all;
proc corr data=&dsn rank noprob;
ods output PearsonCorr=correlation;
run;
ods exclude none;

proc export data=correlation
outfile="&path\VariableAnalysis.xlsx" dbms=xlsx replace;
sheet="Numeric_correlation";
run;

ods excel file="&path\Correlation_plot.xlsx" ;
ods excel options(sheet_interval = 'proc' sheet_name = "Correlation_plot");
ods graphics on;
proc corr data=assignment plots(maxpoints=1000000)=matrix(histogram) ;
ods select matrixplot;
run;
ods excel close;

 

 

%mend variable_analysis;

%variable_analysis(dsn=sashelp.class);

Reeza
Super User
SAS has the characterize data task that essentially does this, ergo no need for a macro.

I have some custom tables/reports that I can run, you can find most of my code on my GitHub page.

https://gist.github.com/statgeek
sai_prasad
Calcite | Level 5

data _null_;
length status $ 15.;
set t;
if sum(today(),-datepart(update_time)) eq 0 and sum(-timepart(update_time),time())/60 ge 30 then status=1;
else status=0;;
call symputx('db_chk',status,'g');
put _all_;
run;


%end;

%else
%do;
%global db_chk;
%let db_chk=0;
%end;

%put &db_chk.;
%mend chkdb_status;
%chkdb_status;

Data _Null_;
length



%do i=1 %to 12;
data _null_;
call sleep(60,30);
run;
%chkdb_status;
%if &db_chk.=1 %then %goto action;
%end;

%action:
%if &db_chk.=1 %then %do;
%sysexec("&path\&fname.bat");
%end;


%end;



%mend;

%test;


sas-innovate-2024.png

Available on demand!

Missed SAS Innovate Las Vegas? Watch all the action for free! View the keynotes, general sessions and 22 breakouts on demand.

 

Register now!

How to Concatenate Values

Learn how use the CAT functions in SAS to join values from multiple variables into a single value.

Find more tutorials on the SAS Users YouTube channel.

Click image to register for webinarClick image to register for webinar

Classroom Training Available!

Select SAS Training centers are offering in-person courses. View upcoming courses for:

View all other training opportunities.

Discussion stats
  • 6 replies
  • 1345 views
  • 2 likes
  • 3 in conversation