Hi, I currently have a SAS project that takes around 3 days to run- I'm looking to reduce this time as much as possible by (hopefully) making the code more efficient. Is there anything I can change to make this code run faster? /* Means analysis using a format for each variable */
%macro Percentile_Summary;
/* If the count is populated, then run the percentile summary */
%if (%eval(&COUNT_ANLY_1.) ne 0) %then %do;
/* Kill means datasets in the WORK tempspace that may exist from previous runs */
proc datasets lib=work nolist;
SYSECHO "Percentile Analysis - ANLY_IND 1 - Step 1/4";
delete Percentile_Summary Anly_1_Summary;
run;
/* Run proc means for individual variables, using custom made formats for variable */
%do k = 1 %to &COUNT_ANLY_1.;
/* Create format for use in the proc means step below */
%Percentile(dsn=refi_lib.&Refi_DSN., var=&&ANLY_IND_1_VAR&k..);
%sleep(60); /* Test whether the system should go into sleep mode or not */
/* Analysis: PROC MEANS function for all numeric continuous data, at fine and total level */
PROC MEANS DATA = ranked_out noprint missing chartype COMPLETETYPES;
SYSECHO "Percentile Analysis - ANLY_IND 1 - Step 2/4 - Sub Step &k./&COUNT_ANLY_1.";
VAR &&ANLY_IND_1_VAR&k..;
CLASS VAR_SEG;
OUTPUT OUT = Anly_1_Summary (rename=(VAR_SEG=GROUPING))
N= N
NMISS= NMISS
MIN= MIN
MAX= MAX
MEAN= MEAN
SUM= SUM
STD= STD /noinherit;
RUN;
data Percentile_Summary;
SYSECHO "Percentile Analysis - ANLY_IND 1 - Step 2/4 - Sub Step &k./&COUNT_ANLY_1.";
/* Set original datastep and additions - N ne 1*/
%if &k. ne 1 %then %do;
set Percentile_Summary Anly_1_Summary;
%end;
/* Create variables and set lengths/labels. Set first batch of data available */
%else %if &k. eq 1 %then %do;
/* Assign Lengths and Formats*/
LENGTH ID 4 VARIABLE $32 GROUPING $11;
/* Assign Labels */
label
ID="ID Number"
VARIABLE="Classification Variable Name"
GROUPING="Classification Grouping"
;
/* Initialise with missing */
GROUPING = .;
set Anly_1_Summary;
%end;
/* Create column with relevant variable name and ID */
if ID = . then ID = &k.;
if VARIABLE = "" then VARIABLE = "&&ANLY_IND_1_VAR&k..";
if _TYPE_ = "0" then GROUPING = "Overall";
run;
%end;
/* Left join label onto dataset */
Proc SQL;
Create table output.Percentile_Summary (label="Expln=&Expiry_Months.m" Alter="IB") as
Select A.*
,B.Label
From Percentile_Summary A
Left Join output.Contents_Summary B
on A.Variable = B.Variable
;
Quit;
proc datasets library=work nolist;
SYSECHO "Percentile Analysis - ANLY_IND 1 - Step 4/4";
delete Anly_1_SUMMARY ranked ranked_out;
run;
%end;
%mend Percentile_Summary;
/* Macro to calculate the cutpoints for percentiles */
%Macro Percentile(
dsn /* Libname and dataset name - lib.dsetname */,
var /* Variable name to create percentiles on */);
/* Rank variable in question to create percentiles*/
proc rank data=&dsn.(keep=&var.) out=ranked ties=low;
SYSECHO "Formatting 1/4 - ANLY_IND 1 - Step 2/4 - Sub Step &k./&COUNT_ANLY_1.";
var &var.;
ranks Var_Rank;
run;
/* Sort data into ascending order using the ranked variable created above */
proc sort data=ranked;
SYSECHO "Formatting 2/4 - ANLY_IND 1 - Step 2/4 - Sub Step &k./&COUNT_ANLY_1.";
by Var_Rank;
run;
/* Add counter and determine splits for percentile calculation */
DATA ranked;
SYSECHO "Formatting 3/4 - ANLY_IND 1 - Step 2/4 - Sub Step &k./&COUNT_ANLY_1.";
/* Set non-missing values then missing values, so missing values are at the bottom*/
SET ranked(where=(Var_Rank ne .)) ranked(where=(Var_Rank eq .));
N = _N_; /* Counter */
IF (Var_Rank eq .) THEN N = .; /* Set N to missing for missing values */
RUN;
/* Step to create the segmentation for percentiles. See formatting in monitoring program */
PROC SQL;
SYSECHO "Formatting 4/4 - ANLY_IND 1 - Step 2/4 - Sub Step &k./&COUNT_ANLY_1.";
CREATE TABLE ranked_out(drop=N Var_Rank) AS
SELECT *,
CASE WHEN N = . THEN 'Missing'
WHEN N <= CEIL(MIN(N) + (MAX(N) - MIN(N) + 1)* 1/100 - 1) THEN '0% to 1%'
WHEN N <= MIN(N) + (MAX(N) - MIN(N) + 1)* 5/100 - 1 THEN '1% to 5%'
WHEN N <= MIN(N) + (MAX(N) - MIN(N) + 1)* 10/100 - 1 THEN '5% to 10%'
WHEN N <= MIN(N) + (MAX(N) - MIN(N) + 1)* 25/100 - 1 THEN '10% to 25%'
WHEN N <= MIN(N) + (MAX(N) - MIN(N) + 1)* 50/100 - 1 THEN '25% to 50%'
WHEN N <= MIN(N) + (MAX(N) - MIN(N) + 1)* 75/100 - 1 THEN '50% to 75%'
WHEN N <= MIN(N) + (MAX(N) - MIN(N) + 1)* 90/100 - 1 THEN '75% to 90%'
WHEN N <= MIN(N) + (MAX(N) - MIN(N) + 1)* 95/100 - 1 THEN '90% to 95%'
WHEN N <= FLOOR(MIN(N) + (MAX(N) - MIN(N) + 1)* 99/100 - 1) THEN '95% to 99%'
ELSE '99% to 100%'
END AS VAR_SEG /* Percentile Segmentation variable */
FROM ranked
;
QUIT;
%Mend Percentile;
/* Macro to create 1-way summaries */
%Macro one_way_summary(
dsetin /* Library and dataset name - lib.dset_name */,
varlist /* List of vars to feed through the proc freq procedure */,
dsetout /* Output library and dataset name - lib.dset_name */,
type /* ANLY_IND value */) /mindelimiter=',';
%sleep(60); /* Test whether the system should go into sleep mode or not */
/* Count the number of variables within the table matching the ANLY_IND = &type. criteria */
Proc Sql noprint;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 1/4";
Select Count(*)
into :COUNT_ANLY_&type.
From output.Contents_Summary
where ANLY_IND=&type.;
Quit;
/* If the count is populated, then run the one way summary */
%if (%eval(&&COUNT_ANLY_&type..) ne 0) %then %do;
/* Housekeeping - Delete datasets */
proc datasets nodetails nolist;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 2/4";
delete &dsetout.;
quit;
/* Create population format */
%if (&type. in (1, 5)) %then %do;
proc format;
value POP_FMT (multilabel)
. = "Missing"
other = "Populated"
;
run;
%end;
%else %if (&type. = 4) %then %do;
proc format;
value $ POP_FMT (multilabel)
' ' = "Missing"
other = "Populated"
;
run;
%end;
*loop through variable list;
%let i=1;
%do %while (%scan(&varlist., &i., " ") ^=%str());
%let var=%scan(&varlist., &i., " ");
%put &i. &var.;
*Cross tab;
proc freq data=&dsetin. noprint;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 3/4 - Sub Step &i./&&COUNT_ANLY_&type..";
table &var./ out=temp1 missing;
%if (&type. in (1, 5)) %then %do;
format &var. POP_FMT.;
%end;
%else %if (&type. = 4) %then %do;
format &var. $POP_FMT.;
%end;
run;
*Get variable name as variable name;
data _null_;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 3/4 - Sub Step &i./&&COUNT_ANLY_&type..";
set &dsetin. (obs=1);
call symput('var_name', vname(&var.));
run;
%put &var_name.;
*Add in variable name and label and store the levels as a text field;
data temp2;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 3/4 - Sub Step &i./&&COUNT_ANLY_&type..";
length ANLY_IND $1. Variable $32. Grouping $50. Count 8. Percent 8.;
ANLY_IND = "&type.";
Variable = "&var_name.";
set temp1;
Grouping=input(&var., $50.);
%if (&type. in (1, 4, 5)) %then %do;
if grouping = "" then grouping = "Missing";
else grouping = "Populated";
%end;
percent=percent/100; * Stored as decimals instead of numbers, with format applied;
format percent PERCENTN12.4;
drop &var.;
run;
%put &var_name;
*Append datasets;
proc append data=temp2 base=&dsetout. force;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 3/4 - Sub Step &i./&&COUNT_ANLY_&type..";
run;
/* drop temp tables */
proc datasets nodetails nolist;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 3/4 - Sub Step &i./&&COUNT_ANLY_&type..";
delete temp1 temp2;
quit;
*Increment counter;
%let i=%eval(&i.+1);
%end;
/* Left join label onto dataset and store in perm location */
Proc SQL;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 4/4";
Create table output.&dsetout. (label="Expln=&Expiry_Months.m" Alter="IB") as
Select A.*
,B.Label
From &dsetout. A
Left Join output.Contents_Summary B
on A.Variable = B.Variable
;
Quit;
%end;
%Mend;
%Macro Combine_Datasets;
proc format;
value $ Grouping_FMT (multilabel)
' ' = "Missing"
;
run;
data output.Frequency_Summary (label="Expln=&Expiry_Months.m" Alter="IB");
SYSECHO "Combine Datasets - Step 1/1";
length ANLY_IND $1. Variable $32. Label $256. Grouping $50. Count 8. Percent 8.;
set
%do i = 2 %to 6;
%if %sysfunc(exist(output.Frequency_&i)) %then %do;
output.Frequency_&i
%end;
%else %put Dataset Frequency_&i does not exist.;
%end;
;
Format Grouping Grouping_FMT.;
run;
%Mend Combine_Datasets;
%Macro Sleep(Buffer /* Number of minutes before 21:00 to pause code */);
%if %sysfunc(upcase(&sleep.)) = Y %then %do;
/* Sleep Code to remove change of 9pm error when running in nightqueue */
data _null_;
SYSECHO "Night Queue Test - &sleep.mins before 9:00pm";
timediff=input('22:00:00',time8.)-datetime()+date()*24*60*60;
timediff_nine=(input('21:00:00',time8.)-datetime()+date()*24*60*60)/60;
/* If Mins to 9:00 pm Less than the Buffer then sleep until 10:00 pm */
IF timediff_nine <= &Buffer. then
do;
sleeptime=timediff;
call sleep(sleeptime,1);
end;
run;
%end;
%Mend;
/* This macro will allow you to step through the lowercase letters of */
/* the alphabet on a %DO loop. */
%macro iterm(itr /* Iteration Number to convert to letter */);
%Global countvar_&itr.;
%let lst=a b c d e f g h i j k l m n o p q r s t u v w x y z
aa ab ac ad ae af ag ah ai aj ak al am an ao ap aq ar as at au av aw ax ay az;
%let countvar_&itr. = %scan(&lst.,&itr.);
%mend; /*----------------------------------------------------------------------------*/
/*-- Initialise User Unputs --------------------------------------------------*/
/*----------------------------------------------------------------------------*/
options obs=max fullstimer mprint symbolgen mlogic compress=binary varinitchk=error MINOPERATOR;
/* Format List Location and dataset name */
libname form "/team/basel/Capital Transformation Project/Team/Engin Tahsin/Temp" ACCESS=READONLY;
%let format_list = format_list;
/*--------------------------------------------------------------------*
* Dataset Location and libname type (Meta or Standard). See examples below:
* Meta Libname: LIBNAME ... META LIBRARY="..." METAOUT=data ACCESS=READONLY;
* Standard Libname: LIBNAME ... "..." ACCESS=READONLY
*---------------------------------------------------------------------*/
%let Lib_Type = standard;
%let Refi_Libname = /team/customer/Internal_Bureau/Marts;
%let Out_Libname = /team/decision_science/Simplification/Internal Bureau Remediation/Data/Reporting Datasets/Profiling Outputs;
/*--------------------------------------------------------------------*
* Dataset Name in location specified above
*---------------------------------------------------------------------*/
%let Refi_DSN = RLAD_PAYDAY_201609_NEW;
/*--------------------------------------------------------------------*
* Tag used to apply to output files - Data_Dictionary_&Tag.
* Use &Refi_DSN. to populate with dataset name
*---------------------------------------------------------------------*/
%let Tag = &Refi_DSN.;
/*--------------------------------------------------------------------*
* Sleep switch used to toggle sleep function on and off [Y/N]
* Log switch used to toggle log output on and off [Y/N]
* Expiry Date - Number of months to keep data [+ve integer]
*---------------------------------------------------------------------*/
%let sleep = N;
%let output_log = N;
%let Expiry_Months = 12;
/*--------------------------------------------------------------------*
* Below criteria decides which variables get detailed statistics.
* The threshholds for N_Distinct and Distinct_Pct_All may need tweaking,
depending on your data.
* N_Distinct refers to the maximum number of distinct values within a
variable.
* Distinct_Pct_All refers to the maximum ratio the number of distinct
values within a variable to all values in that same variable can take.
*--------------------------------------------------------------------*/
%let Max_N_Distinct = 25; /* Max number of distinct values */
%let Max_Distinct_Pct_All = 0.1; /* Max percent unique for all rows */
/*----------------------------------------------------------------------------*/
/*-- Define where the program is located -------------------------------------*/
/*----------------------------------------------------------------------------*/
/*Store log for each run*/
%Macro Output_Log;
%if %sysfunc(upcase(&output_log.))=Y %then %do;
proc printto log="&Out_Libname./&Refi_DSN./&Refi_DSN..log";
run;
%end;
%else %do;
%put No log output.;
%end;
%Mend Output_Log;
%Output_Log;
%macro LIBNAMES;
/*-- Setup libname locations based on the library type used --*/
%IF %SYSFUNC(lowcase(&Lib_Type.))=standard %THEN %DO;
LIBNAME refi_lib "&Refi_Libname." ACCESS=READONLY;
%END;
%ELSE %IF %SYSFUNC(lowcase(&Lib_Type.))=meta %THEN %DO;
LIBNAME refi_lib META LIBRARY="&Refi_Libname." METAOUT=data ACCESS=READONLY;
%END;
/* Location of output files - this can also create a folder */
options DLCREATEDIR; /* Turn on function to create library if no-exist */
LIBNAME output "&Out_Libname./&Refi_DSN.";
options NODLCREATEDIR; /* Turn off function to create library if exist */
%Mend LIBNAMES;
%LIBNAMES;
%sleep(120);
/*----------------------------------------------------------------------------*/
/*-- Contents of the REFI metadata and dataset -------------------------------*/
/*----------------------------------------------------------------------------*/
proc sql noprint;
SYSECHO "Key Statistics - Step 1/21";
%*-- List the SAS Metadata for this dataset --;
create table Contents_Listing as
select Name, Type, Length, strip(Format) as Format,
strip(Informat) as InFormat, strip(Label) as Label, varnum
from dictionary.columns
where libname = upcase('Refi_Lib') and memname = upcase("&Refi_DSN.")
order by upcase(Name);
%put Dataset has &sqlobs rows;
quit;
/*-- Custom dataset contents --*/
proc print data=Contents_Listing label uniform width=minimum noobs;
SYSECHO "Key Statistics - Step 2/21";
title "Contents of &Refi_DSN.";
run;
/*----------------------------------------------------------------------------*/
/*-- Look for various possible key columns -----------------------------------*/
/*----------------------------------------------------------------------------*/
%let Count_All_Vars =;
proc sql noprint stimer;
SYSECHO "Key Statistics - Step 3/21";
%*-- Count non-missing and unique rows in the dataset. --------------------;
%*-- Prepare the SQL statements to perform the counts. --------------------;
%*-- Handle issue where SAS variable name would be longer than 32 chars ---;
select "count(" || strip(Name) || ") as N_" ||
substr(left(Name),1,min(29, length(strip(Name)))) ||
", count(distinct " || strip(Name) || ") as ND_" ||
substr(left(Name),1,min(29, length(strip(Name))))
into :Count_All_Vars separated by ", "
from dictionary.columns
where libname = upcase('Refi_Lib') and memname = upcase("&Refi_DSN");
%put Count_All_Vars has &sqlobs rows;
quit;
/* Perform the dataset counts prepared above */
proc sql noprint stimer;
SYSECHO "Key Statistics - Step 4/21";
create table key_values_0 as
select count(*) as All_Rows
,&Count_All_Vars.
from refi_lib.&Refi_DSN.
%put Dataset has &sqlobs rows;
quit;
/*----------------------------------------------------------------------------*/
/*-- Create a more useable dataset for Key Column analysis -------------------*/
/*----------------------------------------------------------------------------*/
/*-- Exchange rows and columns --*/
proc transpose data=key_values_0 out=key_values_0_T;
SYSECHO "Key Statistics - Step 5/21";
run;
%sleep(120);
/*-- Derive other column counts from the transposed SQL results --*/
data key_values;
SYSECHO "Key Statistics - Step 6/21";
/*-- Order the variables and assign labels --*/
label
Variable=""
Count="Number of Rows Filled"
Filled="Percent of Rows Filled"
NMiss="Number of Rows Missing"
Miss_Pct="Percent of Rows Missing"
Unique="Number of Distinct Values"
Unique_Pct="Unique Percentage of Filled Rows"
Unique_Pct_All="Unique Percentage of All Rows"
Stats="Stats"
;
set key_values_0_T (rename=(_name_=Varname col1=Num));
length Variable $ 40 Stats $ 1;
retain Row_Count Variable Count NMiss;
drop Row_Count Varname Num;
format Filled Unique_Pct Miss_Pct Unique_Pct_All PERCENTN12.4;
/*-- Default value is N = No Statistics --*/
Stats = "N";
/*-- The first OBS contains the overall totals --*/
if _N_=1 then do;
Variable = Varname;
Count = Num;
Filled = 1;
Unique = Num;
Unique_Pct = 1;
Unique_Pct_All = 1;
NMiss = 0;
Miss_Pct = 0;
Row_Count = Num;
output;
end;
/*------------------------------------------------------------------------*
* Besides the first row, other rows contain, in order, records with the
* non-missing count, followed by another with the unique count.
* Other measures are derived from this data.
*------------------------------------------------------------------------*/
/*-- Got a new Variable ... Set the non-missing counts --*/
if (substr(Varname, 1, 2) = "N_") then do;
Variable = substr(Varname, 3);
Count = Num;
NMiss = Row_Count - Count;
end;
/*-- This row has unique counts, and most measures are computed here --*/
if (substr(Varname, 1, 3) = "ND_") then do;
Filled = Count / Row_Count;
Unique = Num;
/*-- This data is NOT all Missing values --*/
if (Count ^= 0) then do;
Unique_Pct = Num / Count;
Unique_Pct_All = Num / Row_Count;
Miss_Pct = NMiss / Row_Count;
end;
/*-- This data IS all Missing values, and cannot be summarized --*/
else do;
Unique_Pct = 0;
Unique_Pct_All = 0;
Miss_Pct = 1;
end;
/*--------------------------------------------------------------------*
* This criteria decides which variables get detailed statistics.
* The threshholds may need tweaking (at initialisation stages),
depending on your data.
*--------------------------------------------------------------------*/
if (Unique_Pct_All <= &Max_Distinct_Pct_All. and Unique < &Max_N_Distinct. and Count ^= 0) then
Stats = "Y";
output;
end;
run;
proc sort data=key_values;
SYSECHO "Key Statistics - Step 7/21";
by descending Count descending NMiss descending Unique Variable;
run;
/*-- The analysis so far, before the merge with the Contents dataset --*/
proc print data=key_values label uniform width=minimum noobs;
SYSECHO "Key Statistics - Step 8/21";
title "Counts and Uniqueness for Variables in &Refi_DSN.";
run;
/*-- Combine the Contents dataset with the Key Values analysis above ---------*/
proc sql noprint;
SYSECHO "Key Statistics - Step 9/21";
create table Contents_Summary as
select b.*, a.type, a.length, a.Format, a.InFormat, a.Label,
a.varnum
from Contents_Listing a, key_values b
where upcase(a.name) = upcase(b.Variable);
%put Dataset has &sqlobs rows;
quit;
/*--------------------------------------------------------------------*
* Pull in existing dataset and overwrite with additional indicator
* Create the analysis indicator, which will be used to drive which
monitoring tests are to be carried out on each variable. See below:
* Anly_ind = 1 Numeric - Continuous data
* Anly_ind = 2 Numeric - Segment/Indicator data
* Anly_ind = 3 Character - Segment/Indicator data
* Anly_ind = 4 Character - Continuous data
* Anly_ind = 5 Numeric - Date type data
* Anly_ind = 6 Other - Catchall - Must Explore
*--------------------------------------------------------------------*/
/* Macro to pull in and initialise formats from the formats table */
%macro initialise_formats;
/* Count the number of formats within the table matching the datetime criteria */
%global count_format;
Proc Sql noprint;
SYSECHO "Key Statistics - Step 10/21";
Select Count(*)
into :count_format
From form.&format_list.
where TYPE='datetime';
Quit;
/* Create N number of &FORMAT_i. variables to insert formats into */
%let count_format = %left(&count_format.);
%do i = 1 %to &count_format.;
%global FORMAT_&i.;
%end;
/* Insert formats from table into macro variables created in above process with quotation marks */
PROC SQL NOPRINT;
SYSECHO "Key Statistics - Step 11/21";
Select QUOTE(STRIP(SCAN_CRITERIA))
Into :FORMAT_1 - :FORMAT_&count_format.
From form.&format_list.
Where TYPE='datetime';
;
QUIT;
%put Formats Assigned to Macro Variables; %put;
%mend initialise_formats;
%initialise_formats;
/* Macro to execute the find process. Kept at macro level so we can use the &i. macro variable. */
%macro find;
/* If string cannot be found, then search = 0, else 1. */
%do i = 1 %to &count_format.;
search = max(find(Format, &&FORMAT_&i..), search);
%end;
%mend find;
data output.Contents_Summary (label="Expln=&Expiry_Months.m" Alter="IB" drop=search);
SYSECHO "Key Statistics - Step 12/21";
set Contents_Summary;
label
Anly_ind = "Data Type [Numeric/Character/Date/Other] and [Continuous/Discrete]"
;
/* Initialise the search variable to 0 then invoke the find macro */
search = 0;
%find;
/* Analysis indicator hierachy. This is used to determine which monitoring checks
to perform */
if (lowcase(type) = "num" AND search ^= 0) then Anly_ind = 5;
else if (lowcase(type) = "num" AND upcase(Stats) = "N") then Anly_ind = 1;
else if (lowcase(type) = "num" AND upcase(Stats) = "Y") then Anly_ind = 2;
else if (lowcase(type) = "char" AND upcase(Stats) = "Y") then Anly_ind = 3;
else if (lowcase(type) = "char" AND upcase(Stats) = "N") then Anly_ind = 4;
else Anly_ind = 6;
run;
/* Steps to group variables into macro variable to be used for processing later */
%global ANLY_IND_1_CSV ANLY_IND_1_SPACE
ANLY_IND_2_CSV ANLY_IND_2_SPACE
ANLY_IND_3_CSV ANLY_IND_3_SPACE
ANLY_IND_4_CSV ANLY_IND_4_SPACE
ANLY_IND_5_CSV ANLY_IND_5_SPACE
ANLY_IND_6_CSV ANLY_IND_6_SPACE
CLASS_VARS_CSV CLASS_VARS_SPACE
COUNT_ANLY_1 COUNT_ANLY_2
COUNT_ANLY_3 SEG_COUNT;
PROC SQL noprint;
SYSECHO "Key Statistics - Step 13/21";
%*-- Put all variables with ANLY_IND = 1 into macro variable for later --;
select Variable
,Variable
into :ANLY_IND_1_CSV separated by ", ",
:ANLY_IND_1_SPACE separated by " "
from output.Contents_Summary
where Anly_ind = 1;
%put ANLY_IND_1_CSV and ANLY_IND_1_SPACE have &sqlobs rows;
%*-- Put all variables with ANLY_IND = 2 into macro variable for later --;
select Variable
,Variable
into :ANLY_IND_2_CSV separated by ", ",
:ANLY_IND_2_SPACE separated by " "
from output.Contents_Summary
where Anly_ind = 2;
%put ANLY_IND_2_CSV and ANLY_IND_2_SPACE have &sqlobs rows;
%*-- Put all variables with ANLY_IND = 3 into macro variable for later --;
select Variable
,Variable
into :ANLY_IND_3_CSV separated by ", ",
:ANLY_IND_3_SPACE separated by " "
from output.Contents_Summary
where Anly_ind = 3;
%put ANLY_IND_3_CSV and ANLY_IND_3_SPACE have &sqlobs rows;
%*-- Put all variables with ANLY_IND = 4 into macro variable for later --;
select Variable
,Variable
into :ANLY_IND_4_CSV separated by ", ",
:ANLY_IND_4_SPACE separated by " "
from output.Contents_Summary
where Anly_ind = 4;
%put ANLY_IND_4_CSV and ANLY_IND_4_SPACE have &sqlobs rows;
%*-- Put all variables with ANLY_IND = 5 into macro variable for later --;
select Variable
,Variable
into :ANLY_IND_5_CSV separated by ", ",
:ANLY_IND_5_SPACE separated by " "
from output.Contents_Summary
where Anly_ind = 5;
%put ANLY_IND_5_CSV and ANLY_IND_5_SPACE have &sqlobs rows;
%*-- Put all variables with ANLY_IND = 6 into macro variable for later --;
select Variable
,Variable
into :ANLY_IND_6_CSV separated by ", ",
:ANLY_IND_6_SPACE separated by " "
from output.Contents_Summary
where Anly_ind = 6;
%put ANLY_IND_6_CSV and ANLY_IND_6_SPACE have &sqlobs rows;
%*-- Put all class variables into macro variable for later --;
select Variable
,Variable
into :CLASS_VARS_CSV separated by ", ",
:CLASS_VARS_SPACE separated by " "
from output.Contents_Summary
where upcase(Stats) = "Y";
%put CLASS_VARS_CSV and CLASS_VARS_SPACE have &sqlobs rows;
QUIT;
%macro Initialise_Means_Anly_1;
/* Count the number of variables within the table matching the STATS = "Y" criteria */
Proc Sql noprint;
SYSECHO "Key Statistics - Step 14/21";
Select Count(*)
into :SEG_COUNT
From output.Contents_Summary
where STATS='Y';
Quit;
/* Create N number of &ANLY1_i. variables to insert variable names into */
%let SEG_COUNT = %left(&SEG_COUNT.);
%do i = 1 %to &SEG_COUNT.;
%global ANLY1_&i.;
%end;
/* Inserts Variable from table into macro variables created in above process */
PROC SQL noprint;
SYSECHO "Key Statistics - Step 15/21";
Select STRIP(VARIABLE)
Into :ANLY1_1 - :ANLY1_&SEG_COUNT.
From output.Contents_Summary
where STATS='Y'
;
QUIT;
%mend Initialise_Means_Anly_1;
%Initialise_Means_Anly_1;
%macro Anly_1_Variables;
/* Count the number of variables within the table matching the ANLY_IND = 1 criteria */
Proc Sql noprint;
SYSECHO "Key Statistics - Step 16/21";
Select Count(*)
into :COUNT_ANLY_1
From output.Contents_Summary
where ANLY_IND=1;
Quit;
/* Create N number of ANLY_IND_1_VAR&i. variables to insert variable names into */
%let COUNT_ANLY_1 = %left(&COUNT_ANLY_1.);
%do i = 1 %to &COUNT_ANLY_1.;
%global ANLY_IND_1_VAR&i.;
%end;
/* Inserts Variable from table into macro variables created in above process */
PROC SQL noprint;
SYSECHO "Key Statistics - Step 17/21";
Select STRIP(VARIABLE)
Into :ANLY_IND_1_VAR1 - :ANLY_IND_1_VAR&COUNT_ANLY_1.
From output.Contents_Summary
where ANLY_IND=1;
QUIT;
%mend Anly_1_Variables;
%Anly_1_Variables;
%macro Anly_2_Variables;
/* Count the number of variables within the table matching the ANLY_IND = 2 criteria */
Proc Sql noprint;
SYSECHO "Key Statistics - Step 18/21";
Select Count(*)
into :COUNT_ANLY_2
From output.Contents_Summary
where ANLY_IND=2;
Quit;
/* Create N number of ANLY_IND_2_VAR&i. variables to insert variable names into */
%let COUNT_ANLY_2 = %left(&COUNT_ANLY_2.);
%do i = 1 %to &COUNT_ANLY_2.;
%global ANLY_IND_2_VAR&i.;
%end;
/* Inserts Variable from table into macro variables created in above process */
PROC SQL noprint;
SYSECHO "Key Statistics - Step 19/21";
Select STRIP(VARIABLE)
Into :ANLY_IND_2_VAR1 - :ANLY_IND_2_VAR&COUNT_ANLY_2.
From output.Contents_Summary
where ANLY_IND=2;
QUIT;
%mend Anly_2_Variables;
%Anly_2_Variables;
%macro Anly_3_Variables;
/* Count the number of variables within the table matching the ANLY_IND = 3 criteria */
Proc Sql noprint;
SYSECHO "Key Statistics - Step 20/21";
Select Count(*)
into :COUNT_ANLY_3
From output.Contents_Summary
where ANLY_IND=3;
Quit;
/* Create N number of ANLY_IND_3_VAR&i. variables to insert variable names into */
%let COUNT_ANLY_3 = %left(&COUNT_ANLY_3.);
%do i = 1 %to &COUNT_ANLY_3.;
%global ANLY_IND_3_VAR&i.;
%end;
/* Inserts Variable from table into macro variables created in above process */
PROC SQL noprint;
SYSECHO "Key Statistics - Step 21/21";
Select STRIP(VARIABLE)
Into :ANLY_IND_3_VAR1 - :ANLY_IND_3_VAR&COUNT_ANLY_3.
From output.Contents_Summary
where ANLY_IND=3;
QUIT;
%mend Anly_3_Variables;
%Anly_3_Variables;
/*----------------------------------------------------------------------------*/
/*-- E N D -------------------------------------------------------------------*/
/*----------------------------------------------------------------------------*/ /* Call macro that will perform the percentile summary. This will breakdown all variables
with an ANLY_IND = 1 value into percentiles and perform N, NMISS, MIN, MAX, SUM, STD metrics on */
%Percentile_Summary;
/* Call macro that will populate summary tables for the all variables */
/* This macro is to be used for ANLY_IND = 2,3,4,5 values */
%one_way_summary(refi_lib.&Refi_DSN., &ANLY_IND_2_SPACE., Frequency_2, 2);
%one_way_summary(refi_lib.&Refi_DSN., &ANLY_IND_3_SPACE., Frequency_3, 3);
%one_way_summary(refi_lib.&Refi_DSN., &ANLY_IND_4_SPACE., Frequency_4, 4);
%one_way_summary(refi_lib.&Refi_DSN., &ANLY_IND_5_SPACE., Frequency_5, 5);
%one_way_summary(refi_lib.&Refi_DSN., &ANLY_IND_6_SPACE., Frequency_6, 6);
/* This macro combines all one way analyses conducted */
%Combine_Datasets; /*----------------------------------------------------------------------------*/
/*-- Initialise User Unputs --------------------------------------------------*/
/*----------------------------------------------------------------------------*/
options obs=max fullstimer mprint symbolgen mlogic compress=binary varinitchk=error MINOPERATOR;
/* Format List Location and dataset name */
libname form "/team/customer/Simplification/Internal Bureau Remediation" ACCESS=READONLY;
%let format_list = format_list;
/*--------------------------------------------------------------------*
* Dataset Location and libname type (Meta or Standard). See examples below:
* Meta Libname: LIBNAME ... META LIBRARY="..." METAOUT=data ACCESS=READONLY;
* Standard Libname: LIBNAME ... "..." ACCESS=READONLY
*---------------------------------------------------------------------*/
%let Lib_Type = standard;
%let Refi_Libname = /team/customer/Internal_Bureau/Marts;
%let Out_Libname = /team/customer/Simplification/Internal Bureau Remediation/Data/Profiling/Profiling Outputs;
/*--------------------------------------------------------------------*
* Dataset Name in location specified above
*---------------------------------------------------------------------*/
%let Refi_DSN = RLAD_PAYDAY_201609_NEW;
/*--------------------------------------------------------------------*
* Tag used to apply to output files - Data_Dictionary_&Tag.
* Use &Refi_DSN. to populate with dataset name
*---------------------------------------------------------------------*/
%let Tag = &Refi_DSN.;
/*--------------------------------------------------------------------*
* Sleep switch used to toggle sleep function on and off [Y/N]
* Log switch used to toggle log output on and off [Y/N]
* Expiry Date - Number of months to keep data [+ve integer]
*---------------------------------------------------------------------*/
%let sleep = N;
%let output_log = N;
%let Expiry_Months = 18;
/*--------------------------------------------------------------------*
* Below criteria decides which variables get detailed statistics.
* The threshholds for N_Distinct and Distinct_Pct_All may need tweaking,
depending on your data.
* N_Distinct refers to the maximum number of distinct values within a
variable.
* Distinct_Pct_All refers to the maximum ratio the number of distinct
values within a variable to all values in that same variable can take.
*--------------------------------------------------------------------*/
%let Max_N_Distinct = 25; /* Max number of distinct values */
%let Max_Distinct_Pct_All = 0.1; /* Max percent unique for all rows */
/*----------------------------------------------------------------------------*/
/*-- Initialise Macros -------------------------------------------------------*/
/*----------------------------------------------------------------------------*/
/* Means analysis using a format for each variable */
%macro Percentile_Summary;
/* If the count is populated, then run the percentile summary */
%if (%eval(&COUNT_ANLY_1.) ne 0) %then %do;
/* Kill means datasets in the WORK tempspace that may exist from previous runs */
proc datasets lib=work nolist;
SYSECHO "Percentile Analysis - ANLY_IND 1 - Step 1/4";
delete Percentile_Summary Anly_1_Summary;
run;
/* Run proc means for individual variables, using custom made formats for variable */
%do k = 1 %to &COUNT_ANLY_1.;
/* Create format for use in the proc means step below */
%Percentile(dsn=refi_lib.&Refi_DSN., var=&&ANLY_IND_1_VAR&k..);
%sleep(60); /* Test whether the system should go into sleep mode or not */
/* Analysis: PROC MEANS function for all numeric continuous data, at fine and total level */
PROC MEANS DATA = ranked_out noprint missing chartype COMPLETETYPES;
SYSECHO "Percentile Analysis - ANLY_IND 1 - Step 2/4 - Sub Step &k./&COUNT_ANLY_1.";
VAR &&ANLY_IND_1_VAR&k..;
CLASS VAR_SEG;
OUTPUT OUT = Anly_1_Summary (rename=(VAR_SEG=GROUPING))
N= N
NMISS= NMISS
MIN= MIN
MAX= MAX
MEAN= MEAN
SUM= SUM
STD= STD /noinherit;
RUN;
data Percentile_Summary;
SYSECHO "Percentile Analysis - ANLY_IND 1 - Step 2/4 - Sub Step &k./&COUNT_ANLY_1.";
/* Set original datastep and additions - N ne 1*/
%if &k. ne 1 %then %do;
set Percentile_Summary Anly_1_Summary;
%end;
/* Create variables and set lengths/labels. Set first batch of data available */
%else %if &k. eq 1 %then %do;
/* Assign Lengths and Formats*/
LENGTH ID 4 VARIABLE $32 GROUPING $11;
/* Assign Labels */
label
ID="ID Number"
VARIABLE="Classification Variable Name"
GROUPING="Classification Grouping"
;
/* Initialise with missing */
GROUPING = .;
set Anly_1_Summary;
%end;
/* Create column with relevant variable name and ID */
if ID = . then ID = &k.;
if VARIABLE = "" then VARIABLE = "&&ANLY_IND_1_VAR&k..";
if _TYPE_ = "0" then GROUPING = "Overall";
run;
%end;
/* Left join label onto dataset */
Proc SQL;
Create table output.Percentile_Summary (label="Expln=&Expiry_Months.m" Alter="IB") as
Select A.*
,B.Label
From Percentile_Summary A
Left Join output.Contents_Summary B
on A.Variable = B.Variable
;
Quit;
proc datasets library=work nolist;
SYSECHO "Percentile Analysis - ANLY_IND 1 - Step 4/4";
delete Anly_1_SUMMARY ranked ranked_out;
run;
%end;
%mend Percentile_Summary;
/* Macro to calculate the cutpoints for percentiles */
%Macro Percentile(
dsn /* Libname and dataset name - lib.dsetname */,
var /* Variable name to create percentiles on */);
/* Rank variable in question to create percentiles*/
proc rank data=&dsn.(keep=&var.) out=ranked ties=low;
SYSECHO "Formatting 1/4 - ANLY_IND 1 - Step 2/4 - Sub Step &k./&COUNT_ANLY_1.";
var &var.;
ranks Var_Rank;
run;
/* Sort data into ascending order using the ranked variable created above */
proc sort data=ranked;
SYSECHO "Formatting 2/4 - ANLY_IND 1 - Step 2/4 - Sub Step &k./&COUNT_ANLY_1.";
by Var_Rank;
run;
/* Add counter and determine splits for percentile calculation */
DATA ranked;
SYSECHO "Formatting 3/4 - ANLY_IND 1 - Step 2/4 - Sub Step &k./&COUNT_ANLY_1.";
/* Set non-missing values then missing values, so missing values are at the bottom*/
SET ranked(where=(Var_Rank ne .)) ranked(where=(Var_Rank eq .));
N = _N_; /* Counter */
IF (Var_Rank eq .) THEN N = .; /* Set N to missing for missing values */
RUN;
/* Step to create the segmentation for percentiles. See formatting in monitoring program */
PROC SQL;
SYSECHO "Formatting 4/4 - ANLY_IND 1 - Step 2/4 - Sub Step &k./&COUNT_ANLY_1.";
CREATE TABLE ranked_out(drop=N Var_Rank) AS
SELECT *,
CASE WHEN N = . THEN 'Missing'
WHEN N <= CEIL(MIN(N) + (MAX(N) - MIN(N) + 1)* 1/100 - 1) THEN '0% to 1%'
WHEN N <= MIN(N) + (MAX(N) - MIN(N) + 1)* 5/100 - 1 THEN '1% to 5%'
WHEN N <= MIN(N) + (MAX(N) - MIN(N) + 1)* 10/100 - 1 THEN '5% to 10%'
WHEN N <= MIN(N) + (MAX(N) - MIN(N) + 1)* 25/100 - 1 THEN '10% to 25%'
WHEN N <= MIN(N) + (MAX(N) - MIN(N) + 1)* 50/100 - 1 THEN '25% to 50%'
WHEN N <= MIN(N) + (MAX(N) - MIN(N) + 1)* 75/100 - 1 THEN '50% to 75%'
WHEN N <= MIN(N) + (MAX(N) - MIN(N) + 1)* 90/100 - 1 THEN '75% to 90%'
WHEN N <= MIN(N) + (MAX(N) - MIN(N) + 1)* 95/100 - 1 THEN '90% to 95%'
WHEN N <= FLOOR(MIN(N) + (MAX(N) - MIN(N) + 1)* 99/100 - 1) THEN '95% to 99%'
ELSE '99% to 100%'
END AS VAR_SEG /* Percentile Segmentation variable */
FROM ranked
;
QUIT;
%Mend Percentile;
/* Macro to create 1-way summaries */
%Macro one_way_summary(
dsetin /* Library and dataset name - lib.dset_name */,
varlist /* List of vars to feed through the proc freq procedure */,
dsetout /* Output library and dataset name - lib.dset_name */,
type /* ANLY_IND value */) /mindelimiter=',';
%sleep(60); /* Test whether the system should go into sleep mode or not */
/* Count the number of variables within the table matching the ANLY_IND = &type. criteria */
Proc Sql noprint;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 1/4";
Select Count(*)
into :COUNT_ANLY_&type.
From output.Contents_Summary
where ANLY_IND=&type.;
Quit;
/* If the count is populated, then run the one way summary */
%if (%eval(&&COUNT_ANLY_&type..) ne 0) %then %do;
/* Housekeeping - Delete datasets */
proc datasets nodetails nolist;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 2/4";
delete &dsetout.;
quit;
/* Create population format */
%if (&type. in (1, 5)) %then %do;
proc format;
value POP_FMT (multilabel)
. = "Missing"
other = "Populated"
;
run;
%end;
%else %if (&type. = 4) %then %do;
proc format;
value $ POP_FMT (multilabel)
' ' = "Missing"
other = "Populated"
;
run;
%end;
*loop through variable list;
%let i=1;
%do %while (%scan(&varlist., &i., " ") ^=%str());
%let var=%scan(&varlist., &i., " ");
%put &i. &var.;
*Cross tab;
proc freq data=&dsetin. noprint;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 3/4 - Sub Step &i./&&COUNT_ANLY_&type..";
table &var./ out=temp1 missing;
%if (&type. in (1, 5)) %then %do;
format &var. POP_FMT.;
%end;
%else %if (&type. = 4) %then %do;
format &var. $POP_FMT.;
%end;
run;
*Get variable name as variable name;
data _null_;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 3/4 - Sub Step &i./&&COUNT_ANLY_&type..";
set &dsetin. (obs=1);
call symput('var_name', vname(&var.));
run;
%put &var_name.;
*Add in variable name and label and store the levels as a text field;
data temp2;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 3/4 - Sub Step &i./&&COUNT_ANLY_&type..";
length ANLY_IND $1. Variable $32. Grouping $50. Count 8. Percent 8.;
ANLY_IND = "&type.";
Variable = "&var_name.";
set temp1;
Grouping=input(&var., $50.);
%if (&type. in (1, 4, 5)) %then %do;
if grouping = "" then grouping = "Missing";
else grouping = "Populated";
%end;
percent=percent/100; * Stored as decimals instead of numbers, with format applied;
format percent PERCENTN12.4;
drop &var.;
run;
%put &var_name;
*Append datasets;
proc append data=temp2 base=&dsetout. force;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 3/4 - Sub Step &i./&&COUNT_ANLY_&type..";
run;
/* drop temp tables */
proc datasets nodetails nolist;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 3/4 - Sub Step &i./&&COUNT_ANLY_&type..";
delete temp1 temp2;
quit;
*Increment counter;
%let i=%eval(&i.+1);
%end;
/* Left join label onto dataset and store in perm location */
Proc SQL;
SYSECHO "One Way Summary - ANLY_IND &type. - Step 4/4";
Create table output.&dsetout. (label="Expln=&Expiry_Months.m" Alter="IB") as
Select A.*
,B.Label
From &dsetout. A
Left Join output.Contents_Summary B
on A.Variable = B.Variable
;
Quit;
%end;
%Mend;
%Macro Combine_Datasets;
proc format;
value $ Grouping_FMT (multilabel)
' ' = "Missing"
;
run;
data output.Frequency_Summary (label="Expln=&Expiry_Months.m" Alter="IB");
SYSECHO "Combine Datasets - Step 1/1";
length ANLY_IND $1. Variable $32. Label $256. Grouping $50. Count 8. Percent 8.;
set
%do i = 2 %to 6;
%if %sysfunc(exist(output.Frequency_&i)) %then %do;
output.Frequency_&i
%end;
%else %put Dataset Frequency_&i does not exist.;
%end;
;
Format Grouping Grouping_FMT.;
run;
%Mend Combine_Datasets;
%Macro Sleep(Buffer /* Number of minutes before 21:00 to pause code */);
%if %sysfunc(upcase(&sleep.)) = Y %then %do;
/* Sleep Code to remove change of 9pm error when running in nightqueue */
data _null_;
SYSECHO "Night Queue Test - &sleep.mins before 9:00pm";
timediff=input('22:00:00',time8.)-datetime()+date()*24*60*60;
timediff_nine=(input('21:00:00',time8.)-datetime()+date()*24*60*60)/60;
/* If Mins to 9:00 pm Less than the Buffer then sleep until 10:00 pm */
IF timediff_nine <= &Buffer. then
do;
sleeptime=timediff;
call sleep(sleeptime,1);
end;
run;
%end;
%Mend;
/* This macro will allow you to step through the lowercase letters of */
/* the alphabet on a %DO loop. */
%macro iterm(itr /* Iteration Number to convert to letter */);
%Global countvar_&itr.;
%let lst=a b c d e f g h i j k l m n o p q r s t u v w x y z
aa ab ac ad ae af ag ah ai aj ak al am an ao ap aq ar as at au av aw ax ay az;
%let countvar_&itr. = %scan(&lst.,&itr.);
%mend;
/*----------------------------------------------------------------------------*/
/*-- Define where the program is located -------------------------------------*/
/*----------------------------------------------------------------------------*/
/*Store log for each run*/
%Macro Output_Log;
%if %sysfunc(upcase(&output_log.))=Y %then %do;
proc printto log="&Out_Libname./&Refi_DSN./&Refi_DSN..log";
run;
%end;
%else %do;
%put No log output.;
%end;
%Mend Output_Log;
%Output_Log;
%macro LIBNAMES;
/*-- Setup libname locations based on the library type used --*/
%IF %SYSFUNC(lowcase(&Lib_Type.))=standard %THEN %DO;
LIBNAME refi_lib "&Refi_Libname." ACCESS=READONLY;
%END;
%ELSE %IF %SYSFUNC(lowcase(&Lib_Type.))=meta %THEN %DO;
LIBNAME refi_lib META LIBRARY="&Refi_Libname." METAOUT=data ACCESS=READONLY;
%END;
/* Location of output files - this can also create a folder */
options DLCREATEDIR; /* Turn on function to create library if no-exist */
LIBNAME output "&Out_Libname./&Refi_DSN.";
options NODLCREATEDIR; /* Turn off function to create library if exist */
%Mend LIBNAMES;
%LIBNAMES;
%sleep(120);
/*----------------------------------------------------------------------------*/
/*-- Contents of the REFI metadata and dataset -------------------------------*/
/*----------------------------------------------------------------------------*/
proc sql noprint;
SYSECHO "Key Statistics - Step 1/21";
%*-- List the SAS Metadata for this dataset --;
create table Contents_Listing as
select Name, Type, Length, strip(Format) as Format,
strip(Informat) as InFormat, strip(Label) as Label, varnum
from dictionary.columns
where libname = upcase('Refi_Lib') and memname = upcase("&Refi_DSN.")
order by upcase(Name);
%put Dataset has &sqlobs rows;
quit;
/*-- Custom dataset contents --*/
proc print data=Contents_Listing label uniform width=minimum noobs;
SYSECHO "Key Statistics - Step 2/21";
title "Contents of &Refi_DSN.";
run;
/*----------------------------------------------------------------------------*/
/*-- Look for various possible key columns -----------------------------------*/
/*----------------------------------------------------------------------------*/
%let Count_All_Vars =;
proc sql noprint stimer;
SYSECHO "Key Statistics - Step 3/21";
%*-- Count non-missing and unique rows in the dataset. --------------------;
%*-- Prepare the SQL statements to perform the counts. --------------------;
%*-- Handle issue where SAS variable name would be longer than 32 chars ---;
select "count(" || strip(Name) || ") as N_" ||
substr(left(Name),1,min(29, length(strip(Name)))) ||
", count(distinct " || strip(Name) || ") as ND_" ||
substr(left(Name),1,min(29, length(strip(Name))))
into :Count_All_Vars separated by ", "
from dictionary.columns
where libname = upcase('Refi_Lib') and memname = upcase("&Refi_DSN");
%put Count_All_Vars has &sqlobs rows;
quit;
/* Perform the dataset counts prepared above */
proc sql noprint stimer;
SYSECHO "Key Statistics - Step 4/21";
create table key_values_0 as
select count(*) as All_Rows
,&Count_All_Vars.
from refi_lib.&Refi_DSN.
%put Dataset has &sqlobs rows;
quit;
/*----------------------------------------------------------------------------*/
/*-- Create a more useable dataset for Key Column analysis -------------------*/
/*----------------------------------------------------------------------------*/
/*-- Exchange rows and columns --*/
proc transpose data=key_values_0 out=key_values_0_T;
SYSECHO "Key Statistics - Step 5/21";
run;
%sleep(120);
/*-- Derive other column counts from the transposed SQL results --*/
data key_values;
SYSECHO "Key Statistics - Step 6/21";
/*-- Order the variables and assign labels --*/
label
Variable=""
Count="Number of Rows Filled"
Filled="Percent of Rows Filled"
NMiss="Number of Rows Missing"
Miss_Pct="Percent of Rows Missing"
Unique="Number of Distinct Values"
Unique_Pct="Unique Percentage of Filled Rows"
Unique_Pct_All="Unique Percentage of All Rows"
Stats="Stats"
;
set key_values_0_T (rename=(_name_=Varname col1=Num));
length Variable $ 40 Stats $ 1;
retain Row_Count Variable Count NMiss;
drop Row_Count Varname Num;
format Filled Unique_Pct Miss_Pct Unique_Pct_All PERCENTN12.4;
/*-- Default value is N = No Statistics --*/
Stats = "N";
/*-- The first OBS contains the overall totals --*/
if _N_=1 then do;
Variable = Varname;
Count = Num;
Filled = 1;
Unique = Num;
Unique_Pct = 1;
Unique_Pct_All = 1;
NMiss = 0;
Miss_Pct = 0;
Row_Count = Num;
output;
end;
/*------------------------------------------------------------------------*
* Besides the first row, other rows contain, in order, records with the
* non-missing count, followed by another with the unique count.
* Other measures are derived from this data.
*------------------------------------------------------------------------*/
/*-- Got a new Variable ... Set the non-missing counts --*/
if (substr(Varname, 1, 2) = "N_") then do;
Variable = substr(Varname, 3);
Count = Num;
NMiss = Row_Count - Count;
end;
/*-- This row has unique counts, and most measures are computed here --*/
if (substr(Varname, 1, 3) = "ND_") then do;
Filled = Count / Row_Count;
Unique = Num;
/*-- This data is NOT all Missing values --*/
if (Count ^= 0) then do;
Unique_Pct = Num / Count;
Unique_Pct_All = Num / Row_Count;
Miss_Pct = NMiss / Row_Count;
end;
/*-- This data IS all Missing values, and cannot be summarized --*/
else do;
Unique_Pct = 0;
Unique_Pct_All = 0;
Miss_Pct = 1;
end;
/*--------------------------------------------------------------------*
* This criteria decides which variables get detailed statistics.
* The threshholds may need tweaking (at initialisation stages),
depending on your data.
*--------------------------------------------------------------------*/
if (Unique_Pct_All <= &Max_Distinct_Pct_All. and Unique < &Max_N_Distinct. and Count ^= 0) then
Stats = "Y";
output;
end;
run;
proc sort data=key_values;
SYSECHO "Key Statistics - Step 7/21";
by descending Count descending NMiss descending Unique Variable;
run;
/*-- The analysis so far, before the merge with the Contents dataset --*/
proc print data=key_values label uniform width=minimum noobs;
SYSECHO "Key Statistics - Step 8/21";
title "Counts and Uniqueness for Variables in &Refi_DSN.";
run;
/*-- Combine the Contents dataset with the Key Values analysis above ---------*/
proc sql noprint;
SYSECHO "Key Statistics - Step 9/21";
create table Contents_Summary as
select b.*, a.type, a.length, a.Format, a.InFormat, a.Label,
a.varnum
from Contents_Listing a, key_values b
where upcase(a.name) = upcase(b.Variable);
%put Dataset has &sqlobs rows;
quit;
/*--------------------------------------------------------------------*
* Pull in existing dataset and overwrite with additional indicator
* Create the analysis indicator, which will be used to drive which
monitoring tests are to be carried out on each variable. See below:
* Anly_ind = 1 Numeric - Continuous data
* Anly_ind = 2 Numeric - Segment/Indicator data
* Anly_ind = 3 Character - Segment/Indicator data
* Anly_ind = 4 Character - Continuous data
* Anly_ind = 5 Numeric - Date type data
* Anly_ind = 6 Other - Catchall - Must Explore
*--------------------------------------------------------------------*/
/* Macro to pull in and initialise formats from the formats table */
%macro initialise_formats;
/* Count the number of formats within the table matching the datetime criteria */
%global count_format;
Proc Sql noprint;
SYSECHO "Key Statistics - Step 10/21";
Select Count(*)
into :count_format
From form.&format_list.
where TYPE='datetime';
Quit;
/* Create N number of &FORMAT_i. variables to insert formats into */
%let count_format = %left(&count_format.);
%do i = 1 %to &count_format.;
%global FORMAT_&i.;
%end;
/* Insert formats from table into macro variables created in above process with quotation marks */
PROC SQL NOPRINT;
SYSECHO "Key Statistics - Step 11/21";
Select QUOTE(STRIP(SCAN_CRITERIA))
Into :FORMAT_1 - :FORMAT_&count_format.
From form.&format_list.
Where TYPE='datetime';
;
QUIT;
%put Formats Assigned to Macro Variables; %put;
%mend initialise_formats;
%initialise_formats;
/* Macro to execute the find process. Kept at macro level so we can use the &i. macro variable. */
%macro find;
/* If string cannot be found, then search = 0, else 1. */
%do i = 1 %to &count_format.;
search = max(find(Format, &&FORMAT_&i..), search);
%end;
%mend find;
data output.Contents_Summary (label="Expln=&Expiry_Months.m" Alter="IB" drop=search);
SYSECHO "Key Statistics - Step 12/21";
set Contents_Summary;
label
Anly_ind = "Data Type [Numeric/Character/Date/Other] and [Continuous/Discrete]"
;
/* Initialise the search variable to 0 then invoke the find macro */
search = 0;
%find;
/* Analysis indicator hierachy. This is used to determine which monitoring checks
to perform */
if (lowcase(type) = "num" AND search ^= 0) then Anly_ind = 5;
else if (lowcase(type) = "num" AND upcase(Stats) = "N") then Anly_ind = 1;
else if (lowcase(type) = "num" AND upcase(Stats) = "Y") then Anly_ind = 2;
else if (lowcase(type) = "char" AND upcase(Stats) = "Y") then Anly_ind = 3;
else if (lowcase(type) = "char" AND upcase(Stats) = "N") then Anly_ind = 4;
else Anly_ind = 6;
run;
/* Steps to group variables into macro variable to be used for processing later */
%global ANLY_IND_1_CSV ANLY_IND_1_SPACE
ANLY_IND_2_CSV ANLY_IND_2_SPACE
ANLY_IND_3_CSV ANLY_IND_3_SPACE
ANLY_IND_4_CSV ANLY_IND_4_SPACE
ANLY_IND_5_CSV ANLY_IND_5_SPACE
ANLY_IND_6_CSV ANLY_IND_6_SPACE
CLASS_VARS_CSV CLASS_VARS_SPACE
COUNT_ANLY_1 COUNT_ANLY_2
COUNT_ANLY_3 SEG_COUNT;
PROC SQL noprint;
SYSECHO "Key Statistics - Step 13/21";
%*-- Put all variables with ANLY_IND = 1 into macro variable for later --;
select Variable
,Variable
into :ANLY_IND_1_CSV separated by ", ",
:ANLY_IND_1_SPACE separated by " "
from output.Contents_Summary
where Anly_ind = 1;
%put ANLY_IND_1_CSV and ANLY_IND_1_SPACE have &sqlobs rows;
%*-- Put all variables with ANLY_IND = 2 into macro variable for later --;
select Variable
,Variable
into :ANLY_IND_2_CSV separated by ", ",
:ANLY_IND_2_SPACE separated by " "
from output.Contents_Summary
where Anly_ind = 2;
%put ANLY_IND_2_CSV and ANLY_IND_2_SPACE have &sqlobs rows;
%*-- Put all variables with ANLY_IND = 3 into macro variable for later --;
select Variable
,Variable
into :ANLY_IND_3_CSV separated by ", ",
:ANLY_IND_3_SPACE separated by " "
from output.Contents_Summary
where Anly_ind = 3;
%put ANLY_IND_3_CSV and ANLY_IND_3_SPACE have &sqlobs rows;
%*-- Put all variables with ANLY_IND = 4 into macro variable for later --;
select Variable
,Variable
into :ANLY_IND_4_CSV separated by ", ",
:ANLY_IND_4_SPACE separated by " "
from output.Contents_Summary
where Anly_ind = 4;
%put ANLY_IND_4_CSV and ANLY_IND_4_SPACE have &sqlobs rows;
%*-- Put all variables with ANLY_IND = 5 into macro variable for later --;
select Variable
,Variable
into :ANLY_IND_5_CSV separated by ", ",
:ANLY_IND_5_SPACE separated by " "
from output.Contents_Summary
where Anly_ind = 5;
%put ANLY_IND_5_CSV and ANLY_IND_5_SPACE have &sqlobs rows;
%*-- Put all variables with ANLY_IND = 6 into macro variable for later --;
select Variable
,Variable
into :ANLY_IND_6_CSV separated by ", ",
:ANLY_IND_6_SPACE separated by " "
from output.Contents_Summary
where Anly_ind = 6;
%put ANLY_IND_6_CSV and ANLY_IND_6_SPACE have &sqlobs rows;
%*-- Put all class variables into macro variable for later --;
select Variable
,Variable
into :CLASS_VARS_CSV separated by ", ",
:CLASS_VARS_SPACE separated by " "
from output.Contents_Summary
where upcase(Stats) = "Y";
%put CLASS_VARS_CSV and CLASS_VARS_SPACE have &sqlobs rows;
QUIT;
%macro Initialise_Means_Anly_1;
/* Count the number of variables within the table matching the STATS = "Y" criteria */
Proc Sql noprint;
SYSECHO "Key Statistics - Step 14/21";
Select Count(*)
into :SEG_COUNT
From output.Contents_Summary
where STATS='Y';
Quit;
/* Create N number of &ANLY1_i. variables to insert variable names into */
%let SEG_COUNT = %left(&SEG_COUNT.);
%do i = 1 %to &SEG_COUNT.;
%global ANLY1_&i.;
%end;
/* Inserts Variable from table into macro variables created in above process */
PROC SQL noprint;
SYSECHO "Key Statistics - Step 15/21";
Select STRIP(VARIABLE)
Into :ANLY1_1 - :ANLY1_&SEG_COUNT.
From output.Contents_Summary
where STATS='Y'
;
QUIT;
%mend Initialise_Means_Anly_1;
%Initialise_Means_Anly_1;
%macro Anly_1_Variables;
/* Count the number of variables within the table matching the ANLY_IND = 1 criteria */
Proc Sql noprint;
SYSECHO "Key Statistics - Step 16/21";
Select Count(*)
into :COUNT_ANLY_1
From output.Contents_Summary
where ANLY_IND=1;
Quit;
/* Create N number of ANLY_IND_1_VAR&i. variables to insert variable names into */
%let COUNT_ANLY_1 = %left(&COUNT_ANLY_1.);
%do i = 1 %to &COUNT_ANLY_1.;
%global ANLY_IND_1_VAR&i.;
%end;
/* Inserts Variable from table into macro variables created in above process */
PROC SQL noprint;
SYSECHO "Key Statistics - Step 17/21";
Select STRIP(VARIABLE)
Into :ANLY_IND_1_VAR1 - :ANLY_IND_1_VAR&COUNT_ANLY_1.
From output.Contents_Summary
where ANLY_IND=1;
QUIT;
%mend Anly_1_Variables;
%Anly_1_Variables;
%macro Anly_2_Variables;
/* Count the number of variables within the table matching the ANLY_IND = 2 criteria */
Proc Sql noprint;
SYSECHO "Key Statistics - Step 18/21";
Select Count(*)
into :COUNT_ANLY_2
From output.Contents_Summary
where ANLY_IND=2;
Quit;
/* Create N number of ANLY_IND_2_VAR&i. variables to insert variable names into */
%let COUNT_ANLY_2 = %left(&COUNT_ANLY_2.);
%do i = 1 %to &COUNT_ANLY_2.;
%global ANLY_IND_2_VAR&i.;
%end;
/* Inserts Variable from table into macro variables created in above process */
PROC SQL noprint;
SYSECHO "Key Statistics - Step 19/21";
Select STRIP(VARIABLE)
Into :ANLY_IND_2_VAR1 - :ANLY_IND_2_VAR&COUNT_ANLY_2.
From output.Contents_Summary
where ANLY_IND=2;
QUIT;
%mend Anly_2_Variables;
%Anly_2_Variables;
%macro Anly_3_Variables;
/* Count the number of variables within the table matching the ANLY_IND = 3 criteria */
Proc Sql noprint;
SYSECHO "Key Statistics - Step 20/21";
Select Count(*)
into :COUNT_ANLY_3
From output.Contents_Summary
where ANLY_IND=3;
Quit;
/* Create N number of ANLY_IND_3_VAR&i. variables to insert variable names into */
%let COUNT_ANLY_3 = %left(&COUNT_ANLY_3.);
%do i = 1 %to &COUNT_ANLY_3.;
%global ANLY_IND_3_VAR&i.;
%end;
/* Inserts Variable from table into macro variables created in above process */
PROC SQL noprint;
SYSECHO "Key Statistics - Step 21/21";
Select STRIP(VARIABLE)
Into :ANLY_IND_3_VAR1 - :ANLY_IND_3_VAR&COUNT_ANLY_3.
From output.Contents_Summary
where ANLY_IND=3;
QUIT;
%mend Anly_3_Variables;
%Anly_3_Variables;
/* Call macro that will perform the percentile summary. This will breakdown all variables
with an ANLY_IND = 1 value into percentiles and perform N, NMISS, MIN, MAX, SUM, STD metrics on */
%Percentile_Summary;
/* Call macro that will populate summary tables for the all variables */
/* This macro is to be used for ANLY_IND = 2,3,4,5 values */
%one_way_summary(refi_lib.&Refi_DSN., &ANLY_IND_2_SPACE., Frequency_2, 2);
%one_way_summary(refi_lib.&Refi_DSN., &ANLY_IND_3_SPACE., Frequency_3, 3);
%one_way_summary(refi_lib.&Refi_DSN., &ANLY_IND_4_SPACE., Frequency_4, 4);
%one_way_summary(refi_lib.&Refi_DSN., &ANLY_IND_5_SPACE., Frequency_5, 5);
%one_way_summary(refi_lib.&Refi_DSN., &ANLY_IND_6_SPACE., Frequency_6, 6);
/* This macro combines all one way analyses conducted */
%Combine_Datasets;
/*----------------------------------------------------------------------------*/
/*-- E N D -------------------------------------------------------------------*/
/*----------------------------------------------------------------------------*/ options obs=max fullstimer mprint symbolgen mlogic compress=binary varinitchk=error MINOPERATOR;
/* Input data */
%let Start_Date=01APR2019; /* Start date to loop data through */
%let Mth_Count=2; /* Number of months to monitor */
%let dset_name = strategic_data_YYYYMM; /* Case sensitive when creating libnames */
%let replace_string = YYYYMM; /* Specify what part of string above to replace with date */
%let Tag = strategic_data; /* Output Dataset Name */
/* Setup locations */
%let Location = /team/customer/James/Final_Strategic_data_04; /* Input Dataset Locations */
%let Out_Location = /team/customer/James/MARCH_APRIL; /* Output Dataset Location */
/* Libname setup for MoM analysis */
%Macro Libsetup;
/* Setup input locations for each month*/
%do i = 1 %to &Mth_Count.;
%let Date_ddmmmyyyy = %SysFunc(Putn(%SysFunc(INTNX(Month,"&Start_Date."d,%eval(1-&i.))),date9.));
%let Date_yyyy = %eval(%sysfunc(putn(%SYSFUNC(INTNX(MONTH,"&Date_ddmmmyyyy."d,0)),yymmn6.)));
%global dset_name_&i.;
%let dset_name_&i. = %SYSFUNC(TRANWRD(&dset_name., &replace_string., &Date_yyyy.));
Libname Lib_&i. "&Location./&&dset_name_&i.." ACCESS=READONLY;
%end;
/* Setup output location */
Libname lib_out "&Out_Location.";
%Mend Libsetup;
%Libsetup;
/* Pull all variables and groupings into one dataset and de-duplicate. This is to ensure all
variables and groupings are captured, despite if they exist on one dataset and not on another */
%Macro Contents_Dedup;
/* Pull in all data from different datasets */
data Contents_data;
set
%do i = 1 %to &Mth_Count.;
Lib_&i..contents_summary (keep=Variable Label)
%end;
;
Variable = UPCASE(Variable);
run;
/* De-duplicate data and sort by variable */
proc sort data=Contents_data nodupkey
Out=Contents_deduped;
by Variable;
run;
%Mend Contents_Dedup;
%Contents_Dedup;
/* Extract all data from various folders and concatenate together */
%macro Contents_Data_Merge;
%do i = 1 %to &Mth_Count.;
%iterm(&i.); /* Convert do loop counter to a character */
%end;
/* Pull in all data into one dataset - merge onto deduped data create above */
Proc SQL;
Create table MoM_Contents_Merge as
Select Contents_deduped.Variable as Variable_0
,Contents_deduped.Label as Label_0
%do i = 1 %to &Mth_Count.;
,&&countvar_&i...Variable as Variable_&i.
,&&countvar_&i...Count as Count_&i.
,&&countvar_&i...Filled as Filled_&i.
,&&countvar_&i...NMiss as NMiss_&i.
,&&countvar_&i...Miss_Pct as Miss_Pct_&i.
,&&countvar_&i...Unique as Unique_&i.
,&&countvar_&i...Unique_Pct as Unique_Pct_&i.
,&&countvar_&i...Unique_Pct_All as Unique_Pct_All_&i.
,&&countvar_&i...Anly_ind as Anly_ind_&i.
%end;
From Contents_deduped
%do i = 1 %to &Mth_Count.;
Left Join Lib_&i..contents_summary &&countvar_&i..
on UPCASE(Contents_deduped.Variable) = UPCASE(&&countvar_&i...Variable)
%end;
;
Quit;
%mend Contents_Data_Merge;
%Contents_Data_Merge;
/* Create comparison data */
%macro Contents_Comparison;
Data lib_out.&Tag._Contents;
set MoM_Contents_Merge; /* Pull in the merged data and calculate differences */
/* Multiple do loops used below to control the output structure */
/* Count - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
Count_diff_&i. = Count_&i. - Count_%eval(&i.+1);
label Count_diff_&i. = "Count: Number of differences";
%end;
%do i = 1 %to (&Mth_Count. - 1);
Count_pct_diff_&i. = Count_diff_&i./Count_%eval(&i.+1);
label Count_pct_diff_&i. = "Count: Percent difference";
Format Count_pct_diff_&i. PERCENTN12.4;
%end;
/* Filled - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
Filled_diff_&i. = Filled_&i. - Filled_%eval(&i.+1);
label Filled_diff_&i. = "Filled: Difference";
Format Filled_diff_&i. PERCENTN12.4;
%end;
/* NMiss - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
NMiss_diff_&i. = NMiss_&i. - NMiss_%eval(&i.+1);
label NMiss_diff_&i. = "NMiss: Number of differences";
%end;
%do i = 1 %to (&Mth_Count. - 1);
NMiss_pct_diff_&i. = NMiss_diff_&i./NMiss_%eval(&i.+1);
label NMiss_pct_diff_&i. = "NMiss: Percent difference";
Format NMiss_pct_diff_&i. PERCENTN12.4;
%end;
/* Miss_Pct - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
Miss_Pct_diff_&i. = Miss_Pct_&i. - Miss_Pct_%eval(&i.+1);
label Miss_Pct_diff_&i. = "Miss_Pct: Difference";
Format Miss_Pct_diff_&i. PERCENTN12.4;
%end;
/* Unique - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
Unique_diff_&i. = Unique_&i. - Unique_%eval(&i.+1);
label Unique_diff_&i. = "Unique: Number of differences";
%end;
%do i = 1 %to (&Mth_Count. - 1);
Unique_pct_diff_&i. = Unique_diff_&i./Unique_%eval(&i.+1);
label Unique_pct_diff_&i. = "Unique: Percent difference";
Format Unique_pct_diff_&i. PERCENTN12.4;
%end;
/* Unique_Pct - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
Unique_Pct_diff2_&i. = Unique_Pct_&i. - Unique_Pct_%eval(&i.+1);
label Unique_Pct_diff2_&i. = "Unique_Pct: Difference";
Format Unique_Pct_diff2_&i. PERCENTN12.4;
%end;
/* Unique_Pct_All - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
Unique_Pct_All_diff_&i. = Unique_Pct_All_&i. - Unique_Pct_All_%eval(&i.+1);
label Unique_Pct_All_diff_&i. = "Unique_Pct_All: Difference";
Format Unique_Pct_All_diff_&i. PERCENTN12.4;
%end;
/* Anly_ind - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
Anly_ind_diff_&i. = Anly_ind_&i. - Anly_ind_%eval(&i.+1);
label Anly_ind_diff_&i. = "Anly_ind: Difference";
%end;
run;
%mend Contents_Comparison;
%Contents_Comparison; options obs=max fullstimer mprint symbolgen mlogic compress=binary varinitchk=error MINOPERATOR;
/* Input data */
%let Start_Date=01NOV2018; /* Start date to loop data through */
%let Mth_Count=18; /* Number of months to monitor */
%let dset_name = values_YYYYMM; /* Case sensitive when creating libnames */
%let replace_string = YYYYMM; /* Specify what part of string above to replace with date */
%let Tag = VALUES; /* Output Dataset Name */
/* Setup locations */
%let Location = /team/customer/Simplification/Internal Bureau Remediation/Data/Profiling/Profiling Outputs; /* Input Dataset Locations */
%let Out_Location = /team/customer/Simplification/Internal Bureau Remediation/Data/Monitoring/201802; /* Output Dataset Location */
/* Libname setup for MoM analysis */
%Macro Libsetup;
/* Setup input locations for each month*/
%do i = 1 %to &Mth_Count.;
%let Date_ddmmmyyyy = %SysFunc(Putn(%SysFunc(INTNX(Month,"&Start_Date."d,%eval(1-&i.))),date9.));
%let Date_yyyy = %eval(%sysfunc(putn(%SYSFUNC(INTNX(MONTH,"&Date_ddmmmyyyy."d,0)),yymmn6.)));
%global dset_name_&i.;
%let dset_name_&i. = %SYSFUNC(TRANWRD(&dset_name., &replace_string., &Date_yyyy.));
Libname Lib_&i. "&Location./&&dset_name_&i.." ACCESS=READONLY;
%end;
/* Setup output location */
Libname lib_out "&Out_Location.";
%Mend Libsetup;
%Libsetup;
/* Pull all variables and groupings into one dataset and de-duplicate. This is to ensure all
variables and groupings are captured, despite if they exist on one dataset and not on another */
%Macro Percentile_Dedup;
/* Pull in all data from different datasets */
data Percentile_data;
set
%do i = 1 %to &Mth_Count.;
Lib_&i..percentile_summary (keep=Variable Grouping Label)
%end;
;
Variable = UPCASE(Variable);
run;
/* De-duplicate data and sort by variable then grouping */
proc sort data=Percentile_data nodupkey
Out=Percentile_deduped;
by Variable Grouping;
run;
%Mend Percentile_Dedup;
%Percentile_Dedup;
/* Extract all data from various folders and concatenate together */
%macro Percentile_Data_Merge;
%do i = 1 %to &Mth_Count.;
%iterm(&i.); /* Convert do loop counter to a character */
%end;
/* Pull in all data into one dataset - merge onto deduped data create above */
Proc SQL;
Create table MoM_Percentile_Merge as
Select Percentile_deduped.Variable as Variable_0
,Percentile_deduped.Label as Label_0
,Percentile_deduped.Grouping as Grouping_0
%do i = 1 %to &Mth_Count.;
,&&countvar_&i...Variable as Variable_&i.
,&&countvar_&i...Grouping as Grouping_&i.
,&&countvar_&i..._TYPE_ as Type_&i.
,&&countvar_&i..._FREQ_ as Frequency_&i.
,&&countvar_&i...N as N_&i.
,&&countvar_&i...NMISS as NMISS_&i.
,&&countvar_&i...MIN as MIN_&i.
,&&countvar_&i...MAX as MAX_&i.
,&&countvar_&i...MEAN as MEAN_&i.
,&&countvar_&i...SUM as SUM_&i.
,&&countvar_&i...STD as STD_&i.
%end;
From Percentile_deduped
%do i = 1 %to &Mth_Count.;
Left Join Lib_&i..percentile_summary &&countvar_&i..
on (UPCASE(Percentile_deduped.Variable) = UPCASE(&&countvar_&i...Variable)) AND
(UPCASE(Percentile_deduped.GROUPING) = UPCASE(&&countvar_&i...GROUPING))
%end;
;
Quit;
%mend Percentile_Data_Merge;
%Percentile_Data_Merge;
/* Create comparison data */
%macro Percentile_Comparison;
Data lib_out.&Tag._Percentile;
set MoM_Percentile_Merge; /* Pull in the merged data and calculate differences */
/* Multiple do loops used below to control the output structure */
/* N - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
N_diff_&i. = N_&i. - N_%eval(&i.+1);
label N_diff_&i. = "N: Number of differences";
%end;
/* N - % Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
N_pct_diff_&i. = N_diff_&i./N_%eval(&i.+1);
label N_pct_diff_&i. = "N: Percent difference";
Format N_pct_diff_&i. PERCENTN12.4;
%end;
/* NMISS - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
NMISS_diff_&i. = NMISS_&i. - NMISS_%eval(&i.+1);
label NMISS_diff_&i. = "NMISS: Number of differences";
%end;
/* NMISS - % Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
NMISS_pct_diff_&i. = NMISS_diff_&i./NMISS_%eval(&i.+1);
label NMISS_pct_diff_&i. = "NMISS: Percent difference";
Format NMISS_pct_diff_&i. PERCENTN12.4;
%end;
/* MIN - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
MIN_diff_&i. = MIN_&i. - MIN_%eval(&i.+1);
label MIN_diff_&i. = "MIN: Number of differences";
%end;
/* MIN - % Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
MIN_pct_diff_&i. = MIN_diff_&i./MIN_%eval(&i.+1);
label MIN_pct_diff_&i. = "MIN: Percent difference";
Format MIN_pct_diff_&i. PERCENTN12.4;
%end;
/* MAX - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
MAX_diff_&i. = MAX_&i. - MAX_%eval(&i.+1);
label MAX_diff_&i. = "MAX: Number of differences";
%end;
/* MAX - % Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
MAX_pct_diff_&i. = MAX_diff_&i./MAX_%eval(&i.+1);
label MAX_pct_diff_&i. = "MAX: Percent difference";
Format MAX_pct_diff_&i. PERCENTN12.4;
%end;
/* MAX - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
MAX_diff_&i. = MAX_&i. - MAX_%eval(&i.+1);
label MAX_diff_&i. = "MAX: Number of differences";
%end;
/* MAX - % Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
MAX_pct_diff_&i. = MAX_diff_&i./MAX_%eval(&i.+1);
label MAX_pct_diff_&i. = "MAX: Percent difference";
Format MAX_pct_diff_&i. PERCENTN12.4;
%end;
/* MEAN - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
MEAN_diff_&i. = MEAN_&i. - MEAN_%eval(&i.+1);
label MEAN_diff_&i. = "MEAN: Number of differences";
%end;
/* MEAN - % Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
MEAN_pct_diff_&i. = MEAN_diff_&i./MEAN_%eval(&i.+1);
label MEAN_pct_diff_&i. = "MEAN: Percent difference";
Format MEAN_pct_diff_&i. PERCENTN12.4;
%end;
/* SUM - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
SUM_diff_&i. = SUM_&i. - SUM_%eval(&i.+1);
label SUM_diff_&i. = "SUM: Number of differences";
%end;
/* SUM - % Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
SUM_pct_diff_&i. = SUM_diff_&i./SUM_%eval(&i.+1);
label SUM_pct_diff_&i. = "SUM: Percent difference";
Format SUM_pct_diff_&i. PERCENTN12.4;
%end;
/* STD - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
STD_diff_&i. = STD_&i. - STD_%eval(&i.+1);
label STD_diff_&i. = "STD: Number of differences";
%end;
/* STD - % Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
STD_pct_diff_&i. = STD_diff_&i./STD_%eval(&i.+1);
label STD_pct_diff_&i. = "STD: Percent difference";
Format STD_pct_diff_&i. PERCENTN12.4;
%end;
run;
%mend Percentile_Comparison;
%Percentile_Comparison; options obs=max fullstimer mprint symbolgen mlogic compress=binary varinitchk=error MINOPERATOR;
/* Input data */
%let Start_Date=01NOV2018; /* Start date to loop data through */
%let Mth_Count=12; /* Number of months to monitor */
%let dset_name = values_YYYYMM; /* Case sensitive when creating libnames */
%let replace_string = YYYYMM; /* Specify what part of string above to replace with date */
%let Tag = VALUES; /* Output Dataset Name */
/* Setup locations */
%let Location = /team/customer/Simplification/Internal Bureau Remediation/Data/Profiling/Profiling Outputs; /* Input Dataset Locations */
%let Out_Location = /team/customer/Simplification/Internal Bureau Remediation/Data/Monitoring/201802; /* Output Dataset Location */
/* Libname setup for MoM analysis */
%Macro Libsetup;
/* Setup input locations for each month*/
%do i = 1 %to &Mth_Count.;
%let Date_ddmmmyyyy = %SysFunc(Putn(%SysFunc(INTNX(Month,"&Start_Date."d,%eval(1-&i.))),date9.));
%let Date_yyyy = %eval(%sysfunc(putn(%SYSFUNC(INTNX(MONTH,"&Date_ddmmmyyyy."d,0)),yymmn6.)));
%global dset_name_&i.;
%let dset_name_&i. = %SYSFUNC(TRANWRD(&dset_name., &replace_string., &Date_yyyy.));
Libname Lib_&i. "&Location./&&dset_name_&i.." ACCESS=READONLY;
%end;
/* Setup output location */
Libname lib_out "&Out_Location.";
%Mend Libsetup;
%Libsetup;
/* Pull all variables and groupings into one dataset and de-duplicate. This is to ensure all
variables and groupings are captured, despite if they exist on one dataset and not on another */
%Macro frequency_Dedup;
/* Pull in all data from different datasets */
data frequency_data;
set
%do i = 1 %to &Mth_Count.;
Lib_&i..frequency_summary (keep=Variable Grouping Label)
%end;
;
Variable = UPCASE(Variable);
run;
/* De-duplicate data and sort by variable then grouping */
proc sort data=frequency_data nodupkey
Out=frequency_deduped;
by Variable Grouping;
run;
%Mend frequency_Dedup;
%frequency_Dedup;
/* Extract all data from various folders and concatenate together */
%macro Frequency_Data_Merge;
%do i = 1 %to &Mth_Count.;
%iterm(&i.); /* Convert do loop counter to a character */
%end;
/* Pull in all data into one dataset - merge onto deduped data create above */
Proc SQL;
Create table MoM_frequency_Merge as
Select frequency_deduped.Variable as Variable_0
,frequency_deduped.Label as Label_0
,frequency_deduped.Grouping as Grouping_0
%do i = 1 %to &Mth_Count.;
,&&countvar_&i...Variable as Variable_&i.
,&&countvar_&i...Label as Label_&i.
,&&countvar_&i...Grouping as Grouping_&i.
,&&countvar_&i...Count as Count_&i.
,&&countvar_&i...Percent as Percent_&i.
,&&countvar_&i...ANLY_IND as Anly_Ind_&i.
%end;
From frequency_deduped
%do i = 1 %to &Mth_Count.;
Left Join Lib_&i..frequency_summary &&countvar_&i..
on (UPCASE(frequency_deduped.Variable) = UPCASE(&&countvar_&i...Variable)) AND
(UPCASE(frequency_deduped.GROUPING) = UPCASE(&&countvar_&i...GROUPING))
%end;
;
Quit;
%mend Frequency_Data_Merge;
%Frequency_Data_Merge;
/* Create comparison data */
%macro Frequency_Comparison;
Data lib_out.&Tag._Frequency;
set MoM_Frequency_Merge; /* Pull in the merged data and calculate differences */
/* Multiple do loops used below to control the output structure */
/* Count - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
Count_diff_&i. = Count_&i. - Count_%eval(&i.+1);
label Count_diff_&i. = "Count: Number of differences";
%end;
/* Count - % Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
Count_pct_diff_&i. = Count_diff_&i./Count_%eval(&i.+1);
label Count_pct_diff_&i. = "Count: Percent difference";
Format Count_pct_diff_&i. PERCENTN12.4;
%end;
/* Percent - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
Percent_diff_&i. = Percent_&i. - Percent_%eval(&i.+1);
label Percent_diff_&i. = "Percent: Number of differences";
Format Percent_diff_&i. PERCENTN12.4;
%end;
/* Anly_ind - Differences Calculation */
%do i = 1 %to (&Mth_Count. - 1);
Anly_ind_diff_&i. = Anly_ind_&i. - Anly_ind_%eval(&i.+1);
label Anly_ind_diff_&i. = "Anly_ind: Difference";
%end;
run;
%mend Frequency_Comparison;
%Frequency_Comparison;
... View more