BookmarkSubscribeRSS Feed
☑ This topic is solved. Need further help from the community? Please sign in and ask a new question.

hi all:
how may I simplify the code? It works but there might be more efficient way. thanks in advance.

 

options mprint mlogic;
%macro pd(ord,ds,ovar,fvar,v1,v2,v3,v4);

data &ds (keep=subject &fvar.: ds _lastf);
length ds $10.;
   set new.&ds(keep= subject 
                   %if &ord in (-1,-2) %then %do; &v1.:  %end;
                   %else %if &ord=1    %then %do; &v1.: &v2.: %end;
                   %else %if &ord=2    %then %do; &v1.: &v2.: &v3.: %end;
                   %else %if &ord=3    %then %do; &v1.: &v2.: &v3.: &v4.:%end;
                   );		
		  ds="&ds.";
		 %if &ord=-2 %then %do;
		  _lasf=&fvar.;
	   %end;

	   %if &ord ne -2 %then %do;
			if 	&v1._dd      ^=. then &ovar.=put(&v1._yyyy, 4.)||'-'||put(&v1._mm, z2.)||'-'||put(&v1._dd, z2.);
			else if &v1._mm  ^=. then &ovar.=put(&v1._yyyy, 4.)||'-'||put(&v1._mm, z2.)||'-01';
			else if &v1._yyyy^=. then &ovar.=put(&v1._yyyy, 4.)||'-01'||'-01';
			&fvar.=input(&ovar.,yymmdd10.);	format &fvar. yymmdd10.;
			_lastf=&fvar.;
		 %end;

		 %else %if &ord>=1 %then %do;		 	 
			if 	&v2._dd^=.        then &ovar.2=put(&v2._yyyy, 4.)||'-'||put(&v2._mm, z2.)||'-'||put(&v2._dd, z2.);
			else if &v2._mm^=.    then &ovar.2=put(&v2._yyyy, 4.)||'-'||put(&v2._mm, z2.)||'-01';
			else if &v2._yyyy^=.  then &ovar.2=put(&v2._yyyy, 4.)||'-01'||'-01';
			&fvar.2=input(&ovar.2,yymmdd10.);	format &fvar.2 yymmdd10.;

			 array dt &fvar. &fvar.2;
			  do over dt;
			  if dt=. then dt="01JAN1900"d;
			 end;
			 _lastf=max(of dt[*]);
		%end;
		 %if &ord>=2 %then %do;
			    if 	&v3._dd^=.    then &ovar.3=put(&v3._yyyy, 4.)||'-'||put(&v3._mm, z2.)||'-'||put(&v3._dd, z2.);
			else if &v3._mm^=.    then &ovar.3=put(&v3._yyyy, 4.)||'-'||put(&v3._mm, z2.)||'-01';
			else if &v3._yyyy^=.  then &ovar.3=put(&v3._yyyy, 4.)||'-01'||'-01';
			&fvar.3=input(&ovar.3,yymmdd10.);	format &fvar.3 yymmdd10.;

			 array dt2 &fvar. &fvar.2 &fvar.3;
			  do over dt2;
			  if dt2=. then dt2="01JAN1900"d;
			 end;
			 _lastf=max(of dt2[*]);
		%end;
		%if &ord=3 %then %do;
			    if 	&v4._dd^=.    then &ovar.4=put(&v4._yyyy, 4.)||'-'||put(&v4._mm, z2.)||'-'||put(&v4._dd, z2.);
			else if &v4._mm^=.    then &ovar.4=put(&v4._yyyy, 4.)||'-'||put(&v4._mm, z2.)||'-01';
			else if &v4._yyyy^=.  then &ovar.4=put(&v4._yyyy, 4.)||'-01'||'-01';
			&fvar.4=input(&ovar.4,yymmdd10.);	format &fvar.4 yymmdd10.;

       array dt3 &fvar. &fvar.2 &fvar.3 &fvar.4;
			 do over dt3;
			  if dt3=. then dt3="01JAN1900"d;
			 end;
			 _lastf=max(of dt3[*]);
		%end;	
    format _lastf yymmdd10.; 
run;

proc sort;by subject decending _lastf;run;
proc sort data=&ds. out=&ds.f nodupkey;by subject;run;
run;

%if &ord<=-1 %then %do;
proc append base=af  Data=&ds.f;
%end;
%else %if &ord=1 %then %do;
proc append base=bf  Data=&ds.f;
%end;
%else %if &ord=2 %then %do;
proc append base=cf  Data=&ds.f;
%end;
 
%else %if &ord=3 %then %do;
proc append base=df  Data=&ds.f;
%end;
run;
%mend;
/*pd(ord,ds, nvar,ovar,fvar,v1,v2,v3,v4);*/	 
%pd(3,a,  lstdt, last, adat, a2dat,a3dat,a4dat)
%pd(2,b,  lstdt, last, b1dat,b2dat,b3dat)
%pd(1,c,  lstdt, last, c1dat,c2dat, , ) 
%pd(-1,d, lstdt, last, ddat , , , )	
%pd(-2,e, lstdt, last, edat , , , )	

 

1 ACCEPTED SOLUTION

Accepted Solutions
Tom
Super User Tom
Super User

Define what you mean by efficient?   For the code to RUN? Or for the programmer to generate?  If you use macro to generate same code you were already running there is no change in how it will actually run.  It might make it easier for the programmer.  Or it might just add complexity and make it harder for the programmer.

 

Does this new code cover all of the possible m,d,y variables?

For this particular application your code looks over complicated.  If some observations have missing values those missing values will not impact the maximum date calculated.

data temp.have;
  set temp.a temp.b temp.c indsname=indsname;
  ds = indsname ;
  _adate=mdy(adat_mm, adat_dd,adat_yyyy);
  _edate=mdy(edat_mm, edat_dd,edat_yyyy); 
  _brdate=mdy(brdat_mm, brdat_dd,brdat_yyyy);
  _idate=mdy(idat_mm, idat_dd,idat_yyyy);
  _bndate=mdy(bndat_mm, bndat_dd,bndat_yyyy); 
  _tdate=mdy(tdat_mm,tdat_dd,tdat_yyyy);
  _pdate=mdy(pdat_mm, pdat_dd,pdat_yyyy); 
  _maxdate=max(of _adate _edate _brdate _idate _bndate _tdate _pdate );
  format _maxdate _adate _edate _brdate _idate _bndate _tdate _pdate yymmdd10.;
  keep id ds _maxdate _adate _edate _brdate _idate _bndate _tdate _pdate;	
run;

You could in theory make a macro that takes BR as input and generates 

_brdate=mdy(brdat_mm, brdat_dd,brdat_yyyy);

but why?  It would just make the code more confusing.  Are these names going to change in the future? Why would they change?

 

If you want to eliminate the notes about generating the missing values you could just confirm that you actually have month day and year values.  So if the variables are numeric you could do:

if 0=nmiss(adat_mm, adat_dd,adat_yyyy) then _adate=mdy(adat_mm, adat_dd,adat_yyyy);

 It the notes are on the MAX() function call you could check there also.

if 0=nmiss(of  _adate _edate _brdate _idate _bndate _tdate _pdate ) then
   _maxdate=max(of _adate _edate _brdate _idate _bndate _tdate _pdate )
;

View solution in original post

19 REPLIES 19
Tom
Super User Tom
Super User

What does it DO?  Please explain in words the purpose of the code.

 

If you want to pass a list of variable names just use ONE parameter.

If you want to pass in a dataset name pass in the whole dataset name.

%macro pd(ds,ovar,fvar,varlist);
%local count lastvar ;
%let varlist=%sysfunc(compbl(&varlist));
%let count=%sysfunc(countw(&varlist,%str( )));
%let lastvar=%scan(&varlist,-1,%str( ));
...
   set &ds(keep= subject %sysfunc(tranwrd(&varlist,%str( ),%str(: ))): );
....
%mend pd;
%pd
(ds=new.a
,ovar=lstdt
,fvar=last
,varlist=adat a2dat a3dat a4dat
)

 

purpleclothlady
Pyrite | Level 9
Hi Tom:
Thank you for quick help. please see below. thanks

The purpose of the code is to find the LATEST date from mulitple datasets by Subject.
First: To create a date variable by dat_yyyy, dat_mm, dat_dd
from multiple datasets and each datasets has different number of date variables , such as
Dataset a: has 4 dates , b has 3.....

Second: Then find out the latest date from Each data sets (eg. Data set a: a1dat, a2dat, a3dat, a4dat.) - a3dat is the last date

Third: Get the latest date from all datasets by Subject (eg from Dataset a, b, c ,d), the last date is from Dataset c-c1dat
Tom
Super User Tom
Super User

Where is the list of datasets?

How are you determining which variables are dates?  It kind of looks like you have to tell it which variables have dates?

 

If you want the latest date per SUBJECT across all observations in ONE dataset and 

  1. You know the dataset name
  2. You know the variable names (or prefixes as it kind of looks like you are doing)
  3. And the data is sorted by SUBJECT

Then just do:

%let ds=HAVE;
%let varlist=avar1-avar4 labdate visitdate;
data want;
  set &ds;
  by subject ;
  retain latest;
  if first.subject the latest=.;
  latest=max(of latest &varlist);
  if last.subject;
  keep subject latest;
  format latest date9.;
run;

 

purpleclothlady
Pyrite | Level 9
%macro pd(ord,datasetsname,newvar,last,v1,v2,v3,v4);
/*****
   adat, a2dat... they have to first use mdy function to make it a sasdate, currently these date variables are prefixed with adat, 
such as adat_yyyy, adat_mm, adat_dd , 
then find out the latest date from data sets a, b, c, d, e 
****/
%pd(3,a,  lstdt, last, adat, a2dat,a3dat,a4dat)
%pd(2,b,  lstdt, last, b1dat,b2dat,b3dat)
%pd(1,c,  lstdt, last, c1dat,c2dat, , ) 
%pd(-1,d, lstdt, last, ddat , , , )	
%pd(-2,e, lstdt, last, edat , , , )	

Hi Tom:

Where is the list of datasets?

Answer: the list of data sets are in the macro %pd- a, b , c ,d, e

How are you determining which variables are dates?  It kind of looks like you have to tell it which variables have dates?

Answer: the macro %pd had parameters : v1, v2, v3,v4

Is this clear now? thanks again.

Purple

 

 

Tom
Super User Tom
Super User

You didn't really answer the question about how YOU know which datasets and which variables.  Instead you just said what I already saw in the current code which is that you had to TELL the macro which dataset to use and which variables to use.

 

Is the list of datasets always A B C D E ?

Is the list of variable names always some letter followed by DAT_ and either YYYY, MM or DD ?

Why are there three variables per date? (is there any hope of make simplifications before the data gets to SAS)

 

Why are you passing in that -2,-1,1,2,3 parameter when the macro can just COUNT how many variables you passed in?

It seems that 1,2,3 is just a count of how many variables you passed.  

What do the negative numbers mean? What is difference between -1 and 1?  Between -2 and -1? 

Why no code to handle zero?

 

Explain what it means, not what the current code does.

 

What does that first data step do?  The one with so much macro code being used to generate it.

How is it helping to get to the answer?

Does it just convert the M,D,Y variables into actual dates?  Does it transpose the data?  Does it find the max? 

 

What do the multiple PROC SORT steps do?

 

Why are you creating 4 different datasets in the last block of macro code that is being driven by the ORD variable?

Now the ORD variable seems to have a different meaning than it did in the first data step.  

Rather than just meaning how many variables to process it seems to meaning what to name the output dataset.

If the name of the output dataset changes why not just pass it in as a parameter like the input dataset?

Or derive it based on the value of the ORD parameter?  You seem to be mapping -2 -> AF , -1 ->AF, 1->BF ...

%let base = %scan(AF AF ZERO BF CF DF,&ord+3);
proc append base=&base  Data=&ds.f;
run;

 

 

 

purpleclothlady
Pyrite | Level 9

Hi Tom:

I answered all questions in the comment section in the code. 

please check if this is clear now. 

thanks

Purple


options mprint mlogic;
%macro pd(ord,ds,ovar,fvar,v1,v2,v3,v4);
/*-----Is the list of datasets always A B C D E ?: 
***Answer: There are 30 datasets with different names. I just give an example. 

-----Is the list of variable names always some letter followed by DAT_ and either YYYY, MM or DD ?
***Answer: Yes , eg. birthdat_yyyy , birthdat_mm, birthdat_dd
-----Why are there three variables per date? (is there any hope of make simplifications before the data gets to SAS)
*** Answer: there are about 30 data sets, the below is the code I used to get the date from yyyy, dd, mm. 
it is not pretty though. ---Why are you passing in that -2,-1,1,2,3 parameter when the macro can just COUNT how many variables you passed in? It seems that 1,2,3 is just a count of how many variables you passed. ***Right, we can get a better way. ---What do the negative numbers mean? What is difference between -1 and 1? Between -2 and -1? Answer: I updated the code. this ord is used to count number of variables. ---Why no code to handle zero? Answer: what do you mean ? ---What does that first data step do? The one with so much macro code being used to generate it. Answer: there are many variables in each dataset, I only need to Keep a few of them and the number of variables are different in the data sets. ---How is it helping to get to the answer? Does it just convert the M,D,Y variables into actual dates?
Does it transpose the data? Does it find the max? What do the multiple PROC SORT steps do? Answer: First need Actual dates, the ultimate goal is to find from the 30 data sets the latest date by Subject. ---Why are you creating 4 different datasets in the last block of macro code that is being driven by the ORD variable? Now the ORD variable seems to have a different meaning than it did in the first data step. Rather than just meaning how many variables to process it seems to meaning what to name the output dataset. If the name of the output dataset changes why not just pass it in as a parameter like the input dataset? Or derive it based on the value of the ORD parameter? You seem to be mapping -2 -> AF , -1 ->AF, 1->BF ... Answer: Need to append all 30 datasets, please see Step 3, if I only use one proc append,
there is a log WARNING: Variable LAST4 was not found on DATA file. /* Step 1). IMPUTE PARTIAL DATES FROM MANY DATA SETS WHICH HAD DIFFERENT NUMBER OF DATES; */ data &ds (keep=subject &fvar.: ds _lastf); length ds $10.; set ae06june.&ds(keep= subject %if &ord=1 %then %do; &v1.: %end; %else %if &ord=2 %then %do; &v1.: &v2.: %end; %else %if &ord=3 %then %do; &v1.: &v2.: &v3.: %end; %else %if &ord=4 %then %do; &v1.: &v2.: &v3.: &v4.:%end; ); /*IF USE MDY , THEN LOG HAS "NOTE: MISSING VALUES, HOW TO REMOVE THE LOG NOTE OF "MISSING VALUES...."*/ if &v1._dd ^=. then &ovar.=put(&v1._yyyy, 4.)||'-'||put(&v1._mm, z2.)||'-'||put(&v1._dd, z2.); else if &v1._mm ^=. then &ovar.=put(&v1._yyyy, 4.)||'-'||put(&v1._mm, z2.)||'-01'; else if &v1._yyyy^=. then &ovar.=put(&v1._yyyy, 4.)||'-01'||'-01'; &fvar.=input(&ovar.,yymmdd10.); format &fvar. yymmdd10.; _lastf=&fvar.; %if &ord>=2 %then %do; if &v2._dd^=. then &ovar.2=put(&v2._yyyy, 4.)||'-'||put(&v2._mm, z2.)||'-'||put(&v2._dd, z2.); else if &v2._mm^=. then &ovar.2=put(&v2._yyyy, 4.)||'-'||put(&v2._mm, z2.)||'-01'; else if &v2._yyyy^=. then &ovar.2=put(&v2._yyyy, 4.)||'-01'||'-01'; &fvar.2=input(&ovar.2,yymmdd10.); format &fvar.2 yymmdd10.; array dt &fvar. &fvar.2; do over dt; if dt=. then dt="01JAN1900"d; end; _lastf=max(of dt[*]); %end; %if &ord>=3 %then %do; if &v3._dd^=. then &ovar.3=put(&v3._yyyy, 4.)||'-'||put(&v3._mm, z2.)||'-'||put(&v3._dd, z2.); else if &v3._mm^=. then &ovar.3=put(&v3._yyyy, 4.)||'-'||put(&v3._mm, z2.)||'-01'; else if &v3._yyyy^=. then &ovar.3=put(&v3._yyyy, 4.)||'-01'||'-01'; &fvar.3=input(&ovar.3,yymmdd10.); format &fvar.3 yymmdd10.; array dt2 &fvar. &fvar.2 &fvar.3; do over dt2; if dt2=. then dt2="01JAN1900"d; end; _lastf=max(of dt2[*]); %end; %if &ord=4 %then %do; if &v4._dd^=. then &ovar.4=put(&v4._yyyy, 4.)||'-'||put(&v4._mm, z2.)||'-'||put(&v4._dd, z2.); else if &v4._mm^=. then &ovar.4=put(&v4._yyyy, 4.)||'-'||put(&v4._mm, z2.)||'-01'; else if &v4._yyyy^=. then &ovar.4=put(&v4._yyyy, 4.)||'-01'||'-01'; &fvar.4=input(&ovar.4,yymmdd10.); format &fvar.4 yymmdd10.; array dt3 &fvar. &fvar.2 &fvar.3 &fvar.4; do over dt3; if dt3=. then dt3="01JAN1900"d; end; _lastf=max(of dt3[*]); %end; format _lastf yymmdd10.; run; /* Step 2).GET THE LATEST DATE FOR EACH DATA SET;*/ proc sort;by subject decending _lastf;run; proc sort data=&ds. out=&ds.f nodupkey;by subject;run; run; /* Step 3). APPEND ALL DATA SETS; BUT THIS STEP HAS TO USE PROC APPEND */ data all; set af bf cf df ef ; run; /* Step 4) FIND THE LASTEST DATE FROM ALL DATA SETS*/ proc sort data=all;by subject decending _lastf;run; data all; set all; by subject; if last.subject; run; %mend; /*pd(ord,ds, nvar,ovar,fvar,v1,v2,v3,v4);*/ %pd(4,e, lstdt, last, bicdat, cicdat,picdat,ficdat) %pd(3,d, lstdt,last, hopstdat,hopendat,testdat) %pd(2,c, lstdt, last, castdat, caendat, , ) %pd(1,b, lstdt, last, bmdat , , , ) %pd(1,a, lstdt, last, cldat , , , )
PaigeMiller
Diamond | Level 26

 I think you keep avoiding the question @Tom has asked, which is: "You didn't really answer the question about how YOU know which datasets and which variables."

 

 What you said is NOT an answer:

 

/*-----Is the list of datasets always A B C D E ?:
***Answer: There are 30 datasets with different names. I just give an example.

 

The programmer has to write code that selects data set names, but its still not clear how you know what data set names the program will select each time it runs. Please explain this. (Is it all data sets in a folder? or data set whose name begins with DATA_ and has the date appended such as DATA_02MAR2022 so you want all data set this month? or other? Please EXPLAIN)

--
Paige Miller
purpleclothlady
Pyrite | Level 9

@PaigeMiller @Tom:

I got it 😋 why you guys didn't understand. Sorry for the confusion. 

Here is the code I used to get

"I  think you keep avoiding the question @Tom has asked, which is: "You didn't really answer the question about how YOU know which datasets and which variables."

data lastd;
set sashelp.vcolumn(keep =memname libname memtype name
where=(libname="WORK" and memtype="DATA" and ( name like ('%DAT%'))));
run;

select distinct memname
into :dslist separated by " "
from lastd;
quit;
%put &dslist;

Purple

Kurt_Bremser
Super User

@purpleclothlady wrote:

@PaigeMiller @Tom:

I got it 😋 why you guys didn't understand. Sorry for the confusion. 

Here is the code I used to get

"I  think you keep avoiding the question @Tom has asked, which is: "You didn't really answer the question about how YOU know which datasets and which variables."

data lastd;
set sashelp.vcolumn(keep =memname libname memtype name
where=(libname="WORK" and memtype="DATA" and ( name like ('%DAT%'))));
run;

select distinct memname
into :dslist separated by " "
from lastd;
quit;
%put &dslist;

Purple


You can do this much easier from dictionary.tables:

proc sql noprint,
select memname into :dslist separated by " "
from dictionary.tables
where libname = "WORK" and memtype = "DATA" and name like '%DAT%';
quit;

 

Tom
Super User Tom
Super User

So you want to determine the variables to use by looking for variables named *DAT_YYYY or *DAT_MM or *DAT_DD in any dataset in the WORK library?

 

So first get the list of variables.  Parse out the basename and that extension. So a variable named ADAT_YYYY would have a basename of A and an extension of YYYY.

Then transpose to convert the list to one observation per dataset/basename combination.

Then generate code to set each dateset (once for each date triplet) and rename the variables to all be called YYYY, MM and DD.

Then use a data step to read ALL of the data form all of the observations and convert the year month date values into dates and find the maximum per subject.

proc sql noprint;
create variables as
select catx('.',libname,memname) as dsname
     , substrn(name,1,index(upcase(name),'DAT_')-1) as basename
     , substrn(name,index(upcase(name),'DAT_')+4) as extension
     , nliteral(upcase(name)) as name
from dictionary.columns
where libname = "WORK" 
  and memtype = "DATA"
  and ((upcase(trim(name)) like '%DAT^_YYYY' escape '^'
    or (upcase(trim(name)) like '%DAT^_MM' escape '^'
    or (upcase(trim(name)) like '%DAT^_DD' escape '^'
      )
order by 1,2,3
;
quit;

proc transpose data=variables out=members(drop=_name_);
  by dsname basename;
  id extension;
  var name;
run;

filename code temp;
data _null_;
  set members end=eof;
  where 0=cmiss(of yyyy mm dd);
  if _n_=1 then put 'set' @;
  put @5 dsname '(keep=subject ' yyyy mm dd 
         'rename=(' yyyy '=yyyy ' mm '=mm ' dd '=dd))'
  ;
  if eof then put ';' ;
run;

data want;
  %include code / source2;
  where 0=cmiss(of yyyy mm dd);
  by subject;
  retain maxdate ;
  if first.subject then maxdate=.;
  maxdate=max(maxdate,mdy(mm,dd,yyy));
  if last.subject;
  format maxdate date9.;
  keep subject maxdate;
run;

If the data is not sorted by SUBJECT then you might want to create that last step as a VIEW and then use PROC SUMMARY to find the max.

data all / view=all;
  %include code / source2;
  where 0=cmiss(of yyyy mm dd);
  maxdate=max(maxdate,mdy(mm,dd,yyy));
  format maxdate date9.;
  keep subject maxdate;
run;
proc summary data=all nway ;
  class subject ;
  var maxdate ;
  output out=want max= ;
run;

 

purpleclothlady
Pyrite | Level 9

Tom and Kurt:
That is exactly what I need. thanks all for make the puzzle resolved. super. !
I need to test the code and let you know.

thanks again,

Purple

purpleclothlady
Pyrite | Level 9

Hi all:

I tested the code, maybe it is much easier by using the dataset and the current code, it worked but need a lot of repeat codes. a macro will be more efficient? 

 

data temp.have;
length ds $10;
set temp.a(in=a) temp.b(in=b) temp.c(in=c);
/*Q1: GIVEN THAT WE KNOW DATASET NAME AND VARIABLE NAMES BY USING DICTIONAY.
      BUT THERE ARE ABOUT 40 DATASETS LOOK LIKE THIS, HOW TO READ IN THESE DATASETS AND CREATE DATE USING MDY FUNCTION 
      WITHOUT SHOWING LOG--NOTE MISSING VALUES:
      " MISSING VALUES WERE GENERATED AS A RESULT OF PERFORMING AN OPERATION ON MISSING VALUES.
      EACH PLACE IS GIVEN BY: (NUMBER OF TIMES) AT (LINE):(COLUMN).
      2 AT 1403:14    10 AT 1416:15 "
*/
		if  a then do;
		  _adate=mdy(adat_mm, adat_dd,adat_yyyy);
		  _edate=mdy(edat_mm, edat_dd,edat_yyyy); 
		 ds="DATA SET A";
		end;

		else if  b then do;
			 _brdate=mdy(brdat_mm, brdat_dd,brdat_yyyy);
      ds="DATA SET B";
     end;

    else if c then do;
			 _idate=mdy(idat_mm, idat_dd,idat_yyyy);
		   _bndate=mdy(bndat_mm, bndat_dd,bndat_yyyy); 
		   _tdate=mdy(tdat_mm,tdat_dd,tdat_yyyy);
		   _pdate=mdy(pdat_mm, pdat_dd,pdat_yyyy); 
		 ds="DATA SET C" ;
		end;
		 
		_maxdate=max(of _numeric_);
		format _: mmddyy10.;
		keep id ds _maxdate _a: _bn: _i: _t: _p: _br:;	
run;

/*proc sort data=temp.have;by id _maxdate;run;*/

proc summary data=temp.have nway ;
  class id ;
  var _maxdate ;
  output out=temp.want max=;
run;
Tom
Super User Tom
Super User

Define what you mean by efficient?   For the code to RUN? Or for the programmer to generate?  If you use macro to generate same code you were already running there is no change in how it will actually run.  It might make it easier for the programmer.  Or it might just add complexity and make it harder for the programmer.

 

Does this new code cover all of the possible m,d,y variables?

For this particular application your code looks over complicated.  If some observations have missing values those missing values will not impact the maximum date calculated.

data temp.have;
  set temp.a temp.b temp.c indsname=indsname;
  ds = indsname ;
  _adate=mdy(adat_mm, adat_dd,adat_yyyy);
  _edate=mdy(edat_mm, edat_dd,edat_yyyy); 
  _brdate=mdy(brdat_mm, brdat_dd,brdat_yyyy);
  _idate=mdy(idat_mm, idat_dd,idat_yyyy);
  _bndate=mdy(bndat_mm, bndat_dd,bndat_yyyy); 
  _tdate=mdy(tdat_mm,tdat_dd,tdat_yyyy);
  _pdate=mdy(pdat_mm, pdat_dd,pdat_yyyy); 
  _maxdate=max(of _adate _edate _brdate _idate _bndate _tdate _pdate );
  format _maxdate _adate _edate _brdate _idate _bndate _tdate _pdate yymmdd10.;
  keep id ds _maxdate _adate _edate _brdate _idate _bndate _tdate _pdate;	
run;

You could in theory make a macro that takes BR as input and generates 

_brdate=mdy(brdat_mm, brdat_dd,brdat_yyyy);

but why?  It would just make the code more confusing.  Are these names going to change in the future? Why would they change?

 

If you want to eliminate the notes about generating the missing values you could just confirm that you actually have month day and year values.  So if the variables are numeric you could do:

if 0=nmiss(adat_mm, adat_dd,adat_yyyy) then _adate=mdy(adat_mm, adat_dd,adat_yyyy);

 It the notes are on the MAX() function call you could check there also.

if 0=nmiss(of  _adate _edate _brdate _idate _bndate _tdate _pdate ) then
   _maxdate=max(of _adate _edate _brdate _idate _bndate _tdate _pdate )
;
purpleclothlady
Pyrite | Level 9

@Tom @Kurt_Bremser @PaigeMiller :

Thank you all for your continuous support. I appreciate that and learnt a lot.🙏

Here is the final code I worked and it gave me the correct result.

One thing I need to improve: Since this is for many different name of date variables, I would also like to know how to create a macro to use mdy function. But with that being said, I did finish the project.  

purple

/*---STEP 1: FIND OUT HOW MANY DATA SETS AND VARIABLE NAME WITH CONTAINS '%DAT%'---*/
proc sql noprint;
create table havecol as
select *, cats(strip(libname),".", memname) as newmem
from dictionary.columns
where libname = "TEMP" and memtype = "DATA" and 
       ( (name like '%DAT_YYYY') or ( name like '%DAT_MM') or ( name like '%DAT_DD')  );
quit;

/*---STEP 2: PUT INTO MACRO VAR THE DATA SET NAME ---*/
proc sql noprint;
 select count(distinct newmem) 
        into :dsn  trimmed
 from havecol;

 select distinct newmem
        into  :ds1 - :ds&dsn 
 from havecol;
 quit;
%put &dsn  &ds1 &ds2 &ds3 ;


/*---STEP 3: APPEND ALL DATA SETS ---*/
options mprint mlogic;
%macro h2;
 data temp.have;
 length source $10;
  set 
    %do i=1 %to &dsn;
        &&ds&i  indsname=indsname
		 %end;
		;
		SOURCE=indsname;

	 /*--A. CREATE DATE VARIABLE BASED ON YYYY, MM DD--*/
   _adate=mdy(adat_mm, adat_dd,adat_yyyy);
  _edate=mdy(edat_mm, edat_dd,edat_yyyy); 
  _brdate=mdy(brdat_mm, brdat_dd,brdat_yyyy);
  _hospstndate=mdy(hospstdat_mm, hospstdat_dd,hospstdat_yyyy); 
  _dsstdate=mdy(dsstdat_mm,dsstdat_dd,dsstdat_yyyy);
  _awardate=mdy(awardat_mm, awardat_dd,awardat_yyyy);

	/*--B. FIND OUT MAX VALUE OF DATE PER ROW--*/
   	_maxdate=max(of _numeric_);
		format _: mmddyy10.;
		keep id source _maxdate _:;	
run;
%mend;
%h2;

proc summary data=have2 nway ;
  class id ;
  var _maxdate ;
  output out=temp.want(keep=id _maxdate) max=;
run;

 

hackathon24-white-horiz.png

The 2025 SAS Hackathon has begun!

It's finally time to hack! Remember to visit the SAS Hacker's Hub regularly for news and updates.

Latest Updates

How to Concatenate Values

Learn how use the CAT functions in SAS to join values from multiple variables into a single value.

Find more tutorials on the SAS Users YouTube channel.

SAS Training: Just a Click Away

 Ready to level-up your skills? Choose your own adventure.

Browse our catalog!

Discussion stats
  • 19 replies
  • 2774 views
  • 3 likes
  • 5 in conversation