Hi Adithya,
I'd say it very much depends on the use case for this code. If this is just a one off then "anything" that returns the desired result will do and eventually what you've got is already good enough. If you need something re-usable that you or someone else needs to maintain in the future then you might want to spend a bit more effort.
If you're in the end after some repeated process that maintains a change history or some change log then you might want to look into Slowly Changing Dimensions and the like.
Below some code where I believe it does what you have been asking for. If the structure of your new table differs from the old table (=added or removed variables) then then code will only compare the variables that exist in both tables BUT it will also add an indicator that there is a structural change.
/** 1. init environment **/
%let path=%sysfunc(pathname(path));
options dlcreatedir;
libname old "&path/old";
libname new "&path/new";
libname result "&path/result";
/** 2. create sample data **/
data old.TableA old.TableB old.TableC;
infile datalines dsd truncover;
input record_id $ var1 var2 $;
datalines;
a,1,x
b,1,x
c,1,x
d,1,x
e,1,x
;
data new.TableA(drop=var3) new.TableC(drop=var1) new.TableD;
infile datalines dsd truncover;
input record_id $ var1 var2 $;
var3=99;
datalines;
b,2,x
c,1,y
d,2,y
e,1,x
f,1,x
;
/** 3. Macro definition table analysis **/
%macro doit(
tbl_out=
, tbl_old=
, tbl_new=
, key_vars=
, check_vars=
, tbl_struc_change=
, change_ind_var=change_ind
);
data _null_;
length var $32 term $70 expr $32000;
call symputx('order_expr', translate(compbl("&key_vars"),',',' '), 'l');
if upcase("&tbl_struc_change") ne 'Y' then call symputx('chng_case','1','l');
else call symputx('chng_case','2','l');
do i=1 by 1 while(not missing(scan("&key_vars",i,' ')) );
var=scan("&key_vars",i,' ');
term=cats('n.',var,'=o.',var);
if i=1 then expr=term;
else expr=catx(' ',expr,'and',term);
end;
call symputx('on_expr', expr, 'l');
call missing(expr);
do i=1 by 1 while(not missing(scan("&check_vars",i,' ')) );
var=scan("&check_vars",i,' ');
term=cats('n.',var,'=o.',var);
if i=1 then expr=term;
else expr=catx(' ',expr,'and',term);
end;
call symputx('case_expr', expr, 'l');
run;
/* change_ind: u=update, i=insert/new, s=same/unchanged */
proc sql feedback;
create table &tbl_out as
select
n.*,
case
when n.record_id ne o.record_id then "I&chng_case"
else
case(&case_expr)
when 0 then "U&chng_case"
else "S&chng_case"
end
end as &change_ind_var length=2
from &tbl_new n left join &tbl_old o
on &on_expr
order by &order_expr
;
quit;
%mend;
/* test call */
/*
%doit(
tbl_out=result.TableA
, tbl_old=old.TableA
, tbl_new=new.TableA
, key_vars=record_id
, check_vars=var1 var2
, tbl_struc_change=n
);
*/
/** 4. identify tables and columns for analysis **/
proc format;
value new_old
1='New Only'
2='Old Only'
other='Both'
;
run;
proc sql;
create table work.driver as
select
coalesce(new.memname, old.memname) as memname
, upcase(coalesce(new.name, old.name)) as name
, case
when missing(max(old.memname)) then 1
when missing(max(new.memname)) then 2
else 3
end as table_newold_ind length=3 format=new_old.
, case
when missing(old.name) then 1
when missing(new.name) then 2
else 3
end as variable_newold_ind length=3 format=new_old.
, case
when( min(coalesce(new.name,'0'))='0' or min(coalesce(old.name,'0'))='0' ) then 'Y'
else 'N'
end as table_struct_change_ind length=1
from
(select memname, name from dictionary.columns where libname='NEW') new
full join
(select memname, name from dictionary.columns where libname='OLD') old
on new.memname=old.memname and upcase(new.name)=upcase(old.name)
group by
calculated memname
order by memname, name
;
quit;
/** 5. call macro once per table to be analysed **/
filename codegen temp;
data _null_;
stop;
file codegen;
run;
data _null_;
/* file print;*/
file codegen mod;
set work.driver;
by memname name;
if table_newold_ind=3;
length check_vars $32000;
retain check_vars;
if upcase(name) not in ('RECORD_ID') and variable_newold_ind=3 then
check_vars=catx(' ',check_vars,name);
if last.memname then
do;
put
'%doit(' /
' tbl_out=result.' memname /
' , tbl_old=old.' memname /
' , tbl_new=new.' memname /
' , key_vars=record_id' /
' , check_vars=' check_vars /
' , tbl_struc_change=' table_struct_change_ind /
' );' /
;
call missing(check_vars);
end;
run;
%include codegen /source2;
filename codegen clear;
/** 6. Print driver and result tables **/
proc format;
value $change_ind
'S1'='No change - Identical Table Structures'
'U1'='Update - Identical Table Structures'
'I1'='Insert - Identical Table Structures'
'S2'='No change - Changed Table Structures'
'U2'='Update - Changed Table Structures'
'I2'='Insert - Changed Table Structures'
;
run;
title 'work.driver - for tables that exist in both libraries';
proc print data=work.driver;
where table_newold_ind=3;
run;
title 'result.tableA';
proc print data=result.tableA;
format change_ind $change_ind.;
run;
title 'result.tableC';
proc print data=result.tableC;
format change_ind $change_ind.;
run;
title;
And you would then have a 3rd library with these combined tables? Should the combined tables be data replications (=new tables) or views?
Any approach will certainly have to somehow query the dictionary tables/sashelp views and then generate code.
@chinna0369 wrote:
Yes, I would like to create a new library with all combined datasets from old and new library. They should have all records from old and new library with a new variable to indicate it's from new or old.
Before trying to begin creating a program to create a program you need know what program you need to create.
So how would you stack just two of these datasets into one?
Would something like this work:
data want.ds1;
length source $8 inds $41 ;
set old.ds1 new.ds1 indsname=inds;
source=scan(inds,1,'.');
run;
If so then perhaps you should make a macro that takes DS1 as an input parameter and generates that code.
%macro stack(ds);
data want.&ds;
length source $8 inds $41 ;
set old.&ds new.&ds indsname=inds;
source=scan(inds,1,'.');
run;
%mend stack;
Then you can just generate a list of common members between the two libraries.
proc sql;
create table dslist as
select a.memname
from dictionary.members a
inner join dictionary.members b
on a.libname='NEW' and b.libname='OLD' and a.memname=b.memname
;
quit;
And then use that list to generate one call to the macro for each matching pair of datasets.
data _null_;
set dslist;
call execute(cats('%nrstr(%stack)(',memname,')'));
run;
Hi,
Thank you for your reply!
Yes, but I am getting all records from both datasets. but I only want records from new transfer. And if any of the variable value is updated compared to old transfer then we need to create a new variable which indicates "updated". I am using below code and how can I compare all variables and to identify which variables is updated?
%macro stack(ds=, drop=);
data &ds.1;
length source $8 inds $41 ;
set old.&ds(in=a) db.&ds(in=b) indsname=inds;
if a then flag='ZZZ';
if b then flag='AAA';
source=scan(inds,1,'.');
run;
data want;
set &ds.1;
drop &drop.;
run;
proc sort nodupkey dupout=chk; by _all_; run;
%mend stack;
proc sql;
create table dslist as
select a.memname
from dictionary.members a
inner join dictionary.members b
on a.libname='DB' and b.libname='OLD' and a.memname=b.memname
;
quit;
%stack(ds=BONE_DENSITY_DXA, drop=source record_id record_type record_status record_status_text date_created date_modified record_deleted);
Thanks,
Adithya
Tracking CHANGES is a totally different operation than STACKING.
Do you actually need to have variables that indicate changes? Or do you just want the new data and a REPORT that shows the changes?
Hi,
Sorry for the confusion. Here are the details I am looking for:
1. I need to create a new library with all datasets which are in new transfer.
2. If there are same in datasets in old transfer and new transfer then we need to identify the changes.
3. If there are any changes for the dataset in new transfer such as variable value is updated or a new variable is added then we need to create a new variable with a value of "updated".
Hope this is clear this time 🙂
Thanks,
Adithya
Detecting CHANGE at the OBSERVATION level is HARD.
Is that what you want to do?
Or do you just want a report that says which DATSETS had any changes?
Do the datasets have key variables that uniquely identify every observation? That is critical to be able to check for changes that the observation level.
Sorry for the confusion. Here are the details I am looking for:
1. I need to create a new library with all datasets which are in new transfer.
2. If there are same in datasets in old transfer and new transfer then we need to identify the changes.
3. If there are any changes for the dataset in new transfer such as variable value is updated or a new variable is added then we need to create a new variable with a value of "updated".
At least how I'm reading things your requirements keep changing and I'm still not clear what you actually want.
I suggest that before writing any macro code let's first create sample code for a single case using sample data that's representative for your real data.
Please confirm that below is representative sample data and that the WANT table is the desired result using this sample data. If not then please amend the provided data and share it with us.
%let path=%sysfunc(pathname(path));
options dlcreatedir;
libname old "&path/old";
libname new "&path/new";
libname result "&path/result";
data old.TableA;
infile datalines dsd truncover;
input key $ var1 var2 $;
datalines;
a,1,x
b,1,x
c,1,x
d,1,x
e,1,x
;
data new.TableA;
infile datalines dsd truncover;
input key $ var1 var2 $;
datalines;
b,2,x
c,1,y
d,2,y
e,1,x
f,1,x
;
/* change_ind: u=update, i=insert/new, s=same/unchanged */
data result.TableA_WANT;
infile datalines dsd truncover;
input key $ var1 var2 $ change_ind $;
datalines;
b,2,x,u
c,1,y,u
d,2,y,u
e,1,x,s
f,1,x,i
;
If above is representative of what you have and want then below a first stab how you could code to get the WANT table.
/* change_ind: u=update, i=insert/new, s=same/unchanged */
proc sql;
create table result.TableA_WANT2 as
select
n.*,
case
when n.key ne o.key then 'i'
else
/* case(hashing('sha256',catx('|',n.var1,n.var2))=hashing('sha256',catx('|',o.var1,o.var2)))*/
case(n.var1=o.var1 and n.var2=o.var2)
when 0 then 'u'
else 's'
end
end as change_ind
from new.tableA n left join old.tableA o
on n.key=o.key
order by n.key
;
quit;
If using a SQL will sufficiently perform depends on your data volumes. How many rows do your tables have?
You also mention "or a new variable is added":
Would you mark all rows with a matching key as updated in such a case? Or would you eventually want some other indicator value? - Two actually: one new variable but other matching variables the same, or a new variable plus at least one of the other variables changed as well.
...and what about a variable removed?
Hi Patrik,
First of all thank you so much for your reply!
I know I am very bad at explaining the things about what I want, sorry for that. But you got it finally. Yes, that is the output I am looking for. But my data is bit challenging. I have 30 datasets in each library to compare and my key variable is record_id and I have lot of variables to compare. Right now, I am using below code, is there any effective way?
%macro data_comp(inds=, sort=);
proc sort data=old.&inds.
out=old1(drop=&varsdrop.);
by _all_;
run;
proc sort data=db.&inds.
out=new1(drop=&varsdrop.);
by _all_;
run;
proc sort data=old1; by &sort.; run;
proc sort data=new1; by &sort.; run;
data both_new;
merge new1(in=a) old1(in=b keep=&sort.);
by &sort.;
if a and b;
run;
data only_new;
length flag $50.;
merge new1(in=a) old1(in=b keep=&sort.);
by &sort.;
if a and not b;
flag="New";
run;
proc compare b=old1 c=both_new out=check(where=(_type_="COMPARE")) outnoequal outbase outcomp outdif;
run;
data updated_new;
length flag $50.;
merge both_new(in=a) check(in=b keep=&sort.);
by &sort.;
if a and b then flag="Updated";
run;
data nlib.&inds.;
set only_new updated_new;
run;
proc sort; by &sort.; run;
%mend;
%data_comp(inds=ds1, sort=record_id);
Thanks,
Adithya
Hi Adithya,
I'd say it very much depends on the use case for this code. If this is just a one off then "anything" that returns the desired result will do and eventually what you've got is already good enough. If you need something re-usable that you or someone else needs to maintain in the future then you might want to spend a bit more effort.
If you're in the end after some repeated process that maintains a change history or some change log then you might want to look into Slowly Changing Dimensions and the like.
Below some code where I believe it does what you have been asking for. If the structure of your new table differs from the old table (=added or removed variables) then then code will only compare the variables that exist in both tables BUT it will also add an indicator that there is a structural change.
/** 1. init environment **/
%let path=%sysfunc(pathname(path));
options dlcreatedir;
libname old "&path/old";
libname new "&path/new";
libname result "&path/result";
/** 2. create sample data **/
data old.TableA old.TableB old.TableC;
infile datalines dsd truncover;
input record_id $ var1 var2 $;
datalines;
a,1,x
b,1,x
c,1,x
d,1,x
e,1,x
;
data new.TableA(drop=var3) new.TableC(drop=var1) new.TableD;
infile datalines dsd truncover;
input record_id $ var1 var2 $;
var3=99;
datalines;
b,2,x
c,1,y
d,2,y
e,1,x
f,1,x
;
/** 3. Macro definition table analysis **/
%macro doit(
tbl_out=
, tbl_old=
, tbl_new=
, key_vars=
, check_vars=
, tbl_struc_change=
, change_ind_var=change_ind
);
data _null_;
length var $32 term $70 expr $32000;
call symputx('order_expr', translate(compbl("&key_vars"),',',' '), 'l');
if upcase("&tbl_struc_change") ne 'Y' then call symputx('chng_case','1','l');
else call symputx('chng_case','2','l');
do i=1 by 1 while(not missing(scan("&key_vars",i,' ')) );
var=scan("&key_vars",i,' ');
term=cats('n.',var,'=o.',var);
if i=1 then expr=term;
else expr=catx(' ',expr,'and',term);
end;
call symputx('on_expr', expr, 'l');
call missing(expr);
do i=1 by 1 while(not missing(scan("&check_vars",i,' ')) );
var=scan("&check_vars",i,' ');
term=cats('n.',var,'=o.',var);
if i=1 then expr=term;
else expr=catx(' ',expr,'and',term);
end;
call symputx('case_expr', expr, 'l');
run;
/* change_ind: u=update, i=insert/new, s=same/unchanged */
proc sql feedback;
create table &tbl_out as
select
n.*,
case
when n.record_id ne o.record_id then "I&chng_case"
else
case(&case_expr)
when 0 then "U&chng_case"
else "S&chng_case"
end
end as &change_ind_var length=2
from &tbl_new n left join &tbl_old o
on &on_expr
order by &order_expr
;
quit;
%mend;
/* test call */
/*
%doit(
tbl_out=result.TableA
, tbl_old=old.TableA
, tbl_new=new.TableA
, key_vars=record_id
, check_vars=var1 var2
, tbl_struc_change=n
);
*/
/** 4. identify tables and columns for analysis **/
proc format;
value new_old
1='New Only'
2='Old Only'
other='Both'
;
run;
proc sql;
create table work.driver as
select
coalesce(new.memname, old.memname) as memname
, upcase(coalesce(new.name, old.name)) as name
, case
when missing(max(old.memname)) then 1
when missing(max(new.memname)) then 2
else 3
end as table_newold_ind length=3 format=new_old.
, case
when missing(old.name) then 1
when missing(new.name) then 2
else 3
end as variable_newold_ind length=3 format=new_old.
, case
when( min(coalesce(new.name,'0'))='0' or min(coalesce(old.name,'0'))='0' ) then 'Y'
else 'N'
end as table_struct_change_ind length=1
from
(select memname, name from dictionary.columns where libname='NEW') new
full join
(select memname, name from dictionary.columns where libname='OLD') old
on new.memname=old.memname and upcase(new.name)=upcase(old.name)
group by
calculated memname
order by memname, name
;
quit;
/** 5. call macro once per table to be analysed **/
filename codegen temp;
data _null_;
stop;
file codegen;
run;
data _null_;
/* file print;*/
file codegen mod;
set work.driver;
by memname name;
if table_newold_ind=3;
length check_vars $32000;
retain check_vars;
if upcase(name) not in ('RECORD_ID') and variable_newold_ind=3 then
check_vars=catx(' ',check_vars,name);
if last.memname then
do;
put
'%doit(' /
' tbl_out=result.' memname /
' , tbl_old=old.' memname /
' , tbl_new=new.' memname /
' , key_vars=record_id' /
' , check_vars=' check_vars /
' , tbl_struc_change=' table_struct_change_ind /
' );' /
;
call missing(check_vars);
end;
run;
%include codegen /source2;
filename codegen clear;
/** 6. Print driver and result tables **/
proc format;
value $change_ind
'S1'='No change - Identical Table Structures'
'U1'='Update - Identical Table Structures'
'I1'='Insert - Identical Table Structures'
'S2'='No change - Changed Table Structures'
'U2'='Update - Changed Table Structures'
'I2'='Insert - Changed Table Structures'
;
run;
title 'work.driver - for tables that exist in both libraries';
proc print data=work.driver;
where table_newold_ind=3;
run;
title 'result.tableA';
proc print data=result.tableA;
format change_ind $change_ind.;
run;
title 'result.tableC';
proc print data=result.tableC;
format change_ind $change_ind.;
run;
title;
Join us for SAS Innovate 2025, our biggest and most exciting global event of the year, in Orlando, FL, from May 6-9. Sign up by March 14 for just $795.
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.