Solved: Stack all datasets from two libraries

chinna0369 · Posted 09-27-2023 08:09 AM

Hi all,

I have two libraries called old and new. And most of the datasets have same names, so I would like create a new library which includes all those datasets. Below are the conditions:
1. If they have dataset with same name then stack them in new dataset with same name and create a variable called indenting with “old” and “new” values to indicate that record is from old library or new library.
2. We need all datasets from old and new. If the dataset is available in old library and not in new library then we identify variable value should be old for that dataset. Same for the dataset which is only in new library.

Can you help me with this. I am using proc sql with dictionary tables but I feel like there should be some other way to call all the datasets from both libraries and stack them together.

Thanks,
Adithya

Patrick · Posted 10-04-2023 11:48 PM

Hi Adithya,

I'd say it very much depends on the use case for this code. If this is just a one off then "anything" that returns the desired result will do and eventually what you've got is already good enough. If you need something re-usable that you or someone else needs to maintain in the future then you might want to spend a bit more effort.

If you're in the end after some repeated process that maintains a change history or some change log then you might want to look into Slowly Changing Dimensions and the like.

Below some code where I believe it does what you have been asking for. If the structure of your new table differs from the old table (=added or removed variables) then then code will only compare the variables that exist in both tables BUT it will also add an indicator that there is a structural change.


/** 1. init environment **/
%let path=%sysfunc(pathname(path));
options dlcreatedir;
libname old "&path/old";
libname new "&path/new";
libname result "&path/result";

/** 2. create sample data **/
data old.TableA old.TableB old.TableC;
  infile datalines dsd truncover;
  input record_id $ var1 var2 $;
  datalines;
a,1,x
b,1,x
c,1,x
d,1,x
e,1,x
;

data new.TableA(drop=var3) new.TableC(drop=var1) new.TableD;
  infile datalines dsd truncover;
  input record_id $ var1 var2 $;
  var3=99;
  datalines;
b,2,x
c,1,y
d,2,y
e,1,x
f,1,x
;


/** 3. Macro definition table analysis **/
%macro doit(
    tbl_out=
  , tbl_old=
  , tbl_new=
  , key_vars=
  , check_vars=
  , tbl_struc_change=
  , change_ind_var=change_ind
  );

  data _null_;
    length var $32 term $70 expr $32000;

    call symputx('order_expr', translate(compbl("&key_vars"),',',' '), 'l');

    if upcase("&tbl_struc_change") ne 'Y' then call symputx('chng_case','1','l');
    else call symputx('chng_case','2','l');

    do i=1 by 1 while(not missing(scan("&key_vars",i,' ')) );
      var=scan("&key_vars",i,' ');
      term=cats('n.',var,'=o.',var);
      if i=1 then expr=term;
      else expr=catx(' ',expr,'and',term);
    end;
    call symputx('on_expr', expr, 'l');

    call missing(expr);
    do i=1 by 1 while(not missing(scan("&check_vars",i,' ')) );
      var=scan("&check_vars",i,' ');
      term=cats('n.',var,'=o.',var);
      if i=1 then expr=term;
      else expr=catx(' ',expr,'and',term);
    end;
    call symputx('case_expr', expr, 'l');

  run;


  /* change_ind: u=update, i=insert/new, s=same/unchanged */
  proc sql feedback;
    create table &tbl_out as
    select 
      n.*,
      case 
        when n.record_id ne o.record_id then "I&chng_case"
        else
          case(&case_expr)
            when 0 then "U&chng_case"
            else "S&chng_case"
            end
        end as &change_ind_var length=2
    from &tbl_new n left join &tbl_old o
      on &on_expr
    order by &order_expr
    ;
  quit;
%mend;

/* test call */
/*
%doit(
    tbl_out=result.TableA
  , tbl_old=old.TableA
  , tbl_new=new.TableA
  , key_vars=record_id
  , check_vars=var1 var2
  , tbl_struc_change=n
  );
*/

/** 4. identify tables and columns for analysis **/
proc format;
  value new_old
    1='New Only'
    2='Old Only'
    other='Both'
  ;
run;

proc sql; 
  create table work.driver as
  select 
      coalesce(new.memname, old.memname) as memname
    , upcase(coalesce(new.name, old.name)) as name
    , case
        when missing(max(old.memname)) then 1
        when missing(max(new.memname)) then 2
        else 3
        end as table_newold_ind length=3 format=new_old.
    , case
        when missing(old.name) then 1
        when missing(new.name) then 2
        else 3
        end as variable_newold_ind length=3 format=new_old.
    , case
        when( min(coalesce(new.name,'0'))='0' or min(coalesce(old.name,'0'))='0' ) then 'Y'
        else 'N'
        end as table_struct_change_ind length=1
  from 
    (select memname, name from dictionary.columns where libname='NEW') new
    full join
    (select memname, name from dictionary.columns where libname='OLD') old
    on new.memname=old.memname and upcase(new.name)=upcase(old.name)
  group by 
    calculated memname
  order by memname, name
  ;
quit;


/** 5. call macro once per table to be analysed **/
filename codegen temp;
data _null_;
  stop;
  file codegen;
run;
data _null_;
/*  file print;*/
  file codegen mod;
  set work.driver;
  by memname name;
  if table_newold_ind=3;

  length check_vars $32000;
  retain check_vars;
  if upcase(name) not in ('RECORD_ID') and variable_newold_ind=3 then
    check_vars=catx(' ',check_vars,name);

  if last.memname then
    do;
      put 
        '%doit(' /
        '    tbl_out=result.' memname /
        '  , tbl_old=old.' memname /
        '  , tbl_new=new.' memname /
        '  , key_vars=record_id' /
        '  , check_vars=' check_vars /
        '  , tbl_struc_change=' table_struct_change_ind /
        '  );' /
        ;

      call missing(check_vars);
    end;
run;
%include codegen /source2;
filename codegen clear;


/** 6. Print driver and result tables **/
proc format;
  value $change_ind
    'S1'='No change - Identical Table Structures'
    'U1'='Update    - Identical Table Structures'
    'I1'='Insert    - Identical Table Structures'
    'S2'='No change - Changed Table Structures'
    'U2'='Update    - Changed Table Structures'
    'I2'='Insert    - Changed Table Structures'
    ;
run;

title 'work.driver - for tables that exist in both libraries';
proc print data=work.driver;
  where table_newold_ind=3;
run;
title 'result.tableA';
proc print data=result.tableA;
  format change_ind $change_ind.;
run;
title 'result.tableC';
proc print data=result.tableC;
  format change_ind $change_ind.;
run;
title;

View solution in original post

Patrick · Posted 09-27-2023 08:17 AM

And you would then have a 3rd library with these combined tables? Should the combined tables be data replications (=new tables) or views?

Any approach will certainly have to somehow query the dictionary tables/sashelp views and then generate code.

chinna0369 · Posted 09-27-2023 08:24 AM

Yes, I would like to create a new library with all combined datasets from old and new library. They should have all records from old and new library with a new variable to indicate it's from new or old.

Tom · Posted 09-27-2023 08:42 AM

@chinna0369 wrote:
Yes, I would like to create a new library with all combined datasets from old and new library. They should have all records from old and new library with a new variable to indicate it's from new or old.

Before trying to begin creating a program to create a program you need know what program you need to create.

So how would you stack just two of these datasets into one?

Would something like this work:

data want.ds1;
  length source $8 inds $41 ;
  set old.ds1 new.ds1 indsname=inds;
  source=scan(inds,1,'.');
run;

If so then perhaps you should make a macro that takes DS1 as an input parameter and generates that code.

%macro stack(ds);
data want.&ds;
  length source $8 inds $41 ;
  set old.&ds new.&ds indsname=inds;
  source=scan(inds,1,'.');
run;
%mend stack;

Then you can just generate a list of common members between the two libraries.

proc sql;
create table dslist as
  select a.memname
  from dictionary.members a 
  inner join dictionary.members b
  on a.libname='NEW' and b.libname='OLD' and a.memname=b.memname
;
quit;

And then use that list to generate one call to the macro for each matching pair of datasets.

data _null_;
  set dslist;
  call execute(cats('%nrstr(%stack)(',memname,')'));
run;

chinna0369 · Posted 10-03-2023 11:33 AM

Hi,

Thank you for your reply!

Yes, but I am getting all records from both datasets. but I only want records from new transfer. And if any of the variable value is updated compared to old transfer then we need to create a new variable which indicates "updated". I am using below code and how can I compare all variables and to identify which variables is updated?

%macro stack(ds=, drop=);
data &ds.1;
  length source $8 inds $41 ;
  set old.&ds(in=a) db.&ds(in=b) indsname=inds;
  if a then flag='ZZZ';
  if b then flag='AAA';
  source=scan(inds,1,'.');
run;

data want;
	set &ds.1;
	drop &drop.;
run;
proc sort nodupkey dupout=chk; by _all_; run;
%mend stack;

proc sql;
create table dslist as
  select a.memname
  from dictionary.members a 
  inner join dictionary.members b
  on a.libname='DB' and b.libname='OLD' and a.memname=b.memname
;
quit;

%stack(ds=BONE_DENSITY_DXA, drop=source record_id record_type record_status record_status_text date_created date_modified record_deleted);

Thanks,

Adithya

Tom · Posted 10-03-2023 11:42 AM

Tracking CHANGES is a totally different operation than STACKING.

Do you actually need to have variables that indicate changes? Or do you just want the new data and a REPORT that shows the changes?

chinna0369 · Posted 10-03-2023 11:53 AM

Hi,

Sorry for the confusion. Here are the details I am looking for:

1. I need to create a new library with all datasets which are in new transfer.

2. If there are same in datasets in old transfer and new transfer then we need to identify the changes.

3. If there are any changes for the dataset in new transfer such as variable value is updated or a new variable is added then we need to create a new variable with a value of "updated".

Hope this is clear this time 🙂

Thanks,

Adithya

Tom · Posted 10-03-2023 12:24 PM

Detecting CHANGE at the OBSERVATION level is HARD.

Is that what you want to do?

Or do you just want a report that says which DATSETS had any changes?

Do the datasets have key variables that uniquely identify every observation? That is critical to be able to check for changes that the observation level.

Patrick · Posted 10-03-2023 08:17 PM

Sorry for the confusion. Here are the details I am looking for:

1. I need to create a new library with all datasets which are in new transfer.

2. If there are same in datasets in old transfer and new transfer then we need to identify the changes.

3. If there are any changes for the dataset in new transfer such as variable value is updated or a new variable is added then we need to create a new variable with a value of "updated".

At least how I'm reading things your requirements keep changing and I'm still not clear what you actually want.

I suggest that before writing any macro code let's first create sample code for a single case using sample data that's representative for your real data.

Please confirm that below is representative sample data and that the WANT table is the desired result using this sample data. If not then please amend the provided data and share it with us.

%let path=%sysfunc(pathname(path));
options dlcreatedir;
libname old "&path/old";
libname new "&path/new";
libname result "&path/result";

data old.TableA;
  infile datalines dsd truncover;
  input key $ var1 var2 $;
  datalines;
a,1,x
b,1,x
c,1,x
d,1,x
e,1,x
;

data new.TableA;
  infile datalines dsd truncover;
  input key $ var1 var2 $;
  datalines;
b,2,x
c,1,y
d,2,y
e,1,x
f,1,x
;

/* change_ind: u=update, i=insert/new, s=same/unchanged */
data result.TableA_WANT;
  infile datalines dsd truncover;
  input key $ var1 var2 $ change_ind $;
  datalines;
b,2,x,u
c,1,y,u
d,2,y,u
e,1,x,s
f,1,x,i
;

If above is representative of what you have and want then below a first stab how you could code to get the WANT table.

/* change_ind: u=update, i=insert/new, s=same/unchanged */
proc sql;
  create table result.TableA_WANT2 as
  select 
    n.*,
    case 
      when n.key ne o.key then 'i'
      else
/*        case(hashing('sha256',catx('|',n.var1,n.var2))=hashing('sha256',catx('|',o.var1,o.var2)))*/
        case(n.var1=o.var1 and n.var2=o.var2)
          when 0 then 'u'
          else 's'
          end
      end as change_ind
  from new.tableA n left join old.tableA o
    on n.key=o.key
  order by n.key
  ;
quit;

If using a SQL will sufficiently perform depends on your data volumes. How many rows do your tables have?

You also mention "or a new variable is added":

Would you mark all rows with a matching key as updated in such a case? Or would you eventually want some other indicator value? - Two actually: one new variable but other matching variables the same, or a new variable plus at least one of the other variables changed as well.

...and what about a variable removed?

chinna0369 · Posted 10-04-2023 11:56 AM

Hi Patrik,

First of all thank you so much for your reply!

I know I am very bad at explaining the things about what I want, sorry for that. But you got it finally. Yes, that is the output I am looking for. But my data is bit challenging. I have 30 datasets in each library to compare and my key variable is record_id and I have lot of variables to compare. Right now, I am using below code, is there any effective way?

%macro data_comp(inds=, sort=);
proc sort data=old.&inds. 
out=old1(drop=&varsdrop.); 
by _all_; 
run;

proc sort data=db.&inds. 
out=new1(drop=&varsdrop.); 
by _all_; 
run;

proc sort data=old1; by &sort.; run;
proc sort data=new1; by &sort.; run;

data both_new;
	merge new1(in=a) old1(in=b keep=&sort.);
	by &sort.;
	if a and b;
run;

data only_new;
	length flag $50.;
	merge new1(in=a) old1(in=b keep=&sort.);
	by &sort.;
	if a and not b;
	flag="New";
run;

proc compare b=old1 c=both_new out=check(where=(_type_="COMPARE")) outnoequal outbase outcomp outdif;
run;

data updated_new;
	length flag $50.;
	merge both_new(in=a) check(in=b keep=&sort.);
	by &sort.;
	if a and b then flag="Updated";
run;

data nlib.&inds.;
	set only_new updated_new;
run;
proc sort; by &sort.; run;
%mend;

%data_comp(inds=ds1, sort=record_id);

Thanks,

Adithya

Patrick · Posted 10-04-2023 11:48 PM

Hi Adithya,

I'd say it very much depends on the use case for this code. If this is just a one off then "anything" that returns the desired result will do and eventually what you've got is already good enough. If you need something re-usable that you or someone else needs to maintain in the future then you might want to spend a bit more effort.

If you're in the end after some repeated process that maintains a change history or some change log then you might want to look into Slowly Changing Dimensions and the like.

Below some code where I believe it does what you have been asking for. If the structure of your new table differs from the old table (=added or removed variables) then then code will only compare the variables that exist in both tables BUT it will also add an indicator that there is a structural change.


/** 1. init environment **/
%let path=%sysfunc(pathname(path));
options dlcreatedir;
libname old "&path/old";
libname new "&path/new";
libname result "&path/result";

/** 2. create sample data **/
data old.TableA old.TableB old.TableC;
  infile datalines dsd truncover;
  input record_id $ var1 var2 $;
  datalines;
a,1,x
b,1,x
c,1,x
d,1,x
e,1,x
;

data new.TableA(drop=var3) new.TableC(drop=var1) new.TableD;
  infile datalines dsd truncover;
  input record_id $ var1 var2 $;
  var3=99;
  datalines;
b,2,x
c,1,y
d,2,y
e,1,x
f,1,x
;


/** 3. Macro definition table analysis **/
%macro doit(
    tbl_out=
  , tbl_old=
  , tbl_new=
  , key_vars=
  , check_vars=
  , tbl_struc_change=
  , change_ind_var=change_ind
  );

  data _null_;
    length var $32 term $70 expr $32000;

    call symputx('order_expr', translate(compbl("&key_vars"),',',' '), 'l');

    if upcase("&tbl_struc_change") ne 'Y' then call symputx('chng_case','1','l');
    else call symputx('chng_case','2','l');

    do i=1 by 1 while(not missing(scan("&key_vars",i,' ')) );
      var=scan("&key_vars",i,' ');
      term=cats('n.',var,'=o.',var);
      if i=1 then expr=term;
      else expr=catx(' ',expr,'and',term);
    end;
    call symputx('on_expr', expr, 'l');

    call missing(expr);
    do i=1 by 1 while(not missing(scan("&check_vars",i,' ')) );
      var=scan("&check_vars",i,' ');
      term=cats('n.',var,'=o.',var);
      if i=1 then expr=term;
      else expr=catx(' ',expr,'and',term);
    end;
    call symputx('case_expr', expr, 'l');

  run;


  /* change_ind: u=update, i=insert/new, s=same/unchanged */
  proc sql feedback;
    create table &tbl_out as
    select 
      n.*,
      case 
        when n.record_id ne o.record_id then "I&chng_case"
        else
          case(&case_expr)
            when 0 then "U&chng_case"
            else "S&chng_case"
            end
        end as &change_ind_var length=2
    from &tbl_new n left join &tbl_old o
      on &on_expr
    order by &order_expr
    ;
  quit;
%mend;

/* test call */
/*
%doit(
    tbl_out=result.TableA
  , tbl_old=old.TableA
  , tbl_new=new.TableA
  , key_vars=record_id
  , check_vars=var1 var2
  , tbl_struc_change=n
  );
*/

/** 4. identify tables and columns for analysis **/
proc format;
  value new_old
    1='New Only'
    2='Old Only'
    other='Both'
  ;
run;

proc sql; 
  create table work.driver as
  select 
      coalesce(new.memname, old.memname) as memname
    , upcase(coalesce(new.name, old.name)) as name
    , case
        when missing(max(old.memname)) then 1
        when missing(max(new.memname)) then 2
        else 3
        end as table_newold_ind length=3 format=new_old.
    , case
        when missing(old.name) then 1
        when missing(new.name) then 2
        else 3
        end as variable_newold_ind length=3 format=new_old.
    , case
        when( min(coalesce(new.name,'0'))='0' or min(coalesce(old.name,'0'))='0' ) then 'Y'
        else 'N'
        end as table_struct_change_ind length=1
  from 
    (select memname, name from dictionary.columns where libname='NEW') new
    full join
    (select memname, name from dictionary.columns where libname='OLD') old
    on new.memname=old.memname and upcase(new.name)=upcase(old.name)
  group by 
    calculated memname
  order by memname, name
  ;
quit;


/** 5. call macro once per table to be analysed **/
filename codegen temp;
data _null_;
  stop;
  file codegen;
run;
data _null_;
/*  file print;*/
  file codegen mod;
  set work.driver;
  by memname name;
  if table_newold_ind=3;

  length check_vars $32000;
  retain check_vars;
  if upcase(name) not in ('RECORD_ID') and variable_newold_ind=3 then
    check_vars=catx(' ',check_vars,name);

  if last.memname then
    do;
      put 
        '%doit(' /
        '    tbl_out=result.' memname /
        '  , tbl_old=old.' memname /
        '  , tbl_new=new.' memname /
        '  , key_vars=record_id' /
        '  , check_vars=' check_vars /
        '  , tbl_struc_change=' table_struct_change_ind /
        '  );' /
        ;

      call missing(check_vars);
    end;
run;
%include codegen /source2;
filename codegen clear;


/** 6. Print driver and result tables **/
proc format;
  value $change_ind
    'S1'='No change - Identical Table Structures'
    'U1'='Update    - Identical Table Structures'
    'I1'='Insert    - Identical Table Structures'
    'S2'='No change - Changed Table Structures'
    'U2'='Update    - Changed Table Structures'
    'I2'='Insert    - Changed Table Structures'
    ;
run;

title 'work.driver - for tables that exist in both libraries';
proc print data=work.driver;
  where table_newold_ind=3;
run;
title 'result.tableA';
proc print data=result.tableA;
  format change_ind $change_ind.;
run;
title 'result.tableC';
proc print data=result.tableC;
  format change_ind $change_ind.;
run;
title;

Stack all datasets from two libraries

Re: Stack all datasets from two libraries

Re: Stack all datasets from two libraries

Re: Stack all datasets from two libraries

Re: Stack all datasets from two libraries

Re: Stack all datasets from two libraries

Re: Stack all datasets from two libraries

Re: Stack all datasets from two libraries

Re: Stack all datasets from two libraries

Re: Stack all datasets from two libraries

Re: Stack all datasets from two libraries

Re: Stack all datasets from two libraries

Registration is open

SAS Training: Just a Click Away