Solved: Flagging column differences among rows with the same ID

jnivi · Posted 06-13-2023 10:54 PM

Hi there,

I have a very large dataset with 40 variables, where an ID can repeat more than once. When an ID appears more than once, is there a way I can flag differences in variable values between the rows?

ID Fruit Color Vehicle Year

1 Apple Blue Car 2001

1 Apple Purple Car 2001

2 Banana Red Van 2000

2 Banana Blue Car 2000

In the above example, I would like to flag that color differs for ID 1 , and that color and vehicle differs for ID 2.

Thank you for any suggestions that you might be able to offer!

Ksharp · Posted 06-14-2023 07:59 AM

data have;
input ID       ( Fruit          Color           Vehicle  ) ($)          Year;
cards;
1          Apple        Blue              Car                   2001
1          Apple       Purple           Car                   2001
2         Banana    Red                Van                  2000
2        Banana    Blue               Car                    2000
;

proc transpose data=have(obs=0) out=temp;
var _all_;
run;
proc sql noprint;
select cat('count(distinct ',_name_,') as ',_name_) into :count separated by ' ,' from temp;

create table temp2 as
select id as _id,&count. from have group by id;
quit;
proc transpose data=temp2 out=want(where=(col1 ne 1));
by _id;
run;

View solution in original post

FreelanceReinh · Posted 06-14-2023 07:02 AM

Hi @jnivi,

How about creating a dataset containing the ID and a character variable DIFF: a string of flags which are "1" if the corresponding variable in your dataset has two or more different values for that ID and "0" otherwise? So the result for your sample data would look like this:

For ID 1 only the second variable (after ID), i.e. Color, exhibits a difference. For ID 2 it's also the third, Vehicle.

data have;
input ID Fruit $ Color $ Vehicle $ Year;
cards;
1 Apple Blue Car 2001
1 Apple Purple Car 2001
2 Banana Red Van 2000
2 Banana Blue Car 2000
;

filename compvals temp;

data _null_;
file compvals;
set sashelp.vcolumn end=last;
where libname='WORK' & memname='HAVE' & name ne 'ID';
pos=varnum-1;
put 'if ' name 'ne lag(' name +(-1) ') then substr(diff,' pos +(-1) ',1)="1";';
if last then call symputx('nvars',_n_);
run;

data want(keep=id diff);
do until(last.id);
  set have;
  by id;
  length diff $&nvars;
  %inc compvals;
  if first.id then diff=repeat('0',&nvars-1);
end;
run;

If you are using the VALIDVARNAME=ANY system option and non-standard variable names (e.g., names containing blanks), the code above needs to be tweaked a bit (NLITERAL function).

Ksharp · Posted 06-14-2023 07:59 AM

data have;
input ID       ( Fruit          Color           Vehicle  ) ($)          Year;
cards;
1          Apple        Blue              Car                   2001
1          Apple       Purple           Car                   2001
2         Banana    Red                Van                  2000
2        Banana    Blue               Car                    2000
;

proc transpose data=have(obs=0) out=temp;
var _all_;
run;
proc sql noprint;
select cat('count(distinct ',_name_,') as ',_name_) into :count separated by ' ,' from temp;

create table temp2 as
select id as _id,&count. from have group by id;
quit;
proc transpose data=temp2 out=want(where=(col1 ne 1));
by _id;
run;

jnivi · Posted 06-21-2023 12:39 PM

Thank you!! This was very helpful and worked! Some of the differences between the records with same ID was actually due to missingness. Is there a way that this code can be modified so that "NA" values don't contribute to the difference in variable values amongst records with the same ID?

Ksharp · Posted 06-22-2023 08:36 AM

/*
You could set "NA" be missing before counting it.
*/
data have;
input ID       ( Fruit          Color           Vehicle  ) ($)          Year;
cards;
1          Apple        Blue              Car                   2001
1          Apple       NA           Car                   2001
2         Banana    Red                Van                  2000
2        Banana    Blue               Car                    2000
;

data have;
 set have;
 array x{*} $ _character_;
do i=1 to dim(x);
 if x{i}="NA" then call missing(x{i});
end;
drop i;
run;

proc transpose data=have(obs=0) out=temp;
var _all_;
run;
proc sql noprint;
select cat('count(distinct ',_name_,') as ',_name_) into :count separated by ' ,' from temp;

create table temp2 as
select id as _id,&count. from have group by id;
quit;
proc transpose data=temp2 out=want(where=(col1 ne 1));
by _id;
run;

Flagging column differences among rows with the same ID

Re: Flagging column differences among rows with the same ID

Re: Flagging column differences among rows with the same ID

Re: Flagging column differences among rows with the same ID

Re: Flagging column differences among rows with the same ID

Re: Flagging column differences among rows with the same ID

Flagging column differences among rows with the same ID

Re: Flagging column differences among rows with the same ID

Re: Flagging column differences among rows with the same ID

Re: Flagging column differences among rows with the same ID

Re: Flagging column differences among rows with the same ID

Re: Flagging column differences among rows with the same ID

Registration is open

SAS Training: Just a Click Away