Re: Identifying duplicated of pair of id

rsva · Posted 10-02-2015 02:50 PM

Greetings,

I have a data similar to below

id1 id2

--------------------

ds246 jk900

kl78 gh456

jk900 ds246

ko000 rt234

gh456 kl789

nm879 th45

If you look closely, 1st and 3rd records are the same pair, but the ids are flipped. I need to identify such pairs to deduplicate the data. I mean instead of counting the pair of ids twice, I need to count them once. Any guidance would help.

Thanks.

rsva

Astounding · Posted 10-02-2015 02:57 PM

Assuming that you are free to play with the data and the program completes in a reasonable amount of time, here's one way.

data want;

set have;

if id1 < id2 then do;

id1_sorted = id1;

id2_sorted = id2;

end;

else do;

id1_sorted = id2;

id2_sorted = id1;

end;

run;

proc sort data=want NODUPKEY;

by id1_sorted id2_sorted;

run;

Optionally, you could then drop id1_sorted and id2_sorted;

Patrick · Posted 10-05-2015 09:53 PM

Just another way of doing it using a hash table. This code assumes that you want to keep only a single row per unique combination of ID1 & ID2.

data have;
input id1 $ id2 $;
datalines;
ds246 jk900
kl78 gh456
jk900 ds246
ko000 rt234
gh456 kl789
nm879 th45
;
run;

data want;
set have;
if _n_=1 then
do;
dcl hash h1();
h1.defineKey('id1','id2');
h1.defineDone();
end;
if h1.check() ne 0 then
do;
output;
h1.add();
h1.add(key:id2,key:id1, data&colon;id2, data&colon;id1);
end;
run;

Reeza · Posted 10-02-2015 02:59 PM

How big is your data?
A simple way is to create two new variables and use a proc sql with distinct or proc sort with no dupkey.

/*Set up fake data*/
data have;
input id1  $ id2 $;
cards;
ds246       jk900
kl78          gh456
jk900        ds246
ko000       rt234
gh456      kl789
nm879     th45
;

/*Assign to new variables in fixed order*/
data want;
set have;
if id1<id2 then do;
	id3=id1;
	id4=id2;
end;
else do;
id3=id2;
id4=id1;
end;
run;

*Unique via SQL;
proc sql; 
create table want_sql as 
select distinct id3, id4
from want
order by id3, id4;
quit;

*unique records via Sort;
Proc sort data=want (keep=id3 id4) out=want_sort nodupkey;
by id3 id4;
run;

MikeZdeb · Posted 10-02-2015 08:08 PM

Hi, another idea ...

data x;
input id1 $ id2 $;
cards;
ds246 jk900
kl78 gh456
jk900 ds246
ko000 rt234
gh456 kl789
nm879 th45
;

data y;
set x;
y1 = id1;
y2 = id2;
array y(2) ;
call sortc(of y(*));
run;

proc sort data=y out=y (drop=y: ) nodupkey;
by y: ;

run;

data set y ...

Obs id1 id2

1     ds246    jk900
2     kl78     gh456
3     gh456    kl789
4     ko000    rt234
5     nm879    th45

If changing the order of ID1 and ID2 doesn't matter, simpler code ...

data y;
set x;
array id(2) ;
call sortc(of id(*));
run;

proc sort data=y nodupkey;
by id: ;
run;

hbi · Posted 10-05-2015 12:37 PM

[edited to change MAX() statement to equality test within CASE WHEN ... ]

Hi,

This PROC SQL solution works well with pairs, since a pair of values will always have a minimum and maximum.

If one were dealing with n-tuples (id1, id2, id3, etc.), then a different solution would be needed.

-- hbi

PROC SQL;
    CREATE TABLE want AS 
    SELECT DISTINCT CASE WHEN id1 <= id2 THEN id1 ELSE id2 END AS id1_fixed
                  , CASE WHEN id1 <  id2 THEN id2 ELSE id1 END AS id2_fixed
    FROM have;
QUIT;

PROC SQL;
    SELECT COUNT(*) AS num_distinct_obs
    FROM want;
QUIT;

hbi · Posted 10-05-2015 01:12 PM

This version is just for fun. It creates a hash for each pair of IDs. This version is only useful if you want a count of the number of distinct ID pairs.

PROC SQL;
  /* create unique MD5 hash based on ID pair */
  CREATE TABLE id_pairs_hashed AS 
  SELECT CASE WHEN id1 < id2 THEN PUT(MD5(CATS(id1, "|", id2)), $hex32.)
              ELSE                PUT(MD5(CATS(id2, "|", id1)), $hex32.)
         END AS id_pair_md5 length=32
  FROM have;
QUIT;


PROC SQL;
  SELECT COUNT(DISTINCT id_pair_md5) AS num_distinct_obs
  FROM id_pairs_hashed;
QUIT;

Classroom Training Available!