Compare values across multiple variables and flag duplicates

sivakoya · Posted 07-14-2017 01:13 PM

ID	cell(char-variable)	home(char-variable)	work(char-variable)	Dup_phone	All_same	Dup_from
1	(XXX)XXX-XXXX	(XXX)XXX-XXXX	(YYY)YYY-YYYY	Y	N	CELL_HOME
2	(ZZZ)ZZZ-ZZZZ	(YYY)YYY-YYYY	(ZZZ)ZZZ-ZZZZ	Y	N	CELL_WORK
3	(ZZZ)ZZZ-ZZZZ	(YYY)YYY-YYYY	(YYY)YYY-YYYY	Y	N	HOME_WORK
4	(XXX)XXX-XXXX	(XXX)XXX-XXXX	(XXX)XXX-XXXX	Y	Y	ALL_DUP
5	(XXX)XXX-XXXX	(YYY)YYY-YYYY	(ZZZ)ZZZ-ZZZZ	N	N	NO_DUP
6	(XXX)XXX-XXXX	missing	missing	N	N	NO_DUP

I have the above dataset with ID, cell, home, and work variables. For each ID, I need to check whether there is any duplicate phone information and also identify which ones are duplicates. I need to be able to create the last three columns of the above data. Missing values shouldn't be counted as equal. Is there any quick logic using a data step? Thanks!

kiranv_ · Posted 07-14-2017 02:27 PM

Something like this:

/* Build the sample table HAVE: one row per ID with cell, home and work phones. */
data have;
  /* TRUNCOVER: a short record (ID 6 has only a cell number) leaves the
     remaining variables missing instead of reading into the next line. */
  infile datalines truncover;
  /* "(XXX)XXX-XXXX" is 13 characters. The earlier $12. informat fixed the
     variable length at 12, so list input silently dropped the last character.
     id is kept as a character variable, exactly as before. */
  informat id $12. cell home work $13.;
  input id $ cell $ home $ work $;
datalines;
1 (XXX)XXX-XXXX (XXX)XXX-XXXX (YYY)YYY-YYYY
2 (ZZZ)ZZZ-ZZZZ (YYY)YYY-YYYY (ZZZ)ZZZ-ZZZZ
3 (ZZZ)ZZZ-ZZZZ (YYY)YYY-YYYY (YYY)YYY-YYYY
4 (XXX)XXX-XXXX (XXX)XXX-XXXX (XXX)XXX-XXXX
5 (XXX)XXX-XXXX (YYY)YYY-YYYY (ZZZ)ZZZ-ZZZZ
6 (XXX)XXX-XXXX
;
run;



/* Report, for every row of HAVE, whether any two phone numbers match
   (dup_phone), whether all three match (all_same), and which pair matches
   (Dup_from). Missing values never count as a match.

   In SAS two blank character values compare as equal, so each equality test
   is guarded by a not-missing check on one of the two operands being
   compared (if they are equal and one is non-missing, so is the other).
   The earlier version guarded every pair with cell/home, which wrongly
   rejected a home = work match when cell was missing and a cell = work
   match when home was missing. */
proc sql;
  select id,      /* included so each result row can be tied back to its ID */
         cell,
         home,
         work,
         case
           when (cell = home and cell is not missing)
             or (home = work and home is not missing)
             or (cell = work and cell is not missing) then 'Y'
           else 'N'
         end as dup_phone,
         case
           when cell = home and home = work and cell is not missing then 'Y'
           else 'N'
         end as all_same,
         case
           /* the three-way match is tested first so it is not reported as a pair */
           when cell = home and home = work and cell is not missing then 'ALL_DUP'
           when cell = home and cell is not missing then 'CELL_HOME'
           when home = work and home is not missing then 'HOME_WORK'
           when cell = work and cell is not missing then 'CELL_WORK'
           else 'NO_DUP'
         end as Dup_from
    from have;
quit;

Ksharp · Posted 07-15-2017 09:29 AM

/* Build the sample table HAVE: one row per ID with cell, home and work phones. */
data have;
  /* TRUNCOVER keeps a short record (ID 6) from pulling values off the next
     line; the unread variables are simply left missing. */
  infile datalines truncover;
  /* Each phone string "(XXX)XXX-XXXX" has 13 characters. A $12. informat
     sets the variable length to 12 and list input then truncates the value,
     so the phones are declared as $13. id remains character, as before. */
  informat id $12. cell home work $13.;
  input id $ cell $ home $ work $;
datalines;
1 (XXX)XXX-XXXX (XXX)XXX-XXXX (YYY)YYY-YYYY
2 (ZZZ)ZZZ-ZZZZ (YYY)YYY-YYYY (ZZZ)ZZZ-ZZZZ
3 (ZZZ)ZZZ-ZZZZ (YYY)YYY-YYYY (YYY)YYY-YYYY
4 (XXX)XXX-XXXX (XXX)XXX-XXXX (XXX)XXX-XXXX
5 (XXX)XXX-XXXX (YYY)YYY-YYYY (ZZZ)ZZZ-ZZZZ
6 (XXX)XXX-XXXX
;
run;

/* For each row of HAVE derive:
     dup_phone - 'Y' when at least two non-missing phones are equal
     all_same  - 'Y' only when every phone is present and all are equal
     dup_from  - 'All_dup', 'All_missing', 'No_dup', or a '|'-separated list
                 of matching pairs built from the variable names
   Missing values are never treated as equal to one another. */
data want;
  if _n_ = 1 then do;
    /* Key-only hash used as a set: after loading one row's phones,
       num_items is the number of distinct values (blank counts as one). */
    length k $ 100;
    declare hash h();
    h.definekey('k');
    h.definedone();
  end;
  set have;
  length dup_phone all_same $ 1 dup_from temp $ 200;
  array x{*} $ cell home work;

  /* Load this row's values; replace() keeps a single entry per distinct value. */
  do i = 1 to dim(x);
    k = x{i};
    h.replace();
  end;

  if h.num_items = 1 then do;
    /* One distinct value: either every phone is the same real number
       or every phone is missing. */
    if cmiss(of x{*}) = 0 then do;
      dup_phone = 'Y';
      dup_from  = 'All_dup';
      all_same  = 'Y';
    end;
    else do;
      dup_phone = 'N';
      dup_from  = 'All_missing';
      /* Fix: an all-missing row used to get all_same='Y', which treated
         missing values as equal, contrary to the stated requirement. */
      all_same  = 'N';
    end;
  end;
  else do;
    dup_phone = 'N';
    dup_from  = 'No_dup';
    /* Compare every unordered pair; only non-missing matches count. */
    do i = 1 to dim(x) - 1;
      do j = i + 1 to dim(x);
        if not missing(x{i}) and not missing(x{j}) and x{i} = x{j} then do;
          yes = 1;
          dup_phone = 'Y';
          temp = catx('|', temp, cats(vname(x{i}), '_', vname(x{j})));
        end;
      end;
    end;
    if yes then dup_from = temp;
    all_same = 'N';
  end;

  /* Empty the hash so the next row starts with no stored values. */
  h.clear();
  drop i j k temp yes;
run;

Compare values across multiple variables and flag duplicates

Re: Compare values across multiple variables and flag duplicates

Re: Compare values across multiple variables and flag duplicates

Compare values across multiple variables and flag duplicates

Re: Compare values across multiple variables and flag duplicates

Re: Compare values across multiple variables and flag duplicates

Click image to register for webinar

Classroom Training Available!