topic Re: Remove duplicate ids using group by and an indicator based on values in another column? in SAS Programming

Remove duplicate ids using group by and an indicator based on values in another column?

N3w_kid29 — Sun, 19 Jan 2020 18:03:29 GMT

Hi I posted this question on another forum. I asked how to do it in R. I would like to do it in SAS as well.

Ids are duplicated because of multiple types in another column. I would like to remove duplicate ids and have an indicator column for specific types instead. Here's what I have and need (written in R):

have<-data.frame(id=c(1,1,2,3,3,3,4,5,5,6))
have$type<-c("healthy","healthy","injury1","healthy","injury2",
         "injury1","healthy","injury2","healthy","injury2")

need<-data.frame(id=c(1,2,3,4,5,6))
need$injury_ind<-c(0,1,1,0,1,1)

Solution (written in R):

have %>%
   group_by(id) %>% 
   summarise(injury_id = +(any(type %in% c('injury1', 'injury2'))))
# A tibble: 6 x 2
#     id injury_id
#  <dbl>     <int>
#1     1         0
#2     2         1
#3     3         1
#4     4         0
#5     5         1
#6     6         1

Any help would be appreciated.

Re: Remove duplicate ids using group by and an indicator based on values in another column?

unison — Sun, 19 Jan 2020 19:38:42 GMT

Try this.

*Have;
data have;
input id type $ @@;
datalines;
1 healthy 1 healthy
2 injury1
3 healthy 3 injury2 3 injury1
4 healthy 
5 injury2 5 healthy 
6 injury2
;
run;

*Your desired/expected output;
data desired;
input id injury_id @@;
datalines;
1 0 2 1 3 1
4 0 5 1 6 1
;
run;

*Flag injuries;
data want;
set have;
injury_id = (type in ('injury1' 'injury2'));
run;

*Sort by id, descending injury_id (since injury_id=1 is wanted over injury_id=0);
proc sort data=want; by id descending injury_id; run;
*Sort by id -- deduplicate output;
proc sort data=want(drop=type) nodupkey; by id; run;

*Compare desired to want to verify that the output is expected;
proc compare base=desired compare=want;
run;

Re: Remove duplicate ids using group by and an indicator based on values in another column?

ed_sas_member — Sun, 19 Jan 2020 19:42:56 GMT

data have;
	input id type $;
	datalines;
1 healthy
1 healthy
2 injury1
3 healthy
3 injury2
3 injury1
4 healthy
5 injury2
5 healthy
6 injury2
;
run;

/*STEP1 */
data have_flag;
	set have;
	by ID;
	if type in ("injury1", "injury2") then flag = 1;
	else flag = 0;
run;

/*STEP2 */
proc means data=have_flag max noprint;
	var flag;
	class ID;
	ways 1;
	output out=want (drop=_type_ _freq_) max=injury_ind;
run;

/*OR*/

proc sql;
	create table want as
	select ID, max(flag) as injury_ind
	from have_flag
	group by ID;
quit;

Re: Remove duplicate ids using group by and an indicator based on values in another column?

hashman — Sun, 19 Jan 2020 22:12:30 GMT

@N3w_kid29:

Using the hash object, it can be done in a single pass without the assumption that the input is sorted and without sorting. The ordered:"a" argument tag causes the hash items to be stored in order by ID - and appear such in the output - but it doesn't involve any sorting overhead because the hash items are inserted into the underlying AVL trees in order, so while the output is rendered, they are merely interleaved. (The same sorting effect can be achieved by using argument tag hashexp:0 instead of ordered:"a", as in this case, there's only one AVL tree, and it is traversed in ascending order. However, if the input data are sizable, it is less efficient.)

data have ;                                                                                                                             
  input id type $ ;                                                                                                                     
  cards ;                                                                                                                               
3  injury2                                                                                                                              
2  injury1                                                                                                                              
6  injury2                                                                                                                              
3  injury1                                                                                                                              
1  healthy                                                                                                                              
4  healthy                                                                                                                              
3  healthy                                                                                                                              
5  injury2                                                                                                                              
1  healthy                                                                                                                              
5  healthy                                                                                                                              
;                                                                                                                                       
run ;                                                                                                                                   
                                                                                                                                        
data _null_ ;                                                                                                                           
  dcl hash h (ordered:"a") ;                                                                                                           
  h.definekey ("id") ;                                                                                                                  
  h.definedata ("id", "injury_id") ;                                                                                                    
  h.definedone () ;                                                                                                                     
  do until (z) ;                                                                                                                        
    set have end = z ;                                                                                                                  
    _id = type =: "injury" ;                                                                                                            
    if h.find() ne 0 then injury_id = ._ ;                                                                                              
    if _id > injury_id then h.replace (key:id, data:id, data:_id) ;                                                                     
  end ;                                                                                                                                 
  h.output (dataset:"want") ;                                                                                                           
run ;

Kind regards

Paul D.