Removing repeat IDs

Melk · Posted 10-11-2017 05:10 PM

I have a data set structured as follows:

ID repeat firstID

1 0

2 0

3 1 2

4 0

5 0

6 1 4,5

So I need to keep only the new IDs for patients who were repeats. In this example, I would need to remove 2, 4, 5. Some have multiple previous IDs, so there is a comma separating them. What is the most efficient way to do this?

ChrisBrooks · Posted 10-11-2017 08:32 PM

Making some assumptions about the format of the data and the variables you want to keep in the output, this should give you what you're asking for:

/* Build the sample data set from the question, one row per ID.
   id      - the ID of the record (1 character).
   repeat  - flag from the question: 1 on rows that list earlier IDs, else 0.
   firstid - comma-separated list of earlier ID(s); blank when there are none.
   NOTE: the data lines below are TAB-delimited (dlm='09'x). Keep the tabs
   when copying this code, otherwise the fields will not be split. */
data have;
	length id $1 repeat 8 firstid $3;
	/* TRUNCOVER only: MISSOVER and TRUNCOVER are alternative end-of-line
	   behaviours, so naming both is contradictory. With TRUNCOVER a short
	   line leaves firstid blank instead of reading on into the next line. */
	infile datalines dlm='09'x truncover;
	input id repeat firstid;
	datalines;
1	0
2	0
3	1	2
4	0
5	0
6	1	4,5
;
run;

/* Collect the non-blank firstid lists from HAVE. Each selected row names
   one or more earlier IDs that must be removed later. */
proc sql;
	create table repeats as
		select firstid
		from have
		where not missing(firstid);
quit;

/* Turn each comma-separated firstid list in REPEATS into one row per ID.
   Output: REPEATSLONG with the single variable id.

   COUNTW gives the number of comma-separated tokens directly, so a list
   holding a single ID needs no separate branch. Unlike counting commas and
   adding one, it does not produce extra blank IDs for a trailing or doubled
   comma (e.g. "4,5,"). */
data repeatslong(keep=id);
	set repeats;
	do i = 1 to countw(firstid, ",");
		/* id is first defined here, so its length follows firstid. */
		id = scan(firstid, i, ",");
		/* Left-align so a list typed as "4, 5" still yields "5", not " 5". */
		id = left(id);
		/* Skip tokens that consist only of blanks. */
		if not missing(id) then output;
	end;
run;

/* Keep the id and repeat columns of every HAVE row whose id does not
   appear in REPEATSLONG (the list of earlier IDs to remove). */
proc sql;
	create table want as
		select h.id,
		       h.repeat
		from have as h
		where h.id not in (select r.id from repeatslong as r);
quit;

Patrick · Posted 10-11-2017 09:16 PM

@Melk

I tend to use SAS hash tables to create black-lists or white-lists. Below is an example.

/* Sample data from the question: id, repeat flag, and the comma-separated
   list of earlier IDs (firstid, blank when there are none).
   The data lines are TAB-delimited - keep the tabs when copying. */
data have;
	length id $1;
	length repeat 8;
	length firstid $3;
	/* TRUNCOVER: a line without a third field leaves firstid blank. */
	infile cards truncover dlm='09'x;
	input id repeat firstid;
	cards;
1	0
2	0
3	1	2
4	0
5	0
6	1	4,5
;
run;

/* Write to WANT every row of HAVE whose id is not named in any firstid list.
   A hash object keyed on id serves as the black-list. Variables whose names
   start with an underscore are work variables and are dropped from WANT. */
data want(drop=_:);

  /* First iteration only: read HAVE once and load the black-list. */
  if _n_ = 1 then do;
    declare hash h1();
    h1.defineKey('id');
    h1.defineDone();

    do until(_eof);
      set have(keep=firstid) end=_eof;
      if not missing(firstid) then do;
        /* Walk the comma-separated tokens until SCAN returns a blank. */
        _i = 1;
        id = scan(firstid, _i, ',');
        do while(not missing(id));
          /* REF rather than ADD, so an ID listed more than once is not an error. */
          h1.ref();
          _i = _i + 1;
          id = scan(firstid, _i, ',');
        end;
      end;
    end;
  end;

  /* Main pass: CHECK returns 0 when the key is found, so a non-zero
     return code means the id is not black-listed and the row is kept. */
  set have;
  _rc = h1.check();
  if _rc ne 0 then output;

run;

s_lassen · Posted 10-12-2017 07:16 AM

@Melk:

You can also do it with a relatively simple datastep solution:

/* Sample data from the question, blank-delimited: id, repeat flag, and the
   comma-separated list of earlier IDs (firstid, blank when there are none). */
data have;
  length id $1;
  length repeat 8;
  length firstid $3;
  /* TRUNCOVER: a line without a third field leaves firstid blank. */
  infile datalines truncover;
  input id repeat firstid;
datalines;
1 0
2 0
3 1 2
4 0
5 0
6 1 4,5
;
run;

/* Build DROP: one row per ID named in a non-blank firstid list of HAVE.
   Only id is kept. It is overwritten with each token of the list in turn,
   so it keeps the length it has in HAVE. */
data drop(keep=id);
  set have(where=(not missing(firstid)));
  do _word = 1 to countw(firstid, ',');
    id = scan(firstid, _word, ',');
    output;
  end;
run;

/* Order DROP by id (sorted in place) so it can be used in a BY-group merge. */
proc sort data=drop out=drop;
  by id;
run;

/* Match HAVE against DROP by id and discard every id that occurs in DROP.
   NOTE(review): a BY-group merge needs both inputs in id order. DROP is
   sorted explicitly; HAVE is assumed to be in id order already - confirm
   for real data. */
data want;
  merge have
        drop(in=_listed);
  by id;
  if _listed then delete;
run;

Removing repeat IDs

Re: Removing repeat IDs

Re: Removing repeat IDs

Re: Removing repeat IDs

Removing repeat IDs

Re: Removing repeat IDs

Re: Removing repeat IDs

Re: Removing repeat IDs

Click image to register for webinar

Classroom Training Available!