Solved: Re: assign unique ID for each individual with missing data and multipl...

daufoi · Posted 01-02-2023 06:36 PM

I'm trying to assign a unique ID for each individual. Each individual may have some missing data and multiple records. My thoughts are to subset the data first by patID and assign them consecutive numbers. Then, subset the data that has DOB but missing ID, and assign them consecutive numbers. Then, subset the data that has a case number but missing ID and missing DOB. I am having difficulty merging this caseID data back to the original data set. I tried several things (merge, proc sql join, proc sql update) but I am going in circles. Any advice on this?

Have:

patID caseNUM patDOB
1 39 15-Jan-00
1 39 15-Jan-00
1 39 15-Jan-00
2 61 16-Feb-01
3 57 17-Mar-02
3 57 17-Mar-02
. 95 18-Apr-03
. 94 19-Apr-04
. 94 19-Apr-04
. 37 21-Apr-05
. 57 .
. 90 .
. 96 .
. 96 .
. 96 .
. 96 .
. 96 .

Want:

patID caseNUM patDOB caseID
1 39 15-Jan-00 1
1 39 15-Jan-00 1
1 39 15-Jan-00 1
2 61 16-Feb-01 2
3 57 17-Mar-02 3
3 57 17-Mar-02 3
. 95 18-Apr-03 4
. 94 19-Apr-04 5
. 94 19-Apr-04 5
. 37 21-Apr-05 6
. 57 . 7
. 90 . 8
. 96 . 9
. 96 . 9
. 96 . 9
. 96 . 9
. 96 . 9

Here is the code I have so far. I can accept changes to this code but due to the nature of the data, I need to assign the caseID first by patID, then by patDOB, then by caseNUM.

data fake_data;
input patID $1 caseNUM 3. patDOB :date9. ;
format patDOB date9.;
datalines;
1	39	15Jan2000
1	39	15Jan2000
1	39	15Jan2000
2	61	16Feb2001
3	57	17Mar2002
3	57	17Mar2002
.	95	18Apr2003
.	94	19Apr2004
.	94	19Apr2004
.	37	21Apr2005
.	57	.
.	90	.
.	96	.
.	96	.
.	96	.
.	96	.
.	96	.
;


*generate caseID for non-missing patID;
data no_missing;
	set fake_data;
	where not missing(patID);
run;

proc sort data=no_missing nodupkey; by patID; run;
data caseID_1;
	set no_missing;
	caseID = _n_;
	keep patID caseID;
run;

*generate caseID for missing patID, non-missing DOB;
data missing_patID;
	set fake_data;
	where missing(patID) and not missing(patDOB);
run;

proc sort data=missing_patID nodupkey; by patDOB; run;
data caseID_2;
	set missing_patID;
	caseID = _n_ + 3;
	keep patID patDOB caseID;
run;

*generate caseID for missing patID and missing DOB;
data missing_patDOB;
	set fake_data;
	where missing(patID) and missing(patDOB);
run;

proc sort data=missing_patDOB nodupkey; by caseNUM; run;
data caseID_3;
	set missing_patDOB;
	caseID = _n_ + 6;
	keep patID patDOB caseNUM caseID;
run;

daufoi · Posted 01-02-2023 11:39 PM

@Patrickunfortunately neither of these options would work because the multiple patients have the same caseNUM. That's why I need to assign the caseID first by patID, then by patDOB, then by caseNUM. Doing it that way the caseNUM won't be shared between patients. proc sql update works, I was just missing 1 line.

proc sql;
create table add_caseID as
	select F.*, C.caseID
	from fake_data as F
	left join caseID_1 as C
    on C.patID = F.patID;
update add_caseID as O
	set caseID = (select caseID from caseID_2 as U where missing(O.patID) and O.patDOB = U.patDOB)
		where missing(O.patID)
			and O.patDOB in (select patDOB from caseID_2);
update add_caseID as O
	set caseID = (select caseID from caseID_3 as U where missing(O.patID) and missing(O.patDOB) and O.caseNUM = U.caseNUM)
		where missing(O.patID)
			and missing(O.patDOB)
			and O.caseNUM in (select caseNUM from caseID_3);
quit;

View solution in original post

Patrick · Posted 01-02-2023 10:09 PM

Below two options.

Data HAVE.

data have;
  infile datalines truncover;
  input patID caseNUM patDOB:date9.;
  format patDOB date9.;
datalines;
1 39 15-Jan-00
1 39 15-Jan-00
1 39 15-Jan-00
2 61 16-Feb-01
3 57 17-Mar-02
3 57 17-Mar-02
. 95 18-Apr-03
. 94 19-Apr-04
. 94 19-Apr-04
. 37 21-Apr-05
. 57 .
. 90 .
. 96 .
. 96 .
. 96 .
. 96 .
. 96 .
;

Option 1

data want(drop=_:);
  if _n_=1 then
    do;
      length caseID 8;
      dcl hash h1();
      h1.defineKey('patID','caseNUM');
      h1.defineData('caseID');
      h1.defineDone();
    end;
  set have;
  if h1.find() ne 0 then 
    do;
      caseID=h1.num_items + 1;
      _rc=h1.add();
    end;
run;

Option 2

proc sort data=have out=want;
  by patID caseNUM;
run;

proc sort nodupkey data=want out=cases(keep=patID caseNUM);
  by patID caseNUM;
run;

data want;
  merge want cases;
  by patID caseNUM;
  if first.caseNUM then caseID+1;
run;

daufoi · Posted 01-02-2023 11:39 PM

@Patrickunfortunately neither of these options would work because the multiple patients have the same caseNUM. That's why I need to assign the caseID first by patID, then by patDOB, then by caseNUM. Doing it that way the caseNUM won't be shared between patients. proc sql update works, I was just missing 1 line.

proc sql;
create table add_caseID as
	select F.*, C.caseID
	from fake_data as F
	left join caseID_1 as C
    on C.patID = F.patID;
update add_caseID as O
	set caseID = (select caseID from caseID_2 as U where missing(O.patID) and O.patDOB = U.patDOB)
		where missing(O.patID)
			and O.patDOB in (select patDOB from caseID_2);
update add_caseID as O
	set caseID = (select caseID from caseID_3 as U where missing(O.patID) and missing(O.patDOB) and O.caseNUM = U.caseNUM)
		where missing(O.patID)
			and missing(O.patDOB)
			and O.caseNUM in (select caseNUM from caseID_3);
quit;

Patrick · Posted 01-03-2023 02:18 AM

@daufoi Then your sample data wasn't sufficiently representative because the first option I posted returns exactly what you've shown us in your WANT data.

If DoB is also a key then you would just need to add it as well to the hash key - or the sort key for option 2.

I still believe that what I've posted will do the job. From what you describe you've got nothing else than a composite key (PatientID, CaseNum, DoB) and you need a caseID per unique value combination of this composite key. And if looking at things this way then code as simple as below should already do the job.

data have;
  infile datalines truncover;
  input patID caseNUM patDOB:date9.;
  format patDOB date9.;
datalines;
1 39 15-Jan-00
1 39 15-Jan-00
1 39 15-Jan-00
2 61 16-Feb-01
3 57 17-Mar-02
3 57 17-Mar-02
. 95 18-Apr-03
. 94 19-Apr-04
. 94 19-Apr-04
. 37 21-Apr-05
. 57 .
. 90 .
. 96 .
. 96 .
. 96 .
. 96 .
. 96 .
;

proc sort data=have out=want;
  by patID caseNUM patDOB;
run;

data want;
  set want;
  by patID caseNUM patDOB;
  if first.patDOB then caseID+1;
run;

Ksharp · Posted 01-03-2023 06:40 AM

data fake_data;
input patID $ caseNUM  patDOB :date9. ;
format patDOB date9.;
datalines;
1 39 15Jan2000
1 39 15Jan2000
1 39 15Jan2000
2 61 16Feb2001
3 57 17Mar2002
3 57 17Mar2002
. 95 18Apr2003
. 94 19Apr2004
. 94 19Apr2004
. 37 21Apr2005
. 57 .
. 90 .
. 96 .
. 96 .
. 96 .
. 96 .
. 96 .
;

options missing=' ';
data temp;
 set fake_data;
 group=coalescec(vvalue(patID),vvalue(patDOB),vvalue(caseNUM));
run;
data want;
 set temp;
 by group notsorted;
 caseID+first.group;
 drop group;
run;

daufoi · Posted 01-03-2023 02:19 PM

@Patrickyou're right. It does do what I need. I had added patients that use the same caseNUM and yours does exactly what I need!

@Ksharpalso does what is intended!

Thanks to both! Both your code is more efficient than mine! 3 solutions to the same problem! hah!

assign unique ID for each individual with missing data and multiple records

Re: assign unique ID for each individual with missing data and multiple records

Re: assign unique ID for each individual with missing data and multiple records

Re: assign unique ID for each individual with missing data and multiple records

Re: assign unique ID for each individual with missing data and multiple records

Re: assign unique ID for each individual with missing data and multiple records

Re: assign unique ID for each individual with missing data and multiple records

Classroom Training Available!