Solved: Re: How to identify family linkages within and across generations?

al15 · Posted 07-31-2023 03:49 PM

I have a dataset that contains information on multiple family generations. I am fortunate that everyone in the dataset has an ID number, and mothers and fathers are identified.

I need to organize the dataset so that I can identify families in which the mother, father and child have information in my dataset (triads and dyads). I need to identify generations of individuals as well.

I have tried using sql, adapting the great code by @Ksharp from the post “identifying sibling structures in dataset” and other steps, but none of these approaches yields what I need.

I am well and truly stuck, and any and all advice is appreciated. I am using SAS Enterprise v7.15.

Here is what I have:

id	sex	yob	momid	dadid
77	M	1929	.	.
69	F	1940	.	.
52	F	1961	69	77
99	F	1962	69	77
16	F	1963	31	12
22	M	1980	16	29
68	F	1984	52	21
72	M	1992	52	97
55	M	1994	52	97
11	F	1995	99	37
20	F	1999	99	37
98	F	2019	68	24

And this is what I want:

id	sex	yob	momid	dadid	child	generation	famid	mom_in	dad_in
77	M	1929	.	.	1	0	1	0	0
69	F	1940	.	.	1	0	1	0	0
52	F	1961	69	77	1	1	1	1	1
99	F	1962	69	77	1	1	1	1	1
16	F	1963	31	12	1	0	2	0	0
22	M	1980	16	29	0	1	2	1	0
68	F	1984	52	21	1	2	1	1	0
72	M	1992	52	97	0	2	1	1	0
55	M	1994	52	97	0	2	1	1	0
11	F	1995	99	37	0	2	1	1	0
20	F	1999	99	37	0	2	1	1	0
98	F	2019	68	24	0	3	1	1	0

The id variables are id, momid, dadid. Sex is M/F, and yob is year of birth. Child identifies individuals in the id column who have a child, generation identifies the generation of the family (starting at 0), famid is a family identification, and mom_in means that a mother is in the id column and dad_in means that a father is in the id column.

Below is what I've tried:

data have;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;


/*Identifies individuals who are mothers or fathers in the dataset (e.g. the id column).*/
data mom;
set have (drop=id);
rename momid=id;
run;
proc sort; by id; run;
data dad;
set have (drop=id);
rename dadid=id;
run;
proc sort; by id; run;

/*moms*/
data a;
merge have (in=a) mom (in=b);
by id;
if a and b;
mom_in=1;
run;
data aa;
set a;
rename id=momid;
run;

/*dads*/
data b;
merge have (in=a) dad (in=b);
by id;
if a and b;
dad_in=1;
run;
data bb;
set b;
rename id=dadid;
run;

proc sort data=have; by momid; run;

data have2;
merge have aa;
by momid;
run;

proc sort data=have2; by dadid; run;

data have3;
merge have2 bb;
by dadid;
run;



/*I don't know how to expand this code to make families based on mothers AND fathers*/ 

/*from is parent, to is child*/
data have;
set have;
rename momid=from;
rename id=to;
run;

/*Family ids*/
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node; output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data want/*(keep=node household)*/;
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;

Ksharp · Posted 08-01-2023 08:24 AM

data x;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;

/*************************/
/*********FamID**********/
data have;
 set x;
from=id;
to=momid;if not missing(to) then output;
to=dadid;if not missing(to) then output;
keep  from  to ;
run;
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node; output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data famid(keep=node household rename=(household=famid));
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;










/*************************/
/*********Other variables**********/
data have;
 set x;
_start=id;
_end=momid; output;
_end=dadid; output;
keep  _end  _start ;
run;
proc sql;
create table min_id as
select *
 from have 
  where _start not in (select _end from have);
quit;
data want(keep=path);
if _n_ eq 1 then do;
length path _path  $ 400 ;
if 0 then set have;
declare hash ha(hashexp:20,dataset:'have(where=(_start is not missing and _end is not missing))',multidata:'y');
ha.definekey('_start');
ha.definedata('_end');
ha.definedone();
declare hash pa(ordered:'y');
declare hiter hi_path('pa');
pa.definekey('n');
pa.definedata('n','path');
pa.definedone();
end;
set min_id;
count=1;n=1;_n=1;
path=catx('|',_start,_end);
pa.add();output;
do while(hi_path.next()=0);
 if n ne 1 then pa.remove(key:_n);_n=n;
 _path=path;   
 _start=scan(path,-1,'|');
 rc=ha.find();  
 do while(rc=0);
  if not findw(path,strip(_end),'|') then do;
   if length(path)+length(_end)+1 gt lengthc(path) then do;
    putlog 'ERROR: The length of path and _path are set too short';
    stop;
   end; 
   count+1;n=count;
   path=catx('|',path,_end);
   pa.add(); output;
   path=_path;
 end; 
  rc=ha.find_next();
end;
end;
pa.clear();
run;

data want1;
if _n_=1 then do;
 if 0 then set x(keep=id);
 declare hash h4(dataset:'x(keep=id)');
 h4.definekey('id');
 h4.definedone();
end;
 set want;
 _id=scan(path,1,'|');
 first=input(scan(path,-1,'|'),best.);
 if h4.check(key:first)=0 then found=1;
 drop id first;
run;
proc sql;
create table want2 as
select * 
 from want1 
  group by _id
   having countw(path,'|')=max(countw(path,'|'));
quit;
data want3;
 set want2;
 n=countw(path,'|');
 do i=1 to n;
   name=scan(path,i,'|');
   if missing(found) then generation=n-i-1;
    else    generation=n-i;
   output;
 end;
 drop i n  path;
run;
data final_want;
if _n_=1 then do;
 if 0 then set want3;
 declare hash h1(dataset:'want3');
 h1.definekey('_id');
 h1.definedone();
 declare hash h2(dataset:'want3');
 h2.definekey('name');
 h2.definedata('generation');
 h2.definedone();
 if 0 then set famid;
 declare hash h3(dataset:'famid');
 h3.definekey('node');
 h3.definedata('famid');
 h3.definedone();
 if 0 then set x;
 declare hash h4(dataset:'x');
 h4.definekey('id');
 h4.definedone();
end;
set x;
_id=put(id,best. -l);
child=ifn(h1.check()=0,0,1);

call missing(generation);
rc=h2.find(key:_id);

call missing(famid);
rc=h3.find(key:id);

if h4.check(key:momid)=0 then mon_in=1;
 else mon_in=0;

if h4.check(key:dadid)=0 then dad_in=1;
 else dad_in=0;
drop _id found name rc node;
run;

View solution in original post

Ksharp · Posted 08-01-2023 08:24 AM

data x;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;

/*************************/
/*********FamID**********/
data have;
 set x;
from=id;
to=momid;if not missing(to) then output;
to=dadid;if not missing(to) then output;
keep  from  to ;
run;
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node; output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data famid(keep=node household rename=(household=famid));
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;










/*************************/
/*********Other variables**********/
data have;
 set x;
_start=id;
_end=momid; output;
_end=dadid; output;
keep  _end  _start ;
run;
proc sql;
create table min_id as
select *
 from have 
  where _start not in (select _end from have);
quit;
data want(keep=path);
if _n_ eq 1 then do;
length path _path  $ 400 ;
if 0 then set have;
declare hash ha(hashexp:20,dataset:'have(where=(_start is not missing and _end is not missing))',multidata:'y');
ha.definekey('_start');
ha.definedata('_end');
ha.definedone();
declare hash pa(ordered:'y');
declare hiter hi_path('pa');
pa.definekey('n');
pa.definedata('n','path');
pa.definedone();
end;
set min_id;
count=1;n=1;_n=1;
path=catx('|',_start,_end);
pa.add();output;
do while(hi_path.next()=0);
 if n ne 1 then pa.remove(key:_n);_n=n;
 _path=path;   
 _start=scan(path,-1,'|');
 rc=ha.find();  
 do while(rc=0);
  if not findw(path,strip(_end),'|') then do;
   if length(path)+length(_end)+1 gt lengthc(path) then do;
    putlog 'ERROR: The length of path and _path are set too short';
    stop;
   end; 
   count+1;n=count;
   path=catx('|',path,_end);
   pa.add(); output;
   path=_path;
 end; 
  rc=ha.find_next();
end;
end;
pa.clear();
run;

data want1;
if _n_=1 then do;
 if 0 then set x(keep=id);
 declare hash h4(dataset:'x(keep=id)');
 h4.definekey('id');
 h4.definedone();
end;
 set want;
 _id=scan(path,1,'|');
 first=input(scan(path,-1,'|'),best.);
 if h4.check(key:first)=0 then found=1;
 drop id first;
run;
proc sql;
create table want2 as
select * 
 from want1 
  group by _id
   having countw(path,'|')=max(countw(path,'|'));
quit;
data want3;
 set want2;
 n=countw(path,'|');
 do i=1 to n;
   name=scan(path,i,'|');
   if missing(found) then generation=n-i-1;
    else    generation=n-i;
   output;
 end;
 drop i n  path;
run;
data final_want;
if _n_=1 then do;
 if 0 then set want3;
 declare hash h1(dataset:'want3');
 h1.definekey('_id');
 h1.definedone();
 declare hash h2(dataset:'want3');
 h2.definekey('name');
 h2.definedata('generation');
 h2.definedone();
 if 0 then set famid;
 declare hash h3(dataset:'famid');
 h3.definekey('node');
 h3.definedata('famid');
 h3.definedone();
 if 0 then set x;
 declare hash h4(dataset:'x');
 h4.definekey('id');
 h4.definedone();
end;
set x;
_id=put(id,best. -l);
child=ifn(h1.check()=0,0,1);

call missing(generation);
rc=h2.find(key:_id);

call missing(famid);
rc=h3.find(key:id);

if h4.check(key:momid)=0 then mon_in=1;
 else mon_in=0;

if h4.check(key:dadid)=0 then dad_in=1;
 else dad_in=0;
drop _id found name rc node;
run;

al15 · Posted 08-01-2023 09:15 AM

@Ksharp, thank you so much for your help!! Your code is brilliant and solves a problem that I've been struggling with for too long. I really appreciate that you took the time to answer my question.

How to identify family linkages within and across generations?

Re: How to identify family linkages within and across generations?

Re: How to identify family linkages within and across generations?

Re: How to identify family linkages within and across generations?

Registration is open

SAS Training: Just a Click Away