I have a dataset that contains information on multiple family generations. I am fortunate that everyone in the dataset has an ID number, and mothers and fathers are identified.
I need to organize the dataset so that I can identify families in which the mother, father and child have information in my dataset (triads and dyads). I need to identify generations of individuals as well.
I have tried using sql, adapting the great code by @Ksharp from the post “identifying sibling structures in dataset” and other steps, but none of these approaches yields what I need.
I am well and truly stuck, and any and all advice is appreciated. I am using SAS Enterprise v7.15.
Here is what I have:
id | sex | yob | momid | dadid |
77 | M | 1929 | . | . |
69 | F | 1940 | . | . |
52 | F | 1961 | 69 | 77 |
99 | F | 1962 | 69 | 77 |
16 | F | 1963 | 31 | 12 |
22 | M | 1980 | 16 | 29 |
68 | F | 1984 | 52 | 21 |
72 | M | 1992 | 52 | 97 |
55 | M | 1994 | 52 | 97 |
11 | F | 1995 | 99 | 37 |
20 | F | 1999 | 99 | 37 |
98 | F | 2019 | 68 | 24 |
And this is what I want:
id | sex | yob | momid | dadid | child | generation | famid | mom_in | dad_in |
77 | M | 1929 | . | . | 1 | 0 | 1 | 0 | 0 |
69 | F | 1940 | . | . | 1 | 0 | 1 | 0 | 0 |
52 | F | 1961 | 69 | 77 | 1 | 1 | 1 | 1 | 1 |
99 | F | 1962 | 69 | 77 | 1 | 1 | 1 | 1 | 1 |
16 | F | 1963 | 31 | 12 | 1 | 0 | 2 | 0 | 0 |
22 | M | 1980 | 16 | 29 | 0 | 1 | 2 | 1 | 0 |
68 | F | 1984 | 52 | 21 | 1 | 2 | 1 | 1 | 0 |
72 | M | 1992 | 52 | 97 | 0 | 2 | 1 | 1 | 0 |
55 | M | 1994 | 52 | 97 | 0 | 2 | 1 | 1 | 0 |
11 | F | 1995 | 99 | 37 | 0 | 2 | 1 | 1 | 0 |
20 | F | 1999 | 99 | 37 | 0 | 2 | 1 | 1 | 0 |
98 | F | 2019 | 68 | 24 | 0 | 3 | 1 | 1 | 0 |
The id variables are id, momid, dadid. Sex is M/F, and yob is year of birth. Child identifies individuals in the id column who have a child, generation identifies the generation of the family (starting at 0), famid is a family identification, and mom_in means that a mother is in the id column and dad_in means that a father is in the id column.
Below is what I've tried:
data have;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;
/*Identifies individuals who are mothers or fathers in the dataset (e.g. the id column).*/
data mom;
set have (drop=id);
rename momid=id;
run;
proc sort; by id; run;
data dad;
set have (drop=id);
rename dadid=id;
run;
proc sort; by id; run;
/*moms*/
data a;
merge have (in=a) mom (in=b);
by id;
if a and b;
mom_in=1;
run;
data aa;
set a;
rename id=momid;
run;
/*dads*/
data b;
merge have (in=a) dad (in=b);
by id;
if a and b;
dad_in=1;
run;
data bb;
set b;
rename id=dadid;
run;
proc sort data=have; by momid; run;
data have2;
merge have aa;
by momid;
run;
proc sort data=have2; by dadid; run;
data have3;
merge have2 bb;
by dadid;
run;
/*I don't know how to expand this code to make families based on mothers AND fathers*/
/*from is parent, to is child*/
data have;
set have;
rename momid=from;
rename id=to;
run;
/*Family ids*/
data full;
set have end=last;
if _n_ eq 1 then do;
declare hash h();
h.definekey('node');
h.definedata('node');
h.definedone();
end;
output;
node=from; h.replace();
from=to; to=node; output;
node=from; h.replace();
if last then h.output(dataset:'node');
drop node;
run;
data want/*(keep=node household)*/;
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();
if 0 then set full;
declare hash from_to(dataset:'full',hashexp:20,multidata:'y');
from_to.definekey('from');
from_to.definedata('to');
from_to.definedone();
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
no.definekey('node');
no.definedata('node');
no.definedone();
do while(hi_no.next()=0);
household+1; output;
count=1;
key=node;_ha.add();
last=node;ha.add();
rc=hi.first();
do while(rc=0);
from=last;rx=from_to.find();
do while(rx=0);
key=to;ry=_ha.check();
if ry ne 0 then do;
node=to;output;rr=no.remove(key:node);
key=to;_ha.add();
count+1;
last=to;ha.add();
end;
rx=from_to.find_next();
end;
rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;
data x;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;
/*************************/
/*********FamID**********/
data have;
set x;
from=id;
to=momid;if not missing(to) then output;
to=dadid;if not missing(to) then output;
keep from to ;
run;
data full;
set have end=last;
if _n_ eq 1 then do;
declare hash h();
h.definekey('node');
h.definedata('node');
h.definedone();
end;
output;
node=from; h.replace();
from=to; to=node; output;
node=from; h.replace();
if last then h.output(dataset:'node');
drop node;
run;
data famid(keep=node household rename=(household=famid));
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();
if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
from_to.definekey('from');
from_to.definedata('to');
from_to.definedone();
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
no.definekey('node');
no.definedata('node');
no.definedone();
do while(hi_no.next()=0);
household+1; output;
count=1;
key=node;_ha.add();
last=node;ha.add();
rc=hi.first();
do while(rc=0);
from=last;rx=from_to.find();
do while(rx=0);
key=to;ry=_ha.check();
if ry ne 0 then do;
node=to;output;rr=no.remove(key:node);
key=to;_ha.add();
count+1;
last=to;ha.add();
end;
rx=from_to.find_next();
end;
rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;
/*************************/
/*********Other variables**********/
data have;
set x;
_start=id;
_end=momid; output;
_end=dadid; output;
keep _end _start ;
run;
proc sql;
create table min_id as
select *
from have
where _start not in (select _end from have);
quit;
data want(keep=path);
if _n_ eq 1 then do;
length path _path $ 400 ;
if 0 then set have;
declare hash ha(hashexp:20,dataset:'have(where=(_start is not missing and _end is not missing))',multidata:'y');
ha.definekey('_start');
ha.definedata('_end');
ha.definedone();
declare hash pa(ordered:'y');
declare hiter hi_path('pa');
pa.definekey('n');
pa.definedata('n','path');
pa.definedone();
end;
set min_id;
count=1;n=1;_n=1;
path=catx('|',_start,_end);
pa.add();output;
do while(hi_path.next()=0);
if n ne 1 then pa.remove(key:_n);_n=n;
_path=path;
_start=scan(path,-1,'|');
rc=ha.find();
do while(rc=0);
if not findw(path,strip(_end),'|') then do;
if length(path)+length(_end)+1 gt lengthc(path) then do;
putlog 'ERROR: The length of path and _path are set too short';
stop;
end;
count+1;n=count;
path=catx('|',path,_end);
pa.add(); output;
path=_path;
end;
rc=ha.find_next();
end;
end;
pa.clear();
run;
data want1;
if _n_=1 then do;
if 0 then set x(keep=id);
declare hash h4(dataset:'x(keep=id)');
h4.definekey('id');
h4.definedone();
end;
set want;
_id=scan(path,1,'|');
first=input(scan(path,-1,'|'),best.);
if h4.check(key:first)=0 then found=1;
drop id first;
run;
proc sql;
create table want2 as
select *
from want1
group by _id
having countw(path,'|')=max(countw(path,'|'));
quit;
data want3;
set want2;
n=countw(path,'|');
do i=1 to n;
name=scan(path,i,'|');
if missing(found) then generation=n-i-1;
else generation=n-i;
output;
end;
drop i n path;
run;
data final_want;
if _n_=1 then do;
if 0 then set want3;
declare hash h1(dataset:'want3');
h1.definekey('_id');
h1.definedone();
declare hash h2(dataset:'want3');
h2.definekey('name');
h2.definedata('generation');
h2.definedone();
if 0 then set famid;
declare hash h3(dataset:'famid');
h3.definekey('node');
h3.definedata('famid');
h3.definedone();
if 0 then set x;
declare hash h4(dataset:'x');
h4.definekey('id');
h4.definedone();
end;
set x;
_id=put(id,best. -l);
child=ifn(h1.check()=0,0,1);
call missing(generation);
rc=h2.find(key:_id);
call missing(famid);
rc=h3.find(key:id);
if h4.check(key:momid)=0 then mon_in=1;
else mon_in=0;
if h4.check(key:dadid)=0 then dad_in=1;
else dad_in=0;
drop _id found name rc node;
run;
data x;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;
/*************************/
/*********FamID**********/
data have;
set x;
from=id;
to=momid;if not missing(to) then output;
to=dadid;if not missing(to) then output;
keep from to ;
run;
data full;
set have end=last;
if _n_ eq 1 then do;
declare hash h();
h.definekey('node');
h.definedata('node');
h.definedone();
end;
output;
node=from; h.replace();
from=to; to=node; output;
node=from; h.replace();
if last then h.output(dataset:'node');
drop node;
run;
data famid(keep=node household rename=(household=famid));
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();
if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
from_to.definekey('from');
from_to.definedata('to');
from_to.definedone();
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
no.definekey('node');
no.definedata('node');
no.definedone();
do while(hi_no.next()=0);
household+1; output;
count=1;
key=node;_ha.add();
last=node;ha.add();
rc=hi.first();
do while(rc=0);
from=last;rx=from_to.find();
do while(rx=0);
key=to;ry=_ha.check();
if ry ne 0 then do;
node=to;output;rr=no.remove(key:node);
key=to;_ha.add();
count+1;
last=to;ha.add();
end;
rx=from_to.find_next();
end;
rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;
/*************************/
/*********Other variables**********/
data have;
set x;
_start=id;
_end=momid; output;
_end=dadid; output;
keep _end _start ;
run;
proc sql;
create table min_id as
select *
from have
where _start not in (select _end from have);
quit;
data want(keep=path);
if _n_ eq 1 then do;
length path _path $ 400 ;
if 0 then set have;
declare hash ha(hashexp:20,dataset:'have(where=(_start is not missing and _end is not missing))',multidata:'y');
ha.definekey('_start');
ha.definedata('_end');
ha.definedone();
declare hash pa(ordered:'y');
declare hiter hi_path('pa');
pa.definekey('n');
pa.definedata('n','path');
pa.definedone();
end;
set min_id;
count=1;n=1;_n=1;
path=catx('|',_start,_end);
pa.add();output;
do while(hi_path.next()=0);
if n ne 1 then pa.remove(key:_n);_n=n;
_path=path;
_start=scan(path,-1,'|');
rc=ha.find();
do while(rc=0);
if not findw(path,strip(_end),'|') then do;
if length(path)+length(_end)+1 gt lengthc(path) then do;
putlog 'ERROR: The length of path and _path are set too short';
stop;
end;
count+1;n=count;
path=catx('|',path,_end);
pa.add(); output;
path=_path;
end;
rc=ha.find_next();
end;
end;
pa.clear();
run;
data want1;
if _n_=1 then do;
if 0 then set x(keep=id);
declare hash h4(dataset:'x(keep=id)');
h4.definekey('id');
h4.definedone();
end;
set want;
_id=scan(path,1,'|');
first=input(scan(path,-1,'|'),best.);
if h4.check(key:first)=0 then found=1;
drop id first;
run;
proc sql;
create table want2 as
select *
from want1
group by _id
having countw(path,'|')=max(countw(path,'|'));
quit;
data want3;
set want2;
n=countw(path,'|');
do i=1 to n;
name=scan(path,i,'|');
if missing(found) then generation=n-i-1;
else generation=n-i;
output;
end;
drop i n path;
run;
data final_want;
if _n_=1 then do;
if 0 then set want3;
declare hash h1(dataset:'want3');
h1.definekey('_id');
h1.definedone();
declare hash h2(dataset:'want3');
h2.definekey('name');
h2.definedata('generation');
h2.definedone();
if 0 then set famid;
declare hash h3(dataset:'famid');
h3.definekey('node');
h3.definedata('famid');
h3.definedone();
if 0 then set x;
declare hash h4(dataset:'x');
h4.definekey('id');
h4.definedone();
end;
set x;
_id=put(id,best. -l);
child=ifn(h1.check()=0,0,1);
call missing(generation);
rc=h2.find(key:_id);
call missing(famid);
rc=h3.find(key:id);
if h4.check(key:momid)=0 then mon_in=1;
else mon_in=0;
if h4.check(key:dadid)=0 then dad_in=1;
else dad_in=0;
drop _id found name rc node;
run;
@Ksharp, thank you so much for your help!! Your code is brilliant and solves a problem that I've been struggling with for too long. I really appreciate that you took the time to answer my question.
Join us for SAS Innovate 2025, our biggest and most exciting global event of the year, in Orlando, FL, from May 6-9. Sign up by March 14 for just $795.
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.