I have a dataset that contains information on multiple family generations. I am fortunate that everyone in the dataset has an ID number, and mothers and fathers are identified.
I need to organize the dataset so that I can identify families in which the mother, father and child have information in my dataset (triads and dyads). I need to identify generations of individuals as well.
I have tried using sql, adapting the great code by @Ksharp from the post “identifying sibling structures in dataset” and other steps, but none of these approaches yields what I need.
I am well and truly stuck, and any and all advice is appreciated. I am using SAS Enterprise v7.15.
Here is what I have:
id | sex | yob | momid | dadid |
77 | M | 1929 | . | . |
69 | F | 1940 | . | . |
52 | F | 1961 | 69 | 77 |
99 | F | 1962 | 69 | 77 |
16 | F | 1963 | 31 | 12 |
22 | M | 1980 | 16 | 29 |
68 | F | 1984 | 52 | 21 |
72 | M | 1992 | 52 | 97 |
55 | M | 1994 | 52 | 97 |
11 | F | 1995 | 99 | 37 |
20 | F | 1999 | 99 | 37 |
98 | F | 2019 | 68 | 24 |
And this is what I want:
id | sex | yob | momid | dadid | child | generation | famid | mom_in | dad_in |
77 | M | 1929 | . | . | 1 | 0 | 1 | 0 | 0 |
69 | F | 1940 | . | . | 1 | 0 | 1 | 0 | 0 |
52 | F | 1961 | 69 | 77 | 1 | 1 | 1 | 1 | 1 |
99 | F | 1962 | 69 | 77 | 1 | 1 | 1 | 1 | 1 |
16 | F | 1963 | 31 | 12 | 1 | 0 | 2 | 0 | 0 |
22 | M | 1980 | 16 | 29 | 0 | 1 | 2 | 1 | 0 |
68 | F | 1984 | 52 | 21 | 1 | 2 | 1 | 1 | 0 |
72 | M | 1992 | 52 | 97 | 0 | 2 | 1 | 1 | 0 |
55 | M | 1994 | 52 | 97 | 0 | 2 | 1 | 1 | 0 |
11 | F | 1995 | 99 | 37 | 0 | 2 | 1 | 1 | 0 |
20 | F | 1999 | 99 | 37 | 0 | 2 | 1 | 1 | 0 |
98 | F | 2019 | 68 | 24 | 0 | 3 | 1 | 1 | 0 |
The id variables are id, momid, dadid. Sex is M/F, and yob is year of birth. Child identifies individuals in the id column who have a child, generation identifies the generation of the family (starting at 0), famid is a family identification, and mom_in means that a mother is in the id column and dad_in means that a father is in the id column.
Below is what I've tried:
data have;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;
/*Identifies individuals who are mothers or fathers in the dataset (e.g. the id column).*/
data mom;
set have (drop=id);
rename momid=id;
run;
proc sort; by id; run;
data dad;
set have (drop=id);
rename dadid=id;
run;
proc sort; by id; run;
/*moms*/
data a;
merge have (in=a) mom (in=b);
by id;
if a and b;
mom_in=1;
run;
data aa;
set a;
rename id=momid;
run;
/*dads*/
data b;
merge have (in=a) dad (in=b);
by id;
if a and b;
dad_in=1;
run;
data bb;
set b;
rename id=dadid;
run;
proc sort data=have; by momid; run;
data have2;
merge have aa;
by momid;
run;
proc sort data=have2; by dadid; run;
data have3;
merge have2 bb;
by dadid;
run;
/*I don't know how to expand this code to make families based on mothers AND fathers*/
/*from is parent, to is child*/
data have;
set have;
rename momid=from;
rename id=to;
run;
/*Family ids*/
data full;
set have end=last;
if _n_ eq 1 then do;
declare hash h();
h.definekey('node');
h.definedata('node');
h.definedone();
end;
output;
node=from; h.replace();
from=to; to=node; output;
node=from; h.replace();
if last then h.output(dataset:'node');
drop node;
run;
data want/*(keep=node household)*/;
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();
if 0 then set full;
declare hash from_to(dataset:'full',hashexp:20,multidata:'y');
from_to.definekey('from');
from_to.definedata('to');
from_to.definedone();
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
no.definekey('node');
no.definedata('node');
no.definedone();
do while(hi_no.next()=0);
household+1; output;
count=1;
key=node;_ha.add();
last=node;ha.add();
rc=hi.first();
do while(rc=0);
from=last;rx=from_to.find();
do while(rx=0);
key=to;ry=_ha.check();
if ry ne 0 then do;
node=to;output;rr=no.remove(key:node);
key=to;_ha.add();
count+1;
last=to;ha.add();
end;
rx=from_to.find_next();
end;
rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;
data x;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;
/*************************/
/*********FamID**********/
data have;
set x;
from=id;
to=momid;if not missing(to) then output;
to=dadid;if not missing(to) then output;
keep from to ;
run;
data full;
set have end=last;
if _n_ eq 1 then do;
declare hash h();
h.definekey('node');
h.definedata('node');
h.definedone();
end;
output;
node=from; h.replace();
from=to; to=node; output;
node=from; h.replace();
if last then h.output(dataset:'node');
drop node;
run;
data famid(keep=node household rename=(household=famid));
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();
if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
from_to.definekey('from');
from_to.definedata('to');
from_to.definedone();
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
no.definekey('node');
no.definedata('node');
no.definedone();
do while(hi_no.next()=0);
household+1; output;
count=1;
key=node;_ha.add();
last=node;ha.add();
rc=hi.first();
do while(rc=0);
from=last;rx=from_to.find();
do while(rx=0);
key=to;ry=_ha.check();
if ry ne 0 then do;
node=to;output;rr=no.remove(key:node);
key=to;_ha.add();
count+1;
last=to;ha.add();
end;
rx=from_to.find_next();
end;
rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;
/*************************/
/*********Other variables**********/
data have;
set x;
_start=id;
_end=momid; output;
_end=dadid; output;
keep _end _start ;
run;
proc sql;
create table min_id as
select *
from have
where _start not in (select _end from have);
quit;
data want(keep=path);
if _n_ eq 1 then do;
length path _path $ 400 ;
if 0 then set have;
declare hash ha(hashexp:20,dataset:'have(where=(_start is not missing and _end is not missing))',multidata:'y');
ha.definekey('_start');
ha.definedata('_end');
ha.definedone();
declare hash pa(ordered:'y');
declare hiter hi_path('pa');
pa.definekey('n');
pa.definedata('n','path');
pa.definedone();
end;
set min_id;
count=1;n=1;_n=1;
path=catx('|',_start,_end);
pa.add();output;
do while(hi_path.next()=0);
if n ne 1 then pa.remove(key:_n);_n=n;
_path=path;
_start=scan(path,-1,'|');
rc=ha.find();
do while(rc=0);
if not findw(path,strip(_end),'|') then do;
if length(path)+length(_end)+1 gt lengthc(path) then do;
putlog 'ERROR: The length of path and _path are set too short';
stop;
end;
count+1;n=count;
path=catx('|',path,_end);
pa.add(); output;
path=_path;
end;
rc=ha.find_next();
end;
end;
pa.clear();
run;
data want1;
if _n_=1 then do;
if 0 then set x(keep=id);
declare hash h4(dataset:'x(keep=id)');
h4.definekey('id');
h4.definedone();
end;
set want;
_id=scan(path,1,'|');
first=input(scan(path,-1,'|'),best.);
if h4.check(key:first)=0 then found=1;
drop id first;
run;
proc sql;
create table want2 as
select *
from want1
group by _id
having countw(path,'|')=max(countw(path,'|'));
quit;
data want3;
set want2;
n=countw(path,'|');
do i=1 to n;
name=scan(path,i,'|');
if missing(found) then generation=n-i-1;
else generation=n-i;
output;
end;
drop i n path;
run;
data final_want;
if _n_=1 then do;
if 0 then set want3;
declare hash h1(dataset:'want3');
h1.definekey('_id');
h1.definedone();
declare hash h2(dataset:'want3');
h2.definekey('name');
h2.definedata('generation');
h2.definedone();
if 0 then set famid;
declare hash h3(dataset:'famid');
h3.definekey('node');
h3.definedata('famid');
h3.definedone();
if 0 then set x;
declare hash h4(dataset:'x');
h4.definekey('id');
h4.definedone();
end;
set x;
_id=put(id,best. -l);
child=ifn(h1.check()=0,0,1);
call missing(generation);
rc=h2.find(key:_id);
call missing(famid);
rc=h3.find(key:id);
if h4.check(key:momid)=0 then mon_in=1;
else mon_in=0;
if h4.check(key:dadid)=0 then dad_in=1;
else dad_in=0;
drop _id found name rc node;
run;
data x;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;
/*************************/
/*********FamID**********/
data have;
set x;
from=id;
to=momid;if not missing(to) then output;
to=dadid;if not missing(to) then output;
keep from to ;
run;
data full;
set have end=last;
if _n_ eq 1 then do;
declare hash h();
h.definekey('node');
h.definedata('node');
h.definedone();
end;
output;
node=from; h.replace();
from=to; to=node; output;
node=from; h.replace();
if last then h.output(dataset:'node');
drop node;
run;
data famid(keep=node household rename=(household=famid));
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();
if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
from_to.definekey('from');
from_to.definedata('to');
from_to.definedone();
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
no.definekey('node');
no.definedata('node');
no.definedone();
do while(hi_no.next()=0);
household+1; output;
count=1;
key=node;_ha.add();
last=node;ha.add();
rc=hi.first();
do while(rc=0);
from=last;rx=from_to.find();
do while(rx=0);
key=to;ry=_ha.check();
if ry ne 0 then do;
node=to;output;rr=no.remove(key:node);
key=to;_ha.add();
count+1;
last=to;ha.add();
end;
rx=from_to.find_next();
end;
rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;
/*************************/
/*********Other variables**********/
data have;
set x;
_start=id;
_end=momid; output;
_end=dadid; output;
keep _end _start ;
run;
proc sql;
create table min_id as
select *
from have
where _start not in (select _end from have);
quit;
data want(keep=path);
if _n_ eq 1 then do;
length path _path $ 400 ;
if 0 then set have;
declare hash ha(hashexp:20,dataset:'have(where=(_start is not missing and _end is not missing))',multidata:'y');
ha.definekey('_start');
ha.definedata('_end');
ha.definedone();
declare hash pa(ordered:'y');
declare hiter hi_path('pa');
pa.definekey('n');
pa.definedata('n','path');
pa.definedone();
end;
set min_id;
count=1;n=1;_n=1;
path=catx('|',_start,_end);
pa.add();output;
do while(hi_path.next()=0);
if n ne 1 then pa.remove(key:_n);_n=n;
_path=path;
_start=scan(path,-1,'|');
rc=ha.find();
do while(rc=0);
if not findw(path,strip(_end),'|') then do;
if length(path)+length(_end)+1 gt lengthc(path) then do;
putlog 'ERROR: The length of path and _path are set too short';
stop;
end;
count+1;n=count;
path=catx('|',path,_end);
pa.add(); output;
path=_path;
end;
rc=ha.find_next();
end;
end;
pa.clear();
run;
data want1;
if _n_=1 then do;
if 0 then set x(keep=id);
declare hash h4(dataset:'x(keep=id)');
h4.definekey('id');
h4.definedone();
end;
set want;
_id=scan(path,1,'|');
first=input(scan(path,-1,'|'),best.);
if h4.check(key:first)=0 then found=1;
drop id first;
run;
proc sql;
create table want2 as
select *
from want1
group by _id
having countw(path,'|')=max(countw(path,'|'));
quit;
data want3;
set want2;
n=countw(path,'|');
do i=1 to n;
name=scan(path,i,'|');
if missing(found) then generation=n-i-1;
else generation=n-i;
output;
end;
drop i n path;
run;
data final_want;
if _n_=1 then do;
if 0 then set want3;
declare hash h1(dataset:'want3');
h1.definekey('_id');
h1.definedone();
declare hash h2(dataset:'want3');
h2.definekey('name');
h2.definedata('generation');
h2.definedone();
if 0 then set famid;
declare hash h3(dataset:'famid');
h3.definekey('node');
h3.definedata('famid');
h3.definedone();
if 0 then set x;
declare hash h4(dataset:'x');
h4.definekey('id');
h4.definedone();
end;
set x;
_id=put(id,best. -l);
child=ifn(h1.check()=0,0,1);
call missing(generation);
rc=h2.find(key:_id);
call missing(famid);
rc=h3.find(key:id);
if h4.check(key:momid)=0 then mon_in=1;
else mon_in=0;
if h4.check(key:dadid)=0 then dad_in=1;
else dad_in=0;
drop _id found name rc node;
run;
@Ksharp, thank you so much for your help!! Your code is brilliant and solves a problem that I've been struggling with for too long. I really appreciate that you took the time to answer my question.
Registration is now open for SAS Innovate 2025 , our biggest and most exciting global event of the year! Join us in Orlando, FL, May 6-9.
Sign up by Dec. 31 to get the 2024 rate of just $495.
Register now!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.