I have a dataset that contains information on multiple family generations. I am fortunate that everyone in the dataset has an ID number, and mothers and fathers are identified. I need to organize the dataset so that I can identify families in which the mother, father and child have information in my dataset (triads and dyads). I need to identify generations of individuals as well. I have tried using sql, adapting the great code by @Ksharp from the post “identifying sibling structures in dataset” and other steps, but none of these approaches yields what I need. I am well and truly stuck, and any and all advice is appreciated. I am using SAS Enterprise v7.15. Here is what I have: id sex yob momid dadid 77 M 1929 . . 69 F 1940 . . 52 F 1961 69 77 99 F 1962 69 77 16 F 1963 31 12 22 M 1980 16 29 68 F 1984 52 21 72 M 1992 52 97 55 M 1994 52 97 11 F 1995 99 37 20 F 1999 99 37 98 F 2019 68 24 And this is what I want: id sex yob momid dadid child generation famid mom_in dad_in 77 M 1929 . . 1 0 1 0 0 69 F 1940 . . 1 0 1 0 0 52 F 1961 69 77 1 1 1 1 1 99 F 1962 69 77 1 1 1 1 1 16 F 1963 31 12 1 0 2 0 0 22 M 1980 16 29 0 1 2 1 0 68 F 1984 52 21 1 2 1 1 0 72 M 1992 52 97 0 2 1 1 0 55 M 1994 52 97 0 2 1 1 0 11 F 1995 99 37 0 2 1 1 0 20 F 1999 99 37 0 2 1 1 0 98 F 2019 68 24 0 3 1 1 0 The id variables are id, momid, dadid. Sex is M/F, and yob is year of birth. Child identifies individuals in the id column who have a child, generation identifies the generation of the family (starting at 0), famid is a family identification, and mom_in means that a mother is in the id column and dad_in means that a father is in the id column. Below is what I've tried: data have;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;
/*Identifies individuals who are mothers or fathers in the dataset (e.g. the id column).*/
data mom;
set have (drop=id);
rename momid=id;
run;
proc sort; by id; run;
data dad;
set have (drop=id);
rename dadid=id;
run;
proc sort; by id; run;
/*moms*/
data a;
merge have (in=a) mom (in=b);
by id;
if a and b;
mom_in=1;
run;
data aa;
set a;
rename id=momid;
run;
/*dads*/
data b;
merge have (in=a) dad (in=b);
by id;
if a and b;
dad_in=1;
run;
data bb;
set b;
rename id=dadid;
run;
proc sort data=have; by momid; run;
data have2;
merge have aa;
by momid;
run;
proc sort data=have2; by dadid; run;
data have3;
merge have2 bb;
by dadid;
run;
/*I don't know how to expand this code to make families based on mothers AND fathers*/
/*from is parent, to is child*/
data have;
set have;
rename momid=from;
rename id=to;
run;
/*Family ids*/
data full;
set have end=last;
if _n_ eq 1 then do;
declare hash h();
h.definekey('node');
h.definedata('node');
h.definedone();
end;
output;
node=from; h.replace();
from=to; to=node; output;
node=from; h.replace();
if last then h.output(dataset:'node');
drop node;
run;
data want/*(keep=node household)*/;
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();
if 0 then set full;
declare hash from_to(dataset:'full',hashexp:20,multidata:'y');
from_to.definekey('from');
from_to.definedata('to');
from_to.definedone();
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
no.definekey('node');
no.definedata('node');
no.definedone();
do while(hi_no.next()=0);
household+1; output;
count=1;
key=node;_ha.add();
last=node;ha.add();
rc=hi.first();
do while(rc=0);
from=last;rx=from_to.find();
do while(rx=0);
key=to;ry=_ha.check();
if ry ne 0 then do;
node=to;output;rr=no.remove(key:node);
key=to;_ha.add();
count+1;
last=to;ha.add();
end;
rx=from_to.find_next();
end;
rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;
... View more