BookmarkSubscribeRSS Feed
☑ This topic is solved. Need further help from the community? Please sign in and ask a new question.
al15
Calcite | Level 5

I have a dataset that contains information on multiple family generations. I am fortunate that everyone in the dataset has an ID number, and mothers and fathers are identified.

 

I need to organize the dataset so that I can identify families in which the mother, father and child have information in my dataset (triads and dyads). I need to identify generations of individuals as well.

 

I have tried using sql, adapting the great code by @Ksharp from the post “identifying sibling structures in dataset” and other steps, but none of these approaches yields what I need.

 

I am well and truly stuck, and any and all advice is appreciated. I am using SAS Enterprise v7.15.

 

Here is what I have:

 

id

sex

yob

momid

dadid

77

M

1929

.

.

69

F

1940

.

.

52

F

1961

69

77

99

F

1962

69

77

16

F

1963

31

12

22

M

1980

16

29

68

F

1984

52

21

72

M

1992

52

97

55

M

1994

52

97

11

F

1995

99

37

20

F

1999

99

37

98

F

2019

68

24

 

And this is what I want:

id

sex

yob

momid

dadid

child

generation

famid

mom_in

dad_in

77

M

1929

.

.

1

0

1

0

0

69

F

1940

.

.

1

0

1

0

0

52

F

1961

69

77

1

1

1

1

1

99

F

1962

69

77

1

1

1

1

1

16

F

1963

31

12

1

0

2

0

0

22

M

1980

16

29

0

1

2

1

0

68

F

1984

52

21

1

2

1

1

0

72

M

1992

52

97

0

2

1

1

0

55

M

1994

52

97

0

2

1

1

0

11

F

1995

99

37

0

2

1

1

0

20

F

1999

99

37

0

2

1

1

0

98

F

2019

68

24

0

3

1

1

0

 

The id variables are id, momid, dadid. Sex is M/F, and yob is year of birth. Child identifies individuals in the id column who have a child, generation identifies the generation of the family (starting at 0), famid is a family identification, and mom_in means that a mother is in the id column and dad_in means that a father is in the id column.

 

Below is what I've tried:

data have;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;


/*Identifies individuals who are mothers or fathers in the dataset (e.g. the id column).*/
data mom;
set have (drop=id);
rename momid=id;
run;
proc sort; by id; run;
data dad;
set have (drop=id);
rename dadid=id;
run;
proc sort; by id; run;

/*moms*/
data a;
merge have (in=a) mom (in=b);
by id;
if a and b;
mom_in=1;
run;
data aa;
set a;
rename id=momid;
run;

/*dads*/
data b;
merge have (in=a) dad (in=b);
by id;
if a and b;
dad_in=1;
run;
data bb;
set b;
rename id=dadid;
run;

proc sort data=have; by momid; run;

data have2;
merge have aa;
by momid;
run;

proc sort data=have2; by dadid; run;

data have3;
merge have2 bb;
by dadid;
run;



/*I don't know how to expand this code to make families based on mothers AND fathers*/ 

/*from is parent, to is child*/
data have;
set have;
rename momid=from;
rename id=to;
run;

/*Family ids*/
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node; output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data want/*(keep=node household)*/;
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;

 

1 ACCEPTED SOLUTION

Accepted Solutions
Ksharp
Super User
data x;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;

/*************************/
/*********FamID**********/
data have;
 set x;
from=id;
to=momid;if not missing(to) then output;
to=dadid;if not missing(to) then output;
keep  from  to ;
run;
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node; output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data famid(keep=node household rename=(household=famid));
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;










/*************************/
/*********Other variables**********/
data have;
 set x;
_start=id;
_end=momid; output;
_end=dadid; output;
keep  _end  _start ;
run;
proc sql;
create table min_id as
select *
 from have 
  where _start not in (select _end from have);
quit;
data want(keep=path);
if _n_ eq 1 then do;
length path _path  $ 400 ;
if 0 then set have;
declare hash ha(hashexp:20,dataset:'have(where=(_start is not missing and _end is not missing))',multidata:'y');
ha.definekey('_start');
ha.definedata('_end');
ha.definedone();
declare hash pa(ordered:'y');
declare hiter hi_path('pa');
pa.definekey('n');
pa.definedata('n','path');
pa.definedone();
end;
set min_id;
count=1;n=1;_n=1;
path=catx('|',_start,_end);
pa.add();output;
do while(hi_path.next()=0);
 if n ne 1 then pa.remove(key:_n);_n=n;
 _path=path;   
 _start=scan(path,-1,'|');
 rc=ha.find();  
 do while(rc=0);
  if not findw(path,strip(_end),'|') then do;
   if length(path)+length(_end)+1 gt lengthc(path) then do;
    putlog 'ERROR: The length of path and _path are set too short';
    stop;
   end; 
   count+1;n=count;
   path=catx('|',path,_end);
   pa.add(); output;
   path=_path;
 end; 
  rc=ha.find_next();
end;
end;
pa.clear();
run;

data want1;
if _n_=1 then do;
 if 0 then set x(keep=id);
 declare hash h4(dataset:'x(keep=id)');
 h4.definekey('id');
 h4.definedone();
end;
 set want;
 _id=scan(path,1,'|');
 first=input(scan(path,-1,'|'),best.);
 if h4.check(key:first)=0 then found=1;
 drop id first;
run;
proc sql;
create table want2 as
select * 
 from want1 
  group by _id
   having countw(path,'|')=max(countw(path,'|'));
quit;
data want3;
 set want2;
 n=countw(path,'|');
 do i=1 to n;
   name=scan(path,i,'|');
   if missing(found) then generation=n-i-1;
    else    generation=n-i;
   output;
 end;
 drop i n  path;
run;
data final_want;
if _n_=1 then do;
 if 0 then set want3;
 declare hash h1(dataset:'want3');
 h1.definekey('_id');
 h1.definedone();
 declare hash h2(dataset:'want3');
 h2.definekey('name');
 h2.definedata('generation');
 h2.definedone();
 if 0 then set famid;
 declare hash h3(dataset:'famid');
 h3.definekey('node');
 h3.definedata('famid');
 h3.definedone();
 if 0 then set x;
 declare hash h4(dataset:'x');
 h4.definekey('id');
 h4.definedone();
end;
set x;
_id=put(id,best. -l);
child=ifn(h1.check()=0,0,1);

call missing(generation);
rc=h2.find(key:_id);

call missing(famid);
rc=h3.find(key:id);

if h4.check(key:momid)=0 then mon_in=1;
 else mon_in=0;

if h4.check(key:dadid)=0 then dad_in=1;
 else dad_in=0;
drop _id found name rc node;
run;

View solution in original post

2 REPLIES 2
Ksharp
Super User
data x;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;

/*************************/
/*********FamID**********/
data have;
 set x;
from=id;
to=momid;if not missing(to) then output;
to=dadid;if not missing(to) then output;
keep  from  to ;
run;
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node; output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data famid(keep=node household rename=(household=famid));
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;










/*************************/
/*********Other variables**********/
data have;
 set x;
_start=id;
_end=momid; output;
_end=dadid; output;
keep  _end  _start ;
run;
proc sql;
create table min_id as
select *
 from have 
  where _start not in (select _end from have);
quit;
data want(keep=path);
if _n_ eq 1 then do;
length path _path  $ 400 ;
if 0 then set have;
declare hash ha(hashexp:20,dataset:'have(where=(_start is not missing and _end is not missing))',multidata:'y');
ha.definekey('_start');
ha.definedata('_end');
ha.definedone();
declare hash pa(ordered:'y');
declare hiter hi_path('pa');
pa.definekey('n');
pa.definedata('n','path');
pa.definedone();
end;
set min_id;
count=1;n=1;_n=1;
path=catx('|',_start,_end);
pa.add();output;
do while(hi_path.next()=0);
 if n ne 1 then pa.remove(key:_n);_n=n;
 _path=path;   
 _start=scan(path,-1,'|');
 rc=ha.find();  
 do while(rc=0);
  if not findw(path,strip(_end),'|') then do;
   if length(path)+length(_end)+1 gt lengthc(path) then do;
    putlog 'ERROR: The length of path and _path are set too short';
    stop;
   end; 
   count+1;n=count;
   path=catx('|',path,_end);
   pa.add(); output;
   path=_path;
 end; 
  rc=ha.find_next();
end;
end;
pa.clear();
run;

data want1;
if _n_=1 then do;
 if 0 then set x(keep=id);
 declare hash h4(dataset:'x(keep=id)');
 h4.definekey('id');
 h4.definedone();
end;
 set want;
 _id=scan(path,1,'|');
 first=input(scan(path,-1,'|'),best.);
 if h4.check(key:first)=0 then found=1;
 drop id first;
run;
proc sql;
create table want2 as
select * 
 from want1 
  group by _id
   having countw(path,'|')=max(countw(path,'|'));
quit;
data want3;
 set want2;
 n=countw(path,'|');
 do i=1 to n;
   name=scan(path,i,'|');
   if missing(found) then generation=n-i-1;
    else    generation=n-i;
   output;
 end;
 drop i n  path;
run;
data final_want;
if _n_=1 then do;
 if 0 then set want3;
 declare hash h1(dataset:'want3');
 h1.definekey('_id');
 h1.definedone();
 declare hash h2(dataset:'want3');
 h2.definekey('name');
 h2.definedata('generation');
 h2.definedone();
 if 0 then set famid;
 declare hash h3(dataset:'famid');
 h3.definekey('node');
 h3.definedata('famid');
 h3.definedone();
 if 0 then set x;
 declare hash h4(dataset:'x');
 h4.definekey('id');
 h4.definedone();
end;
set x;
_id=put(id,best. -l);
child=ifn(h1.check()=0,0,1);

call missing(generation);
rc=h2.find(key:_id);

call missing(famid);
rc=h3.find(key:id);

if h4.check(key:momid)=0 then mon_in=1;
 else mon_in=0;

if h4.check(key:dadid)=0 then dad_in=1;
 else dad_in=0;
drop _id found name rc node;
run;
al15
Calcite | Level 5

@Ksharp, thank you so much for your help!! Your code is brilliant and solves a problem that I've been struggling with for too long. I really appreciate that you took the time to answer my question.

SAS Innovate 2025: Register Now

Registration is now open for SAS Innovate 2025 , our biggest and most exciting global event of the year! Join us in Orlando, FL, May 6-9.
Sign up by Dec. 31 to get the 2024 rate of just $495.
Register now!

How to Concatenate Values

Learn how use the CAT functions in SAS to join values from multiple variables into a single value.

Find more tutorials on the SAS Users YouTube channel.

SAS Training: Just a Click Away

 Ready to level-up your skills? Choose your own adventure.

Browse our catalog!

Discussion stats
  • 2 replies
  • 824 views
  • 6 likes
  • 2 in conversation