BookmarkSubscribeRSS Feed
☑ This topic is solved. Need further help from the community? Please sign in and ask a new question.
al15
Calcite | Level 5

I have a dataset that contains information on multiple family generations. I am fortunate that everyone in the dataset has an ID number, and mothers and fathers are identified.

 

I need to organize the dataset so that I can identify families in which the mother, father and child have information in my dataset (triads and dyads). I need to identify generations of individuals as well.

 

I have tried using sql, adapting the great code by @Ksharp from the post “identifying sibling structures in dataset” and other steps, but none of these approaches yields what I need.

 

I am well and truly stuck, and any and all advice is appreciated. I am using SAS Enterprise v7.15.

 

Here is what I have:

 

id

sex

yob

momid

dadid

77

M

1929

.

.

69

F

1940

.

.

52

F

1961

69

77

99

F

1962

69

77

16

F

1963

31

12

22

M

1980

16

29

68

F

1984

52

21

72

M

1992

52

97

55

M

1994

52

97

11

F

1995

99

37

20

F

1999

99

37

98

F

2019

68

24

 

And this is what I want:

id

sex

yob

momid

dadid

child

generation

famid

mom_in

dad_in

77

M

1929

.

.

1

0

1

0

0

69

F

1940

.

.

1

0

1

0

0

52

F

1961

69

77

1

1

1

1

1

99

F

1962

69

77

1

1

1

1

1

16

F

1963

31

12

1

0

2

0

0

22

M

1980

16

29

0

1

2

1

0

68

F

1984

52

21

1

2

1

1

0

72

M

1992

52

97

0

2

1

1

0

55

M

1994

52

97

0

2

1

1

0

11

F

1995

99

37

0

2

1

1

0

20

F

1999

99

37

0

2

1

1

0

98

F

2019

68

24

0

3

1

1

0

 

The id variables are id, momid, dadid. Sex is M/F, and yob is year of birth. Child identifies individuals in the id column who have a child, generation identifies the generation of the family (starting at 0), famid is a family identification, and mom_in means that a mother is in the id column and dad_in means that a father is in the id column.

 

Below is what I've tried:

data have;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;


/*Identifies individuals who are mothers or fathers in the dataset (e.g. the id column).*/
data mom;
set have (drop=id);
rename momid=id;
run;
proc sort; by id; run;
data dad;
set have (drop=id);
rename dadid=id;
run;
proc sort; by id; run;

/*moms*/
data a;
merge have (in=a) mom (in=b);
by id;
if a and b;
mom_in=1;
run;
data aa;
set a;
rename id=momid;
run;

/*dads*/
data b;
merge have (in=a) dad (in=b);
by id;
if a and b;
dad_in=1;
run;
data bb;
set b;
rename id=dadid;
run;

proc sort data=have; by momid; run;

data have2;
merge have aa;
by momid;
run;

proc sort data=have2; by dadid; run;

data have3;
merge have2 bb;
by dadid;
run;



/*I don't know how to expand this code to make families based on mothers AND fathers*/ 

/*from is parent, to is child*/
data have;
set have;
rename momid=from;
rename id=to;
run;

/*Family ids*/
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node; output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data want/*(keep=node household)*/;
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;

 

1 ACCEPTED SOLUTION

Accepted Solutions
Ksharp
Super User
data x;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;

/*************************/
/*********FamID**********/
data have;
 set x;
from=id;
to=momid;if not missing(to) then output;
to=dadid;if not missing(to) then output;
keep  from  to ;
run;
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node; output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data famid(keep=node household rename=(household=famid));
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;










/*************************/
/*********Other variables**********/
data have;
 set x;
_start=id;
_end=momid; output;
_end=dadid; output;
keep  _end  _start ;
run;
proc sql;
create table min_id as
select *
 from have 
  where _start not in (select _end from have);
quit;
data want(keep=path);
if _n_ eq 1 then do;
length path _path  $ 400 ;
if 0 then set have;
declare hash ha(hashexp:20,dataset:'have(where=(_start is not missing and _end is not missing))',multidata:'y');
ha.definekey('_start');
ha.definedata('_end');
ha.definedone();
declare hash pa(ordered:'y');
declare hiter hi_path('pa');
pa.definekey('n');
pa.definedata('n','path');
pa.definedone();
end;
set min_id;
count=1;n=1;_n=1;
path=catx('|',_start,_end);
pa.add();output;
do while(hi_path.next()=0);
 if n ne 1 then pa.remove(key:_n);_n=n;
 _path=path;   
 _start=scan(path,-1,'|');
 rc=ha.find();  
 do while(rc=0);
  if not findw(path,strip(_end),'|') then do;
   if length(path)+length(_end)+1 gt lengthc(path) then do;
    putlog 'ERROR: The length of path and _path are set too short';
    stop;
   end; 
   count+1;n=count;
   path=catx('|',path,_end);
   pa.add(); output;
   path=_path;
 end; 
  rc=ha.find_next();
end;
end;
pa.clear();
run;

data want1;
if _n_=1 then do;
 if 0 then set x(keep=id);
 declare hash h4(dataset:'x(keep=id)');
 h4.definekey('id');
 h4.definedone();
end;
 set want;
 _id=scan(path,1,'|');
 first=input(scan(path,-1,'|'),best.);
 if h4.check(key:first)=0 then found=1;
 drop id first;
run;
proc sql;
create table want2 as
select * 
 from want1 
  group by _id
   having countw(path,'|')=max(countw(path,'|'));
quit;
data want3;
 set want2;
 n=countw(path,'|');
 do i=1 to n;
   name=scan(path,i,'|');
   if missing(found) then generation=n-i-1;
    else    generation=n-i;
   output;
 end;
 drop i n  path;
run;
data final_want;
if _n_=1 then do;
 if 0 then set want3;
 declare hash h1(dataset:'want3');
 h1.definekey('_id');
 h1.definedone();
 declare hash h2(dataset:'want3');
 h2.definekey('name');
 h2.definedata('generation');
 h2.definedone();
 if 0 then set famid;
 declare hash h3(dataset:'famid');
 h3.definekey('node');
 h3.definedata('famid');
 h3.definedone();
 if 0 then set x;
 declare hash h4(dataset:'x');
 h4.definekey('id');
 h4.definedone();
end;
set x;
_id=put(id,best. -l);
child=ifn(h1.check()=0,0,1);

call missing(generation);
rc=h2.find(key:_id);

call missing(famid);
rc=h3.find(key:id);

if h4.check(key:momid)=0 then mon_in=1;
 else mon_in=0;

if h4.check(key:dadid)=0 then dad_in=1;
 else dad_in=0;
drop _id found name rc node;
run;

View solution in original post

2 REPLIES 2
Ksharp
Super User
data x;
input id sex $ year momid dadid;
datalines;
77 M 1929 . .
69 F 1940 . .
52 F 1961 69 77
99 F 1962 69 77
16 F 1963 31 12
22 M 1980 16 29
68 F 1984 52 21
72 M 1992 52 97
55 M 1994 52 97
11 F 1995 99 37
20 F 1999 99 37
98 F 2019 68 24
;
run;

/*************************/
/*********FamID**********/
data have;
 set x;
from=id;
to=momid;if not missing(to) then output;
to=dadid;if not missing(to) then output;
keep  from  to ;
run;
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node; output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data famid(keep=node household rename=(household=famid));
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 16);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;










/*************************/
/*********Other variables**********/
data have;
 set x;
_start=id;
_end=momid; output;
_end=dadid; output;
keep  _end  _start ;
run;
proc sql;
create table min_id as
select *
 from have 
  where _start not in (select _end from have);
quit;
data want(keep=path);
if _n_ eq 1 then do;
length path _path  $ 400 ;
if 0 then set have;
declare hash ha(hashexp:20,dataset:'have(where=(_start is not missing and _end is not missing))',multidata:'y');
ha.definekey('_start');
ha.definedata('_end');
ha.definedone();
declare hash pa(ordered:'y');
declare hiter hi_path('pa');
pa.definekey('n');
pa.definedata('n','path');
pa.definedone();
end;
set min_id;
count=1;n=1;_n=1;
path=catx('|',_start,_end);
pa.add();output;
do while(hi_path.next()=0);
 if n ne 1 then pa.remove(key:_n);_n=n;
 _path=path;   
 _start=scan(path,-1,'|');
 rc=ha.find();  
 do while(rc=0);
  if not findw(path,strip(_end),'|') then do;
   if length(path)+length(_end)+1 gt lengthc(path) then do;
    putlog 'ERROR: The length of path and _path are set too short';
    stop;
   end; 
   count+1;n=count;
   path=catx('|',path,_end);
   pa.add(); output;
   path=_path;
 end; 
  rc=ha.find_next();
end;
end;
pa.clear();
run;

data want1;
if _n_=1 then do;
 if 0 then set x(keep=id);
 declare hash h4(dataset:'x(keep=id)');
 h4.definekey('id');
 h4.definedone();
end;
 set want;
 _id=scan(path,1,'|');
 first=input(scan(path,-1,'|'),best.);
 if h4.check(key:first)=0 then found=1;
 drop id first;
run;
proc sql;
create table want2 as
select * 
 from want1 
  group by _id
   having countw(path,'|')=max(countw(path,'|'));
quit;
data want3;
 set want2;
 n=countw(path,'|');
 do i=1 to n;
   name=scan(path,i,'|');
   if missing(found) then generation=n-i-1;
    else    generation=n-i;
   output;
 end;
 drop i n  path;
run;
data final_want;
if _n_=1 then do;
 if 0 then set want3;
 declare hash h1(dataset:'want3');
 h1.definekey('_id');
 h1.definedone();
 declare hash h2(dataset:'want3');
 h2.definekey('name');
 h2.definedata('generation');
 h2.definedone();
 if 0 then set famid;
 declare hash h3(dataset:'famid');
 h3.definekey('node');
 h3.definedata('famid');
 h3.definedone();
 if 0 then set x;
 declare hash h4(dataset:'x');
 h4.definekey('id');
 h4.definedone();
end;
set x;
_id=put(id,best. -l);
child=ifn(h1.check()=0,0,1);

call missing(generation);
rc=h2.find(key:_id);

call missing(famid);
rc=h3.find(key:id);

if h4.check(key:momid)=0 then mon_in=1;
 else mon_in=0;

if h4.check(key:dadid)=0 then dad_in=1;
 else dad_in=0;
drop _id found name rc node;
run;
al15
Calcite | Level 5

@Ksharp, thank you so much for your help!! Your code is brilliant and solves a problem that I've been struggling with for too long. I really appreciate that you took the time to answer my question.

SAS Innovate 2025: Call for Content

Are you ready for the spotlight? We're accepting content ideas for SAS Innovate 2025 to be held May 6-9 in Orlando, FL. The call is open until September 16. Read more here about why you should contribute and what is in it for you!

Submit your idea!

How to Concatenate Values

Learn how use the CAT functions in SAS to join values from multiple variables into a single value.

Find more tutorials on the SAS Users YouTube channel.

Click image to register for webinarClick image to register for webinar

Classroom Training Available!

Select SAS Training centers are offering in-person courses. View upcoming courses for:

View all other training opportunities.

Discussion stats
  • 2 replies
  • 632 views
  • 6 likes
  • 2 in conversation