Solved: Grouping cases by shared documents (direct and indirect)

_KoAn_ · Posted 11-15-2021 12:07 PM

Hi all,

Within Enterprise Guide 8 I have a dataset with DocumentID's and CaseID's. A document can occur within multiple cases. I would like to group all cases when they are connected by one or more documents and give them a CaseGroupID. Also cases which are indirectly connected (by another case) should be all grouped to one. This is usefull so that, when I look at a specific case I can easily see which other cases make (partial) use of the same documents.

In the example below you can see how seven different documents occurs within nine different cases. In this short example by hand I identified three casegroups. How can I program this in SAS Enterprise Guide so that the CaseGroupID is determined by SAS?

Thank you for your help!

andreas_lds · Posted 11-16-2021 02:15 AM

This is, most likely, not the most efficient solution, but it seems to work with the "data" you have provided. Please post data in usable.

data have;
   input DocumentID $ CaseID $;
   datalines;
DC001 CA001
DC001 CA002
DC001 CA003
DC002 CA001
DC002 CA002
DC003 CA001
DC003 CA002
DC003 CA004
DC004 CA005
DC004 CA006
DC005 CA005
DC005 CA006
DC006 CA007
DC006 CA008
DC007 CA002
DC007 CA009
;
run;


proc sort data= work.have;
   by DocumentID;
run;

data want;
   set have;
   by DocumentID;
   
   length 
      CaseGroupID 8 
      NextGroupID 8  /* next ID to use, for the first case in a document, if the case has not been assigned a group before */
      MissingCaseID 8 /* ID to use if CaseID is not in the hash and obs is not the first of a document */
      used 8 /* flag set to 1 if the value of NextGroupId was used to form a new CaseGroup */
   ;
   
   retain NextGroupID MissingCaseID used;
   format CaseGroupId z3.;
   
   if _n_ = 1 then do;
      /* using a hash object to remember CaseIDs that are already in a group */
      declare hash h();
      h.defineKey('CaseID');
      h.defineData('CaseGroupID');
      h.defineDone();
           
      NextGroupID = 1;
   end;
   
   if first.DocumentID then do;
      MissingCaseID = NextGroupID;
      used = 0;
   end;
   
   if h.find() ^= 0 then do;
      /* CaseID is not in the hash-object */
      if first.DocumentId then do;
         CaseGroupID = NextGroupID;
         used = 1;
      end;
      else do;
         CaseGroupID = MissingCaseID;
      end;
      /* Add CaseID / CaseGroupID to the hash */
      h.add();
   end;
   else do;
      /* CaseID is found in the hash, set MissingCaseID to the CaseGroupID to group
       * other cases of the same document. */
      if first.DocumentID then do;
         MissingCaseID = CaseGroupID;
      end;
   end;

   if last.DocumentID and used then do;
      NextGroupID = NextGroupID + 1;
   end;
run;

View solution in original post

andreas_lds · Posted 11-16-2021 02:15 AM

This is, most likely, not the most efficient solution, but it seems to work with the "data" you have provided. Please post data in usable.

data have;
   input DocumentID $ CaseID $;
   datalines;
DC001 CA001
DC001 CA002
DC001 CA003
DC002 CA001
DC002 CA002
DC003 CA001
DC003 CA002
DC003 CA004
DC004 CA005
DC004 CA006
DC005 CA005
DC005 CA006
DC006 CA007
DC006 CA008
DC007 CA002
DC007 CA009
;
run;


proc sort data= work.have;
   by DocumentID;
run;

data want;
   set have;
   by DocumentID;
   
   length 
      CaseGroupID 8 
      NextGroupID 8  /* next ID to use, for the first case in a document, if the case has not been assigned a group before */
      MissingCaseID 8 /* ID to use if CaseID is not in the hash and obs is not the first of a document */
      used 8 /* flag set to 1 if the value of NextGroupId was used to form a new CaseGroup */
   ;
   
   retain NextGroupID MissingCaseID used;
   format CaseGroupId z3.;
   
   if _n_ = 1 then do;
      /* using a hash object to remember CaseIDs that are already in a group */
      declare hash h();
      h.defineKey('CaseID');
      h.defineData('CaseGroupID');
      h.defineDone();
           
      NextGroupID = 1;
   end;
   
   if first.DocumentID then do;
      MissingCaseID = NextGroupID;
      used = 0;
   end;
   
   if h.find() ^= 0 then do;
      /* CaseID is not in the hash-object */
      if first.DocumentId then do;
         CaseGroupID = NextGroupID;
         used = 1;
      end;
      else do;
         CaseGroupID = MissingCaseID;
      end;
      /* Add CaseID / CaseGroupID to the hash */
      h.add();
   end;
   else do;
      /* CaseID is found in the hash, set MissingCaseID to the CaseGroupID to group
       * other cases of the same document. */
      if first.DocumentID then do;
         MissingCaseID = CaseGroupID;
      end;
   end;

   if last.DocumentID and used then do;
      NextGroupID = NextGroupID + 1;
   end;
run;

Ksharp · Posted 11-16-2021 07:26 AM

data have;
   input from $ to $;
   datalines;
DC001 CA001
DC001 CA002
DC001 CA003
DC002 CA001
DC002 CA002
DC003 CA001
DC003 CA002
DC003 CA004
DC004 CA005
DC004 CA006
DC005 CA005
DC005 CA006
DC006 CA007
DC006 CA008
DC007 CA002
DC007 CA009
;
run;


data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node;
  output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;


data final_want;
 if _n_=1 then do;
   if 0 then set want;
   declare hash h(dataset:'want');
   h.definekey('node');
   h.definedata('household');
   h.definedone();
 end;
set have;
call missing(household);
rc=h.find(key:from);
rc=h.find(key:to);
drop rc node;
run;

Ksharp · Posted 11-16-2021 07:27 AM

If you have SAS/OR, could try this :

/* Same code as SAS/OR */
proc optnet data_links=have out_nodes=want GRAPH_DIRECTION=UNDIRECTED;
data_links_var from=from to=to;
concomp;
run;

_KoAn_ · Posted 11-17-2021 09:21 AM

Thank you for your help. Andreas, your solution works perfectly for what I want, even with larger datasets it's quick enough!

Grouping cases by shared documents (direct and indirect)

Re: Grouping cases by shared documents (direct and indirect)

Re: Grouping cases by shared documents (direct and indirect)

Re: Grouping cases by shared documents (direct and indirect)

Re: Grouping cases by shared documents (direct and indirect)

Re: Grouping cases by shared documents (direct and indirect)

Grouping cases by shared documents (direct and indirect)

Re: Grouping cases by shared documents (direct and indirect)

Re: Grouping cases by shared documents (direct and indirect)

Re: Grouping cases by shared documents (direct and indirect)

Re: Grouping cases by shared documents (direct and indirect)

Re: Grouping cases by shared documents (direct and indirect)

SAS Innovate 2025: Register Now