<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Identify crossover between groups in SAS Enterprise Guide</title>
    <link>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797928#M40278</link>
    <description>&lt;P&gt;I suspect, for a large dataset, with many GROUPID*INDID values, this is most efficiently generated by (1) sorting by INDID/GROUPID, and (2) for each INDID, find all GROUPID combinations for that INDID and add to a running total for all INDID's.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;For each INDID, the second step is basically done by&lt;/P&gt;
&lt;OL&gt;
&lt;LI&gt;reading in a single group record&lt;/LI&gt;
&lt;LI&gt;with the current groupid in hand, iterate through a list of all prior groupid's and add to the count of each current/historical group pairs.&lt;/LI&gt;
&lt;LI&gt;add the current group id to the historical list.&lt;/LI&gt;
&lt;LI&gt;Go back to step 1.&lt;/LI&gt;
&lt;/OL&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Using&amp;nbsp;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/13884"&gt;@ballardw&lt;/a&gt;&amp;nbsp;'s dataset HAVE:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data have;
   input groupid $ indid $;
datalines;
1	A
1	B
1	C
1	D
2	A
2	B
2	C
3	D
3	E
3	F
4  A
4  C
4  E
4  F
5  B
5  D
5  F
run;

proc sort data=have out=need; by indid groupid; 
run;

data _null_;
  set need end=end_of_need;
  by indid;

  if _n_=1 then do;
    length groupid2 $8   pair_count 8;
    declare hash hist ();
      hist.definekey('indid','groupid2');
      hist.definedata('indid','groupid2');
      hist.definedone();
    declare hiter hi ('hist');

    declare hash pairs(ordered:'A');
      pairs.definekey('groupid','groupid2');
      pairs.definedata('groupid','groupid2','pair_count');
      pairs.definedone();
  end;

  /* Now iterate through all the previous GROUPIDs for this INDID */
  if not first.indid then do rc=hi.first() by 0 until (hi.next()^=0);
    pair_count=0;
    rc1=pairs.find();         /* Retrieve running total for this GROUPID,GROUPID2 pair*/
    pair_count=pair_count+1;
    rc2=pairs.replace();
  end;

  groupid2=groupid;          /* Current groupid becomes part of group history*/
  rc3=hist.add();               /* Add current groupid to hash H */
  if last.indid then hist.clear();  /*Done with this ID? empty the hash object */

  if end_of_need then pairs.output(dataset:'want');
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;This code keeps memory requirements down, since it needs only enough memory to hold (1) a list off all groupid's for a single indid (in hash object &lt;EM&gt;&lt;STRONG&gt;hist&lt;/STRONG&gt;&lt;/EM&gt;), and (2) a table of frequency for all encountered groupid pairs (no more than&amp;nbsp; &amp;nbsp; one-half of N of groups squared) - in hash object &lt;EM&gt;&lt;STRONG&gt;pairs&lt;/STRONG&gt;&lt;/EM&gt;.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;By contrast, the SQL below, while much less code, is probably a far greater consumer of memory:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sql noprint;
  create table want as select
  catx('-',a.groupid,b.groupid) as group_pair,
  count(a.indid) as intersection_count
  from have as a   left join have as b
  on a.indid = b.indid  where a.groupid&amp;lt;b.groupid
  group by calculated group_pair;
quit;
&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Wed, 23 Feb 2022 03:30:04 GMT</pubDate>
    <dc:creator>mkeintz</dc:creator>
    <dc:date>2022-02-23T03:30:04Z</dc:date>
    <item>
      <title>Identify crossover between groups</title>
      <link>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797860#M40265</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I'm trying to find a way to identify the level of crossover between groups of individuals in my data.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;For a dataset like:&lt;/P&gt;&lt;TABLE border="1"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD&gt;Group ID&lt;/TD&gt;&lt;TD&gt;Individual ID&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;1&lt;/TD&gt;&lt;TD&gt;A&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;1&lt;/TD&gt;&lt;TD&gt;B&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;1&lt;/TD&gt;&lt;TD&gt;C&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;1&lt;/TD&gt;&lt;TD&gt;D&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;2&lt;/TD&gt;&lt;TD&gt;A&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;2&lt;/TD&gt;&lt;TD&gt;B&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;2&lt;/TD&gt;&lt;TD&gt;C&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;3&lt;/TD&gt;&lt;TD&gt;D&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;3&lt;/TD&gt;&lt;TD&gt;E&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;3&lt;/TD&gt;&lt;TD&gt;F&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I'd like to be able to do something that would highlight any groups that share the a certain percentage of their members. So here'd I like to flag that for Group 1, 75% of it's members are also in Group 2, and so there's a connection between these groups, but only 25% of its members are in group 3 and so there isn't a connection here. Obviously the real data is much larger than the example above, and I'd want every group to be compared against every other one (so that a link between Group 2 and Group 1 would be flagged as 1 contains 100% of 2, and no link between 2 and 3 would be flagged as 3 contains 0% of 1).&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I'm stumped as to how to do it though, or if this is even something that is possible. Any help would be much appreciated.&lt;/P&gt;</description>
      <pubDate>Tue, 22 Feb 2022 16:43:13 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797860#M40265</guid>
      <dc:creator>dr91</dc:creator>
      <dc:date>2022-02-22T16:43:13Z</dc:date>
    </item>
    <item>
      <title>Re: Identify crossover between groups</title>
      <link>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797868#M40266</link>
      <description>&lt;P&gt;I think it is the Cartesian element (needing to compare everything to everything) that is the killer here.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;There is probably a built in function that does this and I'm sure one of the SuperUser geniuses around here can tell you about it. I don't know it.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;As a lessor mortal, I would take a no doubt incorrect line:&lt;/P&gt;
&lt;P&gt;1. Convert your data to a list of groups with an ordered string that is the group members.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;1 ABCD&lt;/P&gt;
&lt;P&gt;2 ABC&lt;/P&gt;
&lt;P&gt;3 DEF&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;2. Use a loop to compare 2 copies of the groups lists to each other using COMPGED. I am envisioning a matrix with the groups as row and column headings and a COMPGED score in each cell representing the diff in the strings for the row and column headings for that cell.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;3.&amp;nbsp; If I knew how to write it, I would have done so.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Edit:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I still don't know how to make a list of strings representing the individuals in each group but my cartesian comment made me realize a simple join would do it.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;So:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data orig;
	input GroupID Indstring $10.;
	datalines;
1 ABCD
2 ABC
3 DEF
;
run;

proc sql;
	select a.indstring, b.indstring, compged(a.indstring, b.indstring)
	from orig a, orig b;
run;
&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;gives me&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;TABLE style="border-collapse: collapse; width: 144pt;" border="0" width="192" cellspacing="0" cellpadding="0"&gt;
&lt;TBODY&gt;
&lt;TR style="height: 15.0pt;"&gt;
&lt;TD width="64" height="20" class="xl66" style="height: 15.0pt; width: 48pt;"&gt;Indstring&lt;/TD&gt;
&lt;TD width="64" class="xl66" style="width: 48pt;"&gt;Indstring&lt;/TD&gt;
&lt;TD width="64" class="xl65" style="width: 48pt;"&gt;&amp;nbsp;&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR style="height: 15.0pt;"&gt;
&lt;TD height="20" class="xl68" style="height: 15.0pt;"&gt;ABCD&lt;/TD&gt;
&lt;TD class="xl68"&gt;ABCD&lt;/TD&gt;
&lt;TD class="xl67"&gt;0&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR style="height: 15.0pt;"&gt;
&lt;TD height="20" class="xl68" style="height: 15.0pt;"&gt;ABCD&lt;/TD&gt;
&lt;TD class="xl68"&gt;ABC&lt;/TD&gt;
&lt;TD class="xl67"&gt;50&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR style="height: 15.0pt;"&gt;
&lt;TD height="20" class="xl68" style="height: 15.0pt;"&gt;ABCD&lt;/TD&gt;
&lt;TD class="xl68"&gt;DEF&lt;/TD&gt;
&lt;TD class="xl67"&gt;450&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR style="height: 15.0pt;"&gt;
&lt;TD height="20" class="xl68" style="height: 15.0pt;"&gt;ABC&lt;/TD&gt;
&lt;TD class="xl68"&gt;ABCD&lt;/TD&gt;
&lt;TD class="xl67"&gt;10&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR style="height: 15.0pt;"&gt;
&lt;TD height="20" class="xl68" style="height: 15.0pt;"&gt;ABC&lt;/TD&gt;
&lt;TD class="xl68"&gt;ABC&lt;/TD&gt;
&lt;TD class="xl67"&gt;0&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR style="height: 15.0pt;"&gt;
&lt;TD height="20" class="xl68" style="height: 15.0pt;"&gt;ABC&lt;/TD&gt;
&lt;TD class="xl68"&gt;DEF&lt;/TD&gt;
&lt;TD class="xl67"&gt;400&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR style="height: 15.0pt;"&gt;
&lt;TD height="20" class="xl68" style="height: 15.0pt;"&gt;DEF&lt;/TD&gt;
&lt;TD class="xl68"&gt;ABCD&lt;/TD&gt;
&lt;TD class="xl67"&gt;410&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR style="height: 15.0pt;"&gt;
&lt;TD height="20" class="xl68" style="height: 15.0pt;"&gt;DEF&lt;/TD&gt;
&lt;TD class="xl68"&gt;ABC&lt;/TD&gt;
&lt;TD class="xl67"&gt;400&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR style="height: 15.0pt;"&gt;
&lt;TD height="20" class="xl68" style="height: 15.0pt;"&gt;DEF&lt;/TD&gt;
&lt;TD class="xl68"&gt;DEF&lt;/TD&gt;
&lt;TD class="xl67"&gt;0&lt;/TD&gt;
&lt;/TR&gt;
&lt;/TBODY&gt;
&lt;/TABLE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Low score is high overlap, high score is low overlap.&amp;nbsp; Bring along other data elements as needed. Name things better.&amp;nbsp; Delete the rows where the strings are the same.&amp;nbsp; Sort in asc or desc order. Scale the score to whatever flag setting you like.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Edit #2:&lt;/P&gt;
&lt;P&gt;A Cartesian match with 4.8 million groups?&amp;nbsp; I'm out.&lt;/P&gt;</description>
      <pubDate>Tue, 22 Feb 2022 22:12:52 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797868#M40266</guid>
      <dc:creator>HB</dc:creator>
      <dc:date>2022-02-22T22:12:52Z</dc:date>
    </item>
    <item>
      <title>Re: Identify crossover between groups</title>
      <link>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797870#M40267</link>
      <description>&lt;P&gt;&lt;STRONG&gt;How many unique groups do you have?&lt;/STRONG&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;
&lt;P&gt;Untested but something like this should work, may not scale well if you have a lot of groups though. I think this gives you counts of the overlaps and you'll have to convert them to a percentage somehow, not sure what the denominator will be.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data long;
set have;
value=1;
run;

proc sort data=long;
by id;
run;

proc transpose data=long out=wide prefix=GROUP;
by id;
id group;
var value;
run;

*fill in missing 0's;
data wide_filled;
set wide;
array _g(*) group:;
do i=1 to dim(_g);
if _g(i) = . then _g(i) = 0;
end;
run;

ods output sscp=coocs;
proc corr data=wide_filled sscp;
var group:;
run;

proc print data=coocs;
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/417737"&gt;@dr91&lt;/a&gt;&amp;nbsp;wrote:&lt;BR /&gt;
&lt;P&gt;Hi,&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I'm trying to find a way to identify the level of crossover between groups of individuals in my data.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;For a dataset like:&lt;/P&gt;
&lt;TABLE border="1"&gt;
&lt;TBODY&gt;
&lt;TR&gt;
&lt;TD&gt;Group ID&lt;/TD&gt;
&lt;TD&gt;Individual ID&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD&gt;1&lt;/TD&gt;
&lt;TD&gt;A&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD&gt;1&lt;/TD&gt;
&lt;TD&gt;B&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD&gt;1&lt;/TD&gt;
&lt;TD&gt;C&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD&gt;1&lt;/TD&gt;
&lt;TD&gt;D&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD&gt;2&lt;/TD&gt;
&lt;TD&gt;A&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD&gt;2&lt;/TD&gt;
&lt;TD&gt;B&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD&gt;2&lt;/TD&gt;
&lt;TD&gt;C&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD&gt;3&lt;/TD&gt;
&lt;TD&gt;D&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD&gt;3&lt;/TD&gt;
&lt;TD&gt;E&lt;/TD&gt;
&lt;/TR&gt;
&lt;TR&gt;
&lt;TD&gt;3&lt;/TD&gt;
&lt;TD&gt;F&lt;/TD&gt;
&lt;/TR&gt;
&lt;/TBODY&gt;
&lt;/TABLE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I'd like to be able to do something that would highlight any groups that share the a certain percentage of their members. So here'd I like to flag that for Group 1, 75% of it's members are also in Group 2, and so there's a connection between these groups, but only 25% of its members are in group 3 and so there isn't a connection here. Obviously the real data is much larger than the example above, and I'd want every group to be compared against every other one (so that a link between Group 2 and Group 1 would be flagged as 1 contains 100% of 2, and no link between 2 and 3 would be flagged as 3 contains 0% of 1).&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I'm stumped as to how to do it though, or if this is even something that is possible. Any help would be much appreciated.&lt;/P&gt;
&lt;HR /&gt;&lt;/BLOCKQUOTE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 22 Feb 2022 18:19:36 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797870#M40267</guid>
      <dc:creator>Reeza</dc:creator>
      <dc:date>2022-02-22T18:19:36Z</dc:date>
    </item>
    <item>
      <title>Re: Identify crossover between groups</title>
      <link>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797876#M40268</link>
      <description>&lt;P&gt;How much 'much larger' is your actual data? &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 22 Feb 2022 19:49:52 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797876#M40268</guid>
      <dc:creator>PeterClemmensen</dc:creator>
      <dc:date>2022-02-22T19:49:52Z</dc:date>
    </item>
    <item>
      <title>Re: Identify crossover between groups</title>
      <link>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797880#M40269</link>
      <description>Ah, yeah, that's probably quite important, and the biggest reason this is causing me problems.&lt;BR /&gt;&lt;BR /&gt;About 4.8m groups.</description>
      <pubDate>Tue, 22 Feb 2022 20:15:19 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797880#M40269</guid>
      <dc:creator>dr91</dc:creator>
      <dc:date>2022-02-22T20:15:19Z</dc:date>
    </item>
    <item>
      <title>Re: Identify crossover between groups</title>
      <link>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797885#M40270</link>
      <description>&lt;P&gt;And how much memory do you have available?&lt;/P&gt;</description>
      <pubDate>Tue, 22 Feb 2022 20:26:03 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797885#M40270</guid>
      <dc:creator>PeterClemmensen</dc:creator>
      <dc:date>2022-02-22T20:26:03Z</dc:date>
    </item>
    <item>
      <title>Re: Identify crossover between groups</title>
      <link>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797889#M40272</link>
      <description>&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/417737"&gt;@dr91&lt;/a&gt;&amp;nbsp;wrote:&lt;BR /&gt;Ah, yeah, that's probably quite important, and the biggest reason this is causing me problems.&lt;BR /&gt;&lt;BR /&gt;About 4.8m groups.&lt;HR /&gt;&lt;/BLOCKQUOTE&gt;
&lt;P&gt;Is that after reducing to distinct groups? I.E no duplicates of group and individual id?&lt;/P&gt;
&lt;P&gt;And how many unique individual id values?&lt;/P&gt;</description>
      <pubDate>Tue, 22 Feb 2022 20:43:05 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797889#M40272</guid>
      <dc:creator>ballardw</dc:creator>
      <dc:date>2022-02-22T20:43:05Z</dc:date>
    </item>
    <item>
      <title>Re: Identify crossover between groups</title>
      <link>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797899#M40273</link>
      <description>&lt;P&gt;Here is an approach that compares pairs of group ids. There are multiple count steps the "final" data set has the percent of common elements and currently a single variable that contains the two group ids.&lt;/P&gt;
&lt;P&gt;Cautions: Your number of group ids is going to be too large to fit into a single macro variable if there are indeed 4.8 million individual group id values. If your comment of 4.8m groups meant combinations of group and individual id then this might work. This is one way to get the values of the groupid into single macro variable that could be used as the call to the not very imaginatively named Complicated macro.&lt;/P&gt;
&lt;P&gt;If your data has duplicates of the groupid and indid then you need to reduce your data set so that there are no duplicates for this to work. (Potentially significantly reducing the 4.8M number since not much detail has been provided on your data)&lt;/P&gt;
&lt;PRE&gt;Proc sql noprint;
    select distinct groupid into: grouplist separated by ' '
   from have
   ;
quit;&lt;/PRE&gt;
&lt;P&gt;And a final caution: this may take quite awhile to run depending on actual number of groups and unique records.&lt;/P&gt;
&lt;P&gt;Assuming that there are actually 10,000 group id values there are going to be on the order of 50,000,000 pairs of comparisons run. Which at 1 second each run (possibly optimistic) will take roughly one year and 7 months to complete.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Suggestion would be to possibly reduce your analysis to a limited group of either Group or individual ids.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Obviously&amp;nbsp; you need to use your variable and data set names everywhere I reference Groupid, Indid, or Have&lt;/P&gt;
&lt;P&gt;I have arbitrarily created a single variable for the pair. You could use two variables instead but either has a potential issue with lengths. The Proc sql could be used instead of the macro parameter but still needs to be watched for length.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I might suggest testing the macro with no more than 5 or so values of your actual Groupid variable that you have some ideas.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;You can filter on the resulting Percent value in the final Pairs data set.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;data have;
   input groupid $ indid $;
datalines;
1	A
1	B
1	C
1	D
2	A
2	B
2	C
3	D
3	E
3	F
4  A
4  C
4  E
4  F
5  B
5  D
5  F
;
%macro complicated(grouplist=);

%do i=1 %to %eval(%sysfunc(countw(&amp;amp;grouplist.)) - 1);
   %do j= %eval (&amp;amp;i. +1) %to %sysfunc(countw(&amp;amp;grouplist.))
       %let firstgroup= %scan(&amp;amp;grouplist.,&amp;amp;i.);
       %let secondgroup= %scan(&amp;amp;grouplist.,&amp;amp;j.);
       %put &amp;amp;firstgroup. &amp;amp;secondgroup.;
       Proc freq data=have noprint;
          where groupid in ("&amp;amp;firstgroup." "&amp;amp;secondgroup");
          tables indid*groupid/out=temp;
      run;
      proc freq data=temp order=freq noprint;
         tables indid/out=temp2 outcum;
      run;
      proc freq data=temp2 (rename=(count=tocount)) order=freq noprint;
         tables tocount/out=temp3;
      run;
      data temp4;
         set temp3;
         length group_pair $ 50;
         where tocount=2;
         group_pair = "&amp;amp;firstgroup. &amp;amp;secondgroup.";
         drop count tocount;
      run;
      proc append base=pairs data= temp4;
      run;
   %end;
%end;
%mend;

%complicated(grouplist=1 2 3 4 5 );

       
   &lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 22 Feb 2022 21:41:08 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797899#M40273</guid>
      <dc:creator>ballardw</dc:creator>
      <dc:date>2022-02-22T21:41:08Z</dc:date>
    </item>
    <item>
      <title>Re: Identify crossover between groups</title>
      <link>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797928#M40278</link>
      <description>&lt;P&gt;I suspect, for a large dataset, with many GROUPID*INDID values, this is most efficiently generated by (1) sorting by INDID/GROUPID, and (2) for each INDID, find all GROUPID combinations for that INDID and add to a running total for all INDID's.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;For each INDID, the second step is basically done by&lt;/P&gt;
&lt;OL&gt;
&lt;LI&gt;reading in a single group record&lt;/LI&gt;
&lt;LI&gt;with the current groupid in hand, iterate through a list of all prior groupid's and add to the count of each current/historical group pairs.&lt;/LI&gt;
&lt;LI&gt;add the current group id to the historical list.&lt;/LI&gt;
&lt;LI&gt;Go back to step 1.&lt;/LI&gt;
&lt;/OL&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Using&amp;nbsp;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/13884"&gt;@ballardw&lt;/a&gt;&amp;nbsp;'s dataset HAVE:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data have;
   input groupid $ indid $;
datalines;
1	A
1	B
1	C
1	D
2	A
2	B
2	C
3	D
3	E
3	F
4  A
4  C
4  E
4  F
5  B
5  D
5  F
run;

proc sort data=have out=need; by indid groupid; 
run;

data _null_;
  set need end=end_of_need;
  by indid;

  if _n_=1 then do;
    length groupid2 $8   pair_count 8;
    declare hash hist ();
      hist.definekey('indid','groupid2');
      hist.definedata('indid','groupid2');
      hist.definedone();
    declare hiter hi ('hist');

    declare hash pairs(ordered:'A');
      pairs.definekey('groupid','groupid2');
      pairs.definedata('groupid','groupid2','pair_count');
      pairs.definedone();
  end;

  /* Now iterate through all the previous GROUPIDs for this INDID */
  if not first.indid then do rc=hi.first() by 0 until (hi.next()^=0);
    pair_count=0;
    rc1=pairs.find();         /* Retrieve running total for this GROUPID,GROUPID2 pair*/
    pair_count=pair_count+1;
    rc2=pairs.replace();
  end;

  groupid2=groupid;          /* Current groupid becomes part of group history*/
  rc3=hist.add();               /* Add current groupid to hash H */
  if last.indid then hist.clear();  /*Done with this ID? empty the hash object */

  if end_of_need then pairs.output(dataset:'want');
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;This code keeps memory requirements down, since it needs only enough memory to hold (1) a list off all groupid's for a single indid (in hash object &lt;EM&gt;&lt;STRONG&gt;hist&lt;/STRONG&gt;&lt;/EM&gt;), and (2) a table of frequency for all encountered groupid pairs (no more than&amp;nbsp; &amp;nbsp; one-half of N of groups squared) - in hash object &lt;EM&gt;&lt;STRONG&gt;pairs&lt;/STRONG&gt;&lt;/EM&gt;.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;By contrast, the SQL below, while much less code, is probably a far greater consumer of memory:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sql noprint;
  create table want as select
  catx('-',a.groupid,b.groupid) as group_pair,
  count(a.indid) as intersection_count
  from have as a   left join have as b
  on a.indid = b.indid  where a.groupid&amp;lt;b.groupid
  group by calculated group_pair;
quit;
&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 23 Feb 2022 03:30:04 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Enterprise-Guide/Identify-crossover-between-groups/m-p/797928#M40278</guid>
      <dc:creator>mkeintz</dc:creator>
      <dc:date>2022-02-23T03:30:04Z</dc:date>
    </item>
  </channel>
</rss>

