The OP's problem is one of finding (connected) components given a graph, once we convert the given data into one. Below is my try that implements the depth-first search(DFS) based component finding algorithm running on the graph represented as an edgelist stored in a hash-of-hashes. This runs pretty fast. On my desktop with 32bit windows, it usually takes 10 to 20 seconds or so given N=10000. But it consumes an enormous amount of memory. The code asked for 910M at peak and used 512M while running for N=10000, most of which seems to be overhead for hash objects. Enjoy. options nocenter mprint fullstimer; /* test data */ %let N=10000; %let seed=20110921; data one; length id 4 v1-v4 $3; array arr v1-v4; alpha = "abcdefghijklmnopqrstuvwxyz"; alphaLen = vlength(alpha); vlen = vlength(v1); do id = 1 to &N; call missing(of v1-v4); do i = 1 to 4; do j = 1 to vlen; p = ceil(alphaLen*ranuni(&seed)); substr(arr,j,1) = substr(alpha,p,1); end; end; output; keep id v1-v4; end; run; /* a tiny test dataset -- uncomment to use this one data one; input id (v1 v2 v3 v4) ($); cards; 1 a b c d the graph has four (connected) components, 2 a e f g including two singletons, like so: 3 . . . . 4 i j k l 1 - 2 - 6 - 9 5 . . m . \ | 6 . n f o 8 7 . . m . 3 8 . n . g 5 - 7 9 . . . o 4 ; run; */ /* step one: convert it to a graph represented as an edge list. */ %macro matches; %local i j k; %let k = 1; %do i = 1 %to 4; %do j = &i %to 4; %let k = %eval(&k + 1); union all select m&k..id as tail, n&k..id as head from one as m&k., one as n&k. where m&k..v&i = n&k..v&j and not missing(m&k..v&i) %end; %end; %mend matches; proc sql _method; drop index id on one; create index id on one (id); drop index v1 on one; create index v1 on one (v1); drop index v2 on one; create index v2 on one (v2); drop index v3 on one; create index v3 on one (v3); drop index v4 on one; create index v4 on one (v4); create table g as select distinct mn.tail, mn.head from ( select m1.id as tail, n1.id as head from one as m1, one as n1 where m1.id=n1.id %matches ) as mn order by mn.tail, mn.head; quit; /* Step Two: to find connected components and singletons. depth-first search(DFS) of a hash-of-hash representation of the graph. the limit on the depth of recursive link blocks is 11 by default but can be changed by the stack= data statement option. */ data c(rename=(tail=vertex cid=component)) /stack=&N; retain OK top component 0; dcl hash tails(); dcl hash heads(); dcl hash stack(); dcl hiter ti; dcl hiter hi; link main; main: link loadData; do rc = ti.first() by 0 while (rc = OK); tails.find(); if missing(cid) then do; component + 1; cid = component; tails.replace(); link connect; end; rc = ti.next(); end; link doOutput; stop; return; connect: link pushStack; hi = _new_ hiter('heads'); do rc = hi.first() by 0 while (rc = OK); tail = head; tails.find(); if missing(cid) then do; cid = component; tails.replace(); link connect; end; rc = hi.next(); end; link popStack; return; loadData: /* load data into a hash of hashes */ tails = _new_ hash(ordered:'a'); tails.defineKey('tail'); tails.defineData('tail', 'cid', 'heads'); tails.defineDone(); do until (end); set g end=end; by tail; call missing(cid); if first.tail then do; heads = _new_ hash(ordered:'a'); heads.defineKey('head'); heads.defineData('head'); heads.defineDone(); tails.replace(); end; heads.replace(); end; put "NOTE: done loading data into memory hash-of-hashes."; ti = _new_ hiter('tails'); /* a call stack */ stack = _new_ hash(ordered:'a'); stack.defineKey('top'); stack.defineData('tail', 'cid', 'heads', 'head', 'rc', 'hi'); stack.defineDone(); return; popStack: if stack.find() ^= OK then return; stack.remove(); top + (-1); return; pushStack: top + 1; stack.add(); return; doOutput: do rc = ti.first() by 0 while (rc = OK); keep tail cid; output c; rc = ti.next(); end; return; run; /* check -- how many clusters? */ proc sql; select size as 'component size'n, count(*) as 'number of components'n from ( select component as componentID, count(*) as size from c group by component ) group by size order by size desc; quit; /* on lst: for the small dataset component number of size components --------------------- 5 1 2 1 1 2 for the larger one component number of size components --------------------- 9964 1 2 3 1 30 */
... View more