BookmarkSubscribeRSS Feed
rogerjdeangelis
Barite | Level 11
SAS/WPS/R: How to find links between nodes

link to this message
https://goo.gl/KgN8Xw
https://communities.sas.com/t5/General-SAS-Programming/How-to-find-link-between-nodes/td-p/341819

HAVE
====

Up to 40 obs SD1.HAVE total obs=8

Obs                    STR

 1     za1 > email1 > ip1 > address1 > phone1       >> first cluster
 2     za2 > email2 > ip2 > address1 > phone2
 3     za3 > email3 > ip2 > address2 > phone5
 4     za4 > email5 > ip1 > address3 > phone13
 5     za5 > email1 > ip13 > address13 > phone13

 6     za11 > email21 > ip21 > address21 > phone21  >> second cluster
 7     za12 > email22 > ip21 > address22 > phone22
 8     za13 > email22 > ip22 > address23 > phone23

 6 and 7 are connected by ip21
 7 and 8 are connected by email22


WANT ( There are two non-connected clusters )
==============================================

CLUSTER                    UNIQUE CONNECTED CLUSTERS (i.e., each node, such as ip2, is listed only once)

 1     za1>email1>ip1>address1>phone1>za2>email2>ip2>phone2>za3>email3>
       address2>phone5>za4>email5>address3>phone13>za5>ip13>address13

 2     za11>email21>ip21>address21>phone21>za12>email22>address22>phone22>za13>ip22>address23>phone23

WORKING CODE
============

   R - all other code is prep for input and output

        cl <- clusters(graph.data.frame(combspl))$membership[-(1:length(spl))];

    igraph package is heavily used and debugged (not true of all R packages)

FULL SOLUTION

*                _                  _       _
 _ __ ___   __ _| | _____        __| | __ _| |_ __ _
| '_ ` _ \ / _` | |/ / _ \_____ / _` |/ _` | __/ _` |
| | | | | | (_| |   <  __/_____| (_| | (_| | || (_| |
|_| |_| |_|\__,_|_|\_\___|      \__,_|\__,_|\__\__,_|

;

/* Build the sample input: one row per application, with its five     */
/* identifiers joined by '>' into the single variable STR.            */
/* VALIDVARNAME=UPCASE stores the variable name in upper case (STR).  */
options validvarname=upcase;
libname sd1 "d:/sd1";
data sd1.have(keep=str);
  input Application_ID :$20.
        Email_ID       :$20.
        IP_ID          :$20.
        Address_ID     :$20.
        phone_ID       :$20.;
  /* chr lists the five character variables read above */
  array chr _character_;
  str = catx('>', of chr[*]);
datalines4;
za1 email1 ip1 address1 phone1
za2 email2 ip2 address1 phone2
za3 email3 ip2 address2 phone5
za4 email5 ip1 address3 phone13
za5 email1 ip13 address13 phone13
za11 email21 ip21 address21 phone21
za12 email22 ip21 address22 phone22
za13 email22 ip22 address23 phone23
;;;;
run;quit;


/*--------------------------------------------------------------------------
  Find the connected clusters with R's igraph and print them.

  The quoted text below is passed as a single argument to %utl_submit_wps64.
  NOTE(review): that macro is defined elsewhere; presumably it runs the text
  as a WPS program - confirm. All commentary is kept out here rather than
  inside the quoted argument so the submitted text is unchanged.

  What the submitted program does:
    - assigns SD1, points R_HOME at the R 3.3.2 install, and assigns WRK to
      the path of this session's WORK library;
    - inside PROC R: sources Rprofile.site, loads igraph and haven, reads
      d:/sd1/have.sas7bdat and keeps the STR column as character;
    - splits every STR on ">" and builds a two-column data frame "combspl"
      (grp = row number of the string, val = one token from that string);
    - builds a graph from that data frame and takes the cluster membership,
      dropping the first length(spl) entries
      (presumably those are the row-number vertices, leaving only the
      token vertices - confirm against igraph's vertex ordering);
    - for each cluster id 1..max(cl), pastes the row names of that cluster's
      rows (taken from the names carried by cl) into one ">"-separated
      string a[i];
    - imports the R vector "a" as dataset WRK.LINKAGES.

  Because WRK points at this session's WORK path, the PROC PRINT that follows
  the macro call reads the same dataset as LINKAGES.
--------------------------------------------------------------------------*/
%utl_submit_wps64('
libname sd1 "d:/sd1";
options set=R_HOME "C:/Program Files/R/R-3.3.2";
libname wrk "%sysfunc(pathname(work))";
proc r;
submit;
source("c:/Program Files/R/R-3.3.2/etc/Rprofile.site",echo=T);
library(igraph);
library(haven);
data <-read_sas("d:/sd1/have.sas7bdat");
data<-as.character(data$STR);
spl <- strsplit(data,">");
combspl <- data.frame(
  grp = rep(seq_along(spl),lengths(spl)),
  val = unlist(spl)
);
cl <- clusters(graph.data.frame(combspl))$membership[-(1:length(spl))];
dat <- data.frame(cl);
dat[,2] <- row.names(dat);
a <- character(0);
for (i in 1:max(cl)) {
  a[i] <- paste(paste0(dat[(dat[,1] == i),][,2]), collapse=">");
};
endsubmit;
import r=a data=wrk.linkages;
run;quit;
');

proc print data=linkages width=min;
run;quit;

Up to 40 obs from linkages total obs=2

CLUSTERS

 1     za1 > email1 > ip1 > address1 > phone1 > za2 > email2 > ip2 > phone2 >
       za3 > email3 > address2 > phone5 > za4 > email5 > address3 > phone13 >
       za5 > ip13 > address13

 2     za11 > email21 > ip21 > address21 > phone21 > za12 > email22 >
       address22 > phone22 > za13 > ip22 > address23 > phone23



Ksharp
Super User

In the last data step, you could add one more option to the hash table declaration to speed it up.

 

/* Hash declaration for the last data step with the extra HASHEXP option added; */
/* HASHEXP sets the hash table size exponent (see the SAS DECLARE documentation). */
declare hash h(dataset:'want', hashexp: 20);

  

Ksharp
Super User
The last data step could be as simple as the following.




/* Attach a household id to every application in X by looking up its   */
/* Email_ID in WANT (node -> household).                               */
/*                                                                     */
/* Fixes over the bare h.find() call:                                  */
/*  - the return code is captured, so a key that is not in WANT no     */
/*    longer writes an ERROR to the log;                               */
/*  - HOUSEHOLD is reset for each row. It comes from a SET statement   */
/*    and is therefore retained, so a failed lookup would otherwise    */
/*    silently keep the previous row's household.                      */
data final_want;
if _n_=1 then do;
 /* compile-time only: defines NODE and HOUSEHOLD as host variables */
 if 0 then set want;
 declare hash h(dataset:'want');
 h.definekey('node');
 h.definedata('household');
 h.definedone();
end;
 set x;
  node=email_id;
  call missing(household);
  /* rc=0 when found; on a miss HOUSEHOLD stays missing */
  rc=h.find();
 drop node rc;
run;
proc print noobs;run;

Ksharp
Super User

Final version of the code.

 



/* Sample applications: a numeric Application_ID followed by four     */
/* character identifiers (each read with the :$20. informat).         */
data x;
  input Application_ID
        Email_ID   :$20.
        IP_ID      :$20.
        Address_ID :$20.
        phone_ID   :$20.;
datalines;
1                            email1    ip1       address1     phone1    
2                            email2    ip2        address1    phone2
3                            email3   ip2         address2    phone5
4                            email5    ip1         address3     phone13
5                            email1  ip13       address13  phone13
11 email21 ip21 address21 phone21
12 email22 ip21 address22 phone22
13 email22 ip22 address23 phone23
;
run;

/* Turn every application row into graph edges (FROM -> TO) between   */
/* the identifiers that appear on it.                                 */
/*                                                                    */
/*  - all identifiers missing : row is dropped;                       */
/*  - exactly one non-missing : no pair can be formed, so the lone    */
/*    identifier is linked to a generated UUID to keep it in the      */
/*    graph;                                                          */
/*  - otherwise               : one edge per pair of non-missing      */
/*    identifiers.                                                    */
/*                                                                    */
/* Fix: the single-identifier test was cmiss(...)=1 (exactly one      */
/* MISSING value). With four identifiers that sent rows holding three */
/* values into the UUID branch and dropped their real links. The test */
/* is now cmiss(...)=dim(x)-1 (exactly one NON-missing value).        */
data have;
 set x;
 array x{*} $ Email_ID  IP_ID Address_ID phone_ID;
 length from to $ 100;

 if cmiss(of x{*})=dim(x) then delete;
 else if cmiss(of x{*})=dim(x)-1 then do;
     /* coalescec returns the only non-missing identifier */
     from=coalescec(of x{*});to=uuidgen(0);output;
 end;
 else do;
     /* every unordered pair (i<j) of identifiers on this row */
     do i=1 to dim(x)-1;
       from=x{i};
       do j=i+1 to dim(x);
         to=x{j};
         if not missing(from) and not missing(to) then output;
       end;
     end;
 end;

 keep from to;
run;




/* Make the edge list symmetric and collect the distinct nodes.        */
/*  - FULL : every edge of HAVE written twice, once as (from,to) and   */
/*           once with the two ends swapped.                           */
/*  - NODE : one row per distinct endpoint, written from hash H when   */
/*           the last observation of HAVE has been processed.          */
data full;
  set have end=last;
  if _n_ eq 1 then do;
   /* key-only style table: NODE is both key and data, so replace() */
   /* keeps a single entry per distinct node value                  */
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  /* edge in its original direction; remember its FROM end */
  output;
  node=from; h.replace();
  /* swap the ends (NODE still holds the original FROM) and write the reverse edge */
  from=to; to=node;
  output;
  /* FROM now holds the original TO end; remember it as well */
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


/* Assign a HOUSEHOLD number to every node by walking the graph in FULL. */
/* Output WANT has one row per node with the household it belongs to.    */
/*                                                                       */
/* Hash objects used:                                                    */
/*   ha      - ordered by COUNT, data LAST: nodes queued for expansion   */
/*             in the order they were discovered (iterated by HI);       */
/*   _ha     - key-only table of nodes already seen in this household;   */
/*   from_to - multidata lookup FROM -> TO loaded from FULL (rows with   */
/*             both ends non-missing);                                   */
/*   no      - all nodes from dataset NODE, iterated by HI_NO to pick    */
/*             the starting node of each household.                      */
data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();

/* never executed: only defines FROM and TO as host variables */
if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

/* never executed: only defines NODE as a host variable */
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

/* outer loop: each node still present in NO starts a new household */
do while(hi_no.next()=0);
 /* sum statement: next household number; write the starting node */
 household+1; output;
 count=1;
 /* mark the starting node as seen and queue it as entry 1 */
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 /* walk the queue; entries appended below are picked up by hi.next() */
 do while(rc=0);
   /* visit every neighbour (TO) of the queued node */
   from=last;rx=from_to.find();
   do while(rx=0);
     /* ry ne 0 means TO has not been seen in this household yet */
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       /* write the neighbour with the current household and take it out */
       /* of NO so the outer loop does not start a household from it     */
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       /* append the neighbour to the queue under the next COUNT */
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
/* reset the queue and the seen-set before the next household */
ha.clear();_ha.clear();
end;
/* the step reads no dataset through SET, so end it explicitly */
stop;
run;

/* Add HOUSEHOLD to every application in X: try each identifier on the */
/* row in turn and take the household of the first one found in WANT.  */
data final_want;
  if _n_ = 1 then do;
    /* compile-time only: brings NODE and HOUSEHOLD into the PDV */
    if 0 then set want;
    declare hash h(dataset: 'want');
    h.definekey('node');
    h.definedata('household');
    h.definedone();
  end;

  set x;
  array ids{*} $ Email_ID IP_ID Address_ID phone_ID;

  /* HOUSEHOLD is retained (it comes from SET), so clear it per row; */
  /* a failed find() leaves it untouched, i.e. missing               */
  call missing(household);
  do i = 1 to dim(ids);
    node = ids{i};
    if h.find() = 0 then leave;
  end;

  drop i node;
run;
proc print noobs;
run;

Ready to join fellow brilliant minds for the SAS Hackathon?

Build your skills. Make connections. Enjoy creative freedom. Maybe change the world. Registration is now open through August 30th. Visit the SAS Hackathon homepage.

Register today!
How to Concatenate Values

Learn how to use the CAT functions in SAS to join values from multiple variables into a single value.

Find more tutorials on the SAS Users YouTube channel.

Click image to register for webinar

Classroom Training Available!

Select SAS Training centers are offering in-person courses. View upcoming courses for:

View all other training opportunities.

Discussion stats
  • 18 replies
  • 1466 views
  • 0 likes
  • 4 in conversation