How to find link between nodes - Page 2

rogerjdeangelis · Posted 03-31-2017 02:26 PM

SAS/WPS/R: How to find links between nodes

link to this message
https://goo.gl/KgN8Xw
https://communities.sas.com/t5/General-SAS-Programming/How-to-find-link-between-nodes/td-p/341819

HAVE
====

Up to 40 obs SD1.HAVE total obs=8

Obs                    STR

 1     za1 > email1 > ip1 > address1 > phone1       >> first cluster
 2     za2 > email2 > ip2 > address1 > phone2
 3     za3 > email3 > ip2 > address2 > phone5
 4     za4 > email5 > ip1 > address3 > phone13
 5     za5 > email1 > ip13 > address13 > phone13

 6     za11 > email21 > ip21 > address21 > phone21  >> second cluster
 7     za12 > email22 > ip21 > address22 > phone22
 8     za13 > email22 > ip22 > address23 > phone23

 6 and 7 are connected by ip21
 7 and 8 are connected by email22


WANT ( There are two non-connected clusters )
==============================================

CLUSTER                    UNIQUE CONNECTED CLUSTERS (i.e., each node, such as ip2, is listed only once)

 1     za1>email1>ip1>address1>phone1>za2>email2>ip2>phone2>za3>email3>
       address2>phone5>za4>email5>address3>phone13>za5>ip13>address13

 2     za11>email21>ip21>address21>phone21>za12>email22>address22>phone22>za13>ip22>address23>phone23

WORKING CODE
============

   R - all of the other code only prepares the input and the output

        cl <- clusters(graph.data.frame(combspl))$membership[-(1:length(spl))];

    The igraph package is widely used and well debugged (which is not true of all R packages)

FULL SOLUTION

/*
                 _                  _       _
 _ __ ___   __ _| | _____        __| | __ _| |_ __ _
| '_ ` _ \ / _` | |/ / _ \_____ / _` |/ _` | __/ _` |
| | | | | | (_| |   <  __/_____| (_| | (_| | || (_| |
|_| |_| |_|\__,_|_|\_\___|      \__,_|\__,_|\__\__,_|

*/

/* Build SD1.HAVE: one observation per application whose five identifiers
   are joined into a single ">"-delimited string STR (the only variable kept).
   The banner above is a block comment rather than a "*...;" comment statement
   because it contains an unmatched single quote, which that form does not
   allow. */
options validvarname=upcase;   /* variable names are stored in upper case (STR) */
libname sd1 "d:/sd1";
data sd1.have(keep=str);
  input (Application_ID Email_ID IP_ID Address_ID phone_ID) ( :$20.);
  /* Same length CATX would assign implicitly, but stated explicitly. */
  length str $ 200;
  /* Name the source variables so STR can never be part of its own input list. */
  str=catx('>', Application_ID, Email_ID, IP_ID, Address_ID, phone_ID);
cards4;
za1 email1 ip1 address1 phone1
za2 email2 ip2 address1 phone2
za3 email3 ip2 address2 phone5
za4 email5 ip1 address3 phone13
za5 email1 ip13 address13 phone13
za11 email21 ip21 address21 phone21
za12 email22 ip21 address22 phone22
za13 email22 ip22 address23 phone23
;;;;
run;quit;


/* Run the clustering step in WPS + R.
   The whole WPS program is handed to utl_submit_wps64 (defined outside this
   post) as one single-quoted argument, so the text itself must not contain
   single quotes.
   NOTE(review): every R statement below ends with a semicolon, presumably
   because the macro flattens the quoted text into one line - confirm before
   adding R "#" comments inside the quoted text.

   Steps inside PROC R:
     1. read d:/sd1/have.sas7bdat with haven and keep STR as character;
     2. split each STR on ">" and stack the tokens into a two-column data
        frame (grp = position of the observation, val = token);
     3. build a graph from that data frame and take the cluster membership,
        dropping the first length(spl) entries (presumably the vertices that
        stand for the observation numbers - confirm) so only tokens remain;
     4. for each cluster number, paste the member names together with ">"
        into the character vector a.
   IMPORT then copies a into WRK.LINKAGES. */
%utl_submit_wps64('
libname sd1 "d:/sd1";
options set=R_HOME "C:/Program Files/R/R-3.3.2";
libname wrk "%sysfunc(pathname(work))";
proc r;
submit;
source("c:/Program Files/R/R-3.3.2/etc/Rprofile.site",echo=T);
library(igraph);
library(haven);
data <-read_sas("d:/sd1/have.sas7bdat");
data<-as.character(data$STR);
spl <- strsplit(data,">");
combspl <- data.frame(
  grp = rep(seq_along(spl),lengths(spl)),
  val = unlist(spl)
);
cl <- clusters(graph.data.frame(combspl))$membership[-(1:length(spl))];
dat <- data.frame(cl);
dat[,2] <- row.names(dat);
a <- character(0);
for (i in 1:max(cl)) {
  a[i] <- paste(paste0(dat[(dat[,1] == i),][,2]), collapse=">");
};
endsubmit;
import r=a data=wrk.linkages;
run;quit;
');

/* Print the LINKAGES data set (one row per connected cluster). */
proc print data=linkages width=min;
run;quit;

Up to 40 obs from linkages total obs=2

CLUSTERS

 1     za1 > email1 > ip1 > address1 > phone1 > za2 > email2 > ip2 > phone2 >
       za3 > email3 > address2 > phone5 > za4 > email5 > address3 > phone13 >
       za5 > ip13 > address13

 2     za11 > email21 > ip21 > address21 > phone21 > za12 > email22 >
       address22 > phone22 > za13 > ip22 > address23 > phone23

Ksharp · Posted 03-20-2017 07:53 AM

In the last data step, you could add one more option to the hash table declaration to speed it up.

/* hashexp:20 requests the largest internal table (2**20 buckets) for the hash loaded from WANT. */
declare hash h(dataset:'want', hashexp: 20);

Ksharp · Posted 03-17-2017 06:37 AM

The last data step could be as simple as the following.




/* Attach a household id to every application in X by looking up its
   e-mail node in WANT (node -> household). */
data final_want;
  if _n_=1 then do;
    if 0 then set want;              /* puts NODE and HOUSEHOLD in the PDV */
    declare hash h(dataset:'want');
    h.definekey('node');
    h.definedata('household');
    h.definedone();
  end;
  set x;
  node=email_id;
  /* HOUSEHOLD comes from a SET data set and is therefore retained: clear it
     so a failed lookup does not inherit the previous row's household. */
  call missing(household);
  /* Capture the return code; an unassigned FIND() writes an ERROR to the
     log whenever the key is not in the hash. */
  rc=h.find();
  drop node rc;
run;
proc print noobs;run;

Ksharp · Posted 03-31-2017 11:16 AM

Final version of the code.



/* Raw applications: a numeric Application_ID followed by four character
   identifiers of up to 20 characters each, read with list input. */
data x;
  length Application_ID 8
         Email_ID IP_ID Address_ID phone_ID $ 20;
  input Application_ID Email_ID IP_ID Address_ID phone_ID;
datalines;
1                            email1    ip1       address1     phone1    
2                            email2    ip2        address1    phone2
3                            email3   ip2         address2    phone5
4                            email5    ip1         address3     phone13
5                            email1  ip13       address13  phone13
11 email21 ip21 address21 phone21
12 email22 ip21 address22 phone22
13 email22 ip22 address23 phone23
;
run;

/* Turn each application row of X into graph edges (FROM, TO): one edge for
   every pair of non-missing identifiers on the row. Only FROM and TO are kept. */
data have;
  set x;
  array x{*} $ Email_ID  IP_ID Address_ID phone_ID;
  length from to $ 100;

  /* Number of identifiers actually present on this row. */
  n_present = dim(x) - cmiss(of x{*});

  if n_present = 0 then delete;            /* nothing to link */
  else if n_present = 1 then do;
    /* A lone identifier has no partner on its row. Pair it with a unique
       dummy node so it still appears as its own household downstream.
       (The original test cmiss(...)=1 fired when ONE value was missing,
       i.e. three were present, and threw away their links.) */
    from = coalescec(of x{*});
    to   = uuidgen(0);
    output;
  end;
  else do;
    /* Every unordered pair (i < j) of non-missing identifiers is an edge. */
    do i = 1 to dim(x) - 1;
      from = x{i};
      do j = i + 1 to dim(x);
        to = x{j};
        if not missing(from) and not missing(to) then output;
      end;
    end;
  end;

  keep from to;
run;




/* FULL: every edge of HAVE in both directions (a->b and b->a).
   NODE: the distinct node names seen on either end of an edge, collected in
   a hash and written out once the last edge has been processed. */
data full;
  if _n_ = 1 then do;
    declare hash nodes();
    nodes.definekey('node');
    nodes.definedata('node');
    nodes.definedone();
  end;

  set have end=last;

  /* forward edge; remember its source node */
  output;
  node = from;
  nodes.replace();

  /* swap the endpoints (NODE still holds the old FROM) and emit the reverse edge */
  from = to;
  to   = node;
  output;
  node = from;
  nodes.replace();

  if last then nodes.output(dataset:'node');

  drop node;
run;


/* Assign a HOUSEHOLD number to every node: starting from each node that has
   not been reached yet, follow the FROM->TO links in FULL until no new node
   turns up. Output is one row per node with its household number. */
data want(keep=node household);
/* ha: work queue for the current household, ordered by COUNT (insertion
   sequence); the data item LAST is the node name to expand. */
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
/* _ha: set of nodes already placed in the current household. */
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();

/* from_to: adjacency list, several TO values per FROM (multidata). */
if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

/* no: nodes that have not been assigned yet; hi_no walks over them. */
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

/* Outer loop: each node still present in NO starts a new household. */
do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 /* Walk the queue in COUNT order; entries added below are picked up by hi.next(). */
 do while(rc=0);
   from=last;rx=from_to.find();
   /* Visit every neighbour TO of the queued node. */
   do while(rx=0);
     key=to;ry=_ha.check();
      /* Neighbour not seen yet: output it, drop it from NO so it cannot
         start its own household, mark it as seen and queue it. */
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
/* Household finished: reset the queue and the seen-set for the next one. */
ha.clear();_ha.clear();
end;
/* No SET statement ever reads to end of file, so stop the step explicitly. */
stop;
run;

/* Attach a household id to each application in X: try its identifiers in
   order (e-mail, IP, address, phone) and take the household of the first
   one that is found in WANT. HOUSEHOLD stays missing when none is found. */
data final_want;
  if _n_ = 1 then do;
    if 0 then set want;                    /* puts NODE and HOUSEHOLD in the PDV */
    declare hash lookup(dataset:'want');
    lookup.definekey('node');
    lookup.definedata('household');
    lookup.definedone();
  end;

  set x;
  array ids{*} $ Email_ID  IP_ID Address_ID phone_ID;

  /* Stop at the first identifier that resolves to a household. */
  do i = 1 to dim(ids) until (found);
    node = ids{i};
    call missing(household);
    found = (lookup.find() = 0);
  end;

  drop i node found;
run;
proc print noobs;run;

Re: How to find link between nodes

Re: How to find link between nodes

Re: How to find link between nodes

Re: How to find link between nodes

Re: How to find link between nodes

Re: How to find link between nodes

Re: How to find link between nodes

Re: How to find link between nodes

Registration is open

SAS Training: Just a Click Away