New SAS User

Completely new to SAS or trying something new with SAS? Post here for help getting started.
BookmarkSubscribeRSS Feed
🔒 This topic is solved and locked. Need further help from the community? Please sign in and ask a new question.
pfyeo88
Calcite | Level 5

Hi,

 

I have been grappling with this problem for a while now and can't figure out a way to do this. I am using SAS EG7.1 and my data structure is Data Vault 2.0 as you will see from the code provided

 

Background

In the domain of banking, a home loan is linked to a security i.e. a House. The security can have many accounts and similarly an account may be connected to many securities.

 

In calculating an LVR (loan-to-value ratio) it is important to be able to identify all accounts and all securities that make up a relationship. 

 

Data Example

 

TABLE1

 

Account_id       Security_id       Link_id

111                      xxx                    a1

222                     xxx                    a2

333                     xxx                    a3

444                     xxx                    a4

444                     yyy                    a5

555                     yyy                    a6

666                     yyy                    a7

 

The table represents one relationship which needs to be identified as a group ID. This operation needs to be completed on a entire database scale so sequential ID's cannot be assumed. 

 

Desired Output

 

TABLE1

Account_id       Group_id       

111                      ZZZ                    

222                     ZZZ                    

333                     ZZZ                   

444                     ZZZ                  

555                     ZZZ                  

666                     ZZZ               

 

TABLE2

Security_id       Group_id

xxx                     ZZZ

yyy                     ZZZ

 

 

 

What I have

 

 

data links (keep=hub_collateral_key hub_account_key);
	merge target.lnk_coll_account (in=a )
target.lsat_coll_account (in=b where=("&long_dt"dt between effective_from_dtm and effective_to_dtm ));
by lnk_coll_account_key;
if a and b then output;
run;

data linkz (keep=hub_collateral_key hub_account_key);
set target.lnk_collateral_account;
run;

proc sort data=links;
	by hub_account_key;
run;

data Accounts (keep=group_id hub_account_key) all;
	set links;
	by hub_collateral_key;
	retain group_id 100;

	if first.hub_collateral_key then
		group_id +1;
	output Accounts;
	output all;
run;

proc sort data=accounts;
	by hub_account_key group_id;
run;

data account_dedupe;
	set accounts;
	by hub_account_key group_id;

	if first.hub_account_key then
		output;
run;

proc sql;
	create table join_one as
		select 
			a.group_id,
			b.hub_collateral_key,
			a.hub_account_key

		from account_dedupe a 
			inner join all b on (a.hub_account_key = b.hub_account_key);
quit;

data collaterals (keep=group_id hub_collateral_key) all_2;
	set join_one;
	output collaterals;
	output all_2;
run;

proc sort data=collaterals;
	by hub_collateral_key group_id;
run;

data collateral_dedupe;
	set collaterals;
	by hub_collateral_key group_id;

	if first.hub_collateral_key then
		output;
run;

proc sql;
	create table join_two as
		select 
			a.group_id,
			b.hub_account_key,
			a.hub_collateral_key

		from collateral_dedupe a 
			inner join all_2 b on (a.hub_collateral_key = b.hub_collateral_key);
quit;

data collateral_groups (keep=group_id hub_collateral_key) account_groups (keep=group_id hub_account_key);
	set join_two;
	output collateral_groups;
	output account_groups;
run;

proc sort data=collateral_groups nodupkey;
	by group_id hub_collateral_key;
run;

proc sort data=account_groups nodupkey;
	by group_id hub_account_key;
run;

 

 

Why this doesn't Work

 

This process doesn't identify the accounts only connected to security yyy as a part of the group that occurs first sequentially which is connected to secuity xxx.

 

Constraints

 

  1. Proc Sql to be used as a preference.
  2. My table contains thousands of these relationships and I need to be able to identify them all.

 

Hoping you can help.

 

Thanks in advance!

1 ACCEPTED SOLUTION

Accepted Solutions
Ksharp
Super User

data have(rename=(Account_id=from Security_id=to));
infile cards ;
input Account_id $  Security_id   $ ;
cards;       
111                      xxx                   
222                     xxx                   
333                     xxx                   
444                     xxx                   
444                     yyy                   
555                     yyy                 
666                     yyy  
;
run;
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node;
  output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;

data final_want;
if _n_=1 then do;
  if 0 then set want;
  declare hash h(dataset:'want');
  h.definekey('node');
  h.definedata('household');
  h.definedone();
end;
 set have(keep=from);
 call missing(household);
 rc=h.find(key:from);
 drop rc node;
run;


proc print;run;

View solution in original post

5 REPLIES 5
SASKiwi
PROC Star

I also work for a bank and we calculate LVRs in a similar way, except we base it on security-account-customer linkages. This is so we ensure that a customer only belongs to one LVR group. The way you are doing it means that the same customer could be in multiple LVR groups. I'm assuming this is by design and you are OK with this. If not please advise.

 

I've found DATA steps are the best way to tackle this type of problem, starting with a list of all of the linkages. Conditional logic is required and SQL is not good for this.

pfyeo88
Calcite | Level 5

Hi SASKiwi,

 

Thanks for your response. I am approaching this problem from a credit risk perspective so really only interested in the Collateral-Account relationship with the intention of revaluing our portfolio and identify areas of negative equity.

 

Do you have any code examples that you could share for reference?

Ksharp
Super User

data have(rename=(Account_id=from Security_id=to));
infile cards ;
input Account_id $  Security_id   $ ;
cards;       
111                      xxx                   
222                     xxx                   
333                     xxx                   
444                     xxx                   
444                     yyy                   
555                     yyy                 
666                     yyy  
;
run;
data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node;
  output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;

data final_want;
if _n_=1 then do;
  if 0 then set want;
  declare hash h(dataset:'want');
  h.definekey('node');
  h.definedata('household');
  h.definedone();
end;
 set have(keep=from);
 call missing(household);
 rc=h.find(key:from);
 drop rc node;
run;


proc print;run;
pfyeo88
Calcite | Level 5

Hi Ksharp,

 

Thanks for this! It appears to be working as expected however, the query performance is very slow on my data set. Are there any alternatives you could suggest that would improve the query run time?

 

Thanks,

Ksharp
Super User

It should not be . Do you have a big table ? Or could you post some data and see what kind of data you have .