Hello
I need your appreciated assistance in the below case
I have a data set includes three columns Customer ID , Existing Account and New Account , the existing account could be replaced with the new account and the new account will open a new record to be a existing account , and the account may be replaced many times . Like below
Data have ; input Cust_ID $3. Exist_Account $3. New_Account $4.; datalines; 003 CA CB 002 BB BC 002 BC 001 AA AB 003 DC DD 001 AC AD 001 AD AE 003 CB CC 003 DD 003 DB DC 003 DA DB 002 BA BB 001 AE 001 AB AC 003 CC ; Run ;
for example : Existing Account "AA" replaced by new account "AB" and "AB" opened a new record to be existing account then "AB" Replaced by "AC" and "AC" opened a new record to be existing account then "AC" Replaced by "AD" and "AD" opened a new record to be existing account then "AD" Replaced by "AE" and "AE" opened a new record to be existing account .
Cust_ID | Exist_Account | New_Account |
003 | CA | CB |
002 | BB | BC |
002 | BC | |
001 | AA | AB |
003 | DC | DD |
001 | AC | AD |
001 | AD | AE |
003 | CB | CC |
003 | DD | |
003 | DB | DC |
003 | DA | DB |
002 | BA | BB |
001 | AE | |
001 | AB | AC |
003 | CC |
the results should be like the below:
Cust_ID | First_Account | Last_Account | Replacement_Count |
001 | AA | AE | 4 |
002 | BA | BC | 2 |
003 | CA | CC | 2 |
003 | DA | DD | 3 |
Thank you ...
500,000 records is small dataset as my opinion .
You must run into a dead loop like :
003 CA CB
002 CB CA
Try this one : (Edited)
proc sql;
create table ancentor as
select * from have
where exist_account not in
(select new_account from have)
order by 1;
quit;
data want;
if _n_=1 then do;
declare hash h(dataset:'have(where=(exist_account is not missing and
new_account is not missing))');
h.definekey('exist_account');
h.definedata('new_account');
h.definedone();
length kk $ 80;
declare hash k();
k.definekey('kk');
k.definedone();
end;
set ancentor;
fist=exist_account;
count=0;
do until(rc ne 0 or rx = 0);
if not missing(exist_account) then do;kk=exist_account; k.replace();end;
call missing(new_account);
rc=h.find();
rx=k.check(key:new_account);
if rc=0 then do;exist_account=new_account;if rx ne 0 then do;count+1;last=new_account;end;end;
end;
k.clear();
drop exist_account new_account rc kk rx;
run;
I wasn't sure about the extra split in 003, but this is nearly there, you can add the other restriction, maybe by adding another grouping variable to the data.
data have; input cust_id $3. exist_account $3. new_account $4.; datalines; 003 CA CB 002 BB BC 002 BC 001 AA AB 003 DC DD 001 AC AD 001 AD AE 003 CB CC 003 DD 003 DB DC 003 DA DB 002 BA BB 001 AE 001 AB AC 003 CC ; run; proc sort data=have out=want; by cust_id exist_account; run; data want; set want; retain replacement_count first_account; by cust_id; if first.cust_id then do; first_account=exist_account; if new_account ne "" then replacement_count=sum(replacement_count,1); end; else if last.cust_id then do; last_account=exist_account; output; end; else if new_account ne "" then replacement_count=sum(replacement_count,1); run;
Please re-read what I posted:
"
I wasn't sure about the extra split in 003, but this is nearly there, you can add the other restriction, maybe by adding another grouping variable to the data.
"
I do not know about the split in cust 3 - as I suggested you should add some information into the dataset to indicate a new account, maybe something like:
cust_id account exist_account ...
003 1 ...
003 2 ...
Then where I use just by cust_id, you would add in by cust_id account, and do first/last from that.
Are you sure it is one to one match ?
data have;
input cust_id $3. exist_account $3. new_account $4.;
datalines;
003 CA CB
002 BB BC
002 BC
001 AA AB
003 DC DD
001 AC AD
001 AD AE
003 CB CC
003 DD
003 DB DC
003 DA DB
002 BA BB
001 AE
001 AB AC
003 CC
;
run;
proc sql;
create table ancentor as
select * from have
where exist_account not in
(select new_account from have)
order by 1;
quit;
data want;
if _n_=1 then do;
declare hash h(dataset:'have(where=(exist_account is not missing and
new_account is not missing))');
h.definekey('exist_account');
h.definedata('new_account');
h.definedone();
end;
set ancentor;
fist=exist_account;
count=0;
do until(rc ne 0);
call missing(new_account);
rc=h.find();
if rc=0 then do;exist_account=new_account;count+1;last=new_account;end;
end;
drop exist_account new_account rc;
run;
proc print;run;
500,000 records is small dataset as my opinion .
You must run into a dead loop like :
003 CA CB
002 CB CA
Try this one : (Edited)
proc sql;
create table ancentor as
select * from have
where exist_account not in
(select new_account from have)
order by 1;
quit;
data want;
if _n_=1 then do;
declare hash h(dataset:'have(where=(exist_account is not missing and
new_account is not missing))');
h.definekey('exist_account');
h.definedata('new_account');
h.definedone();
length kk $ 80;
declare hash k();
k.definekey('kk');
k.definedone();
end;
set ancentor;
fist=exist_account;
count=0;
do until(rc ne 0 or rx = 0);
if not missing(exist_account) then do;kk=exist_account; k.replace();end;
call missing(new_account);
rc=h.find();
rx=k.check(key:new_account);
if rc=0 then do;exist_account=new_account;if rx ne 0 then do;count+1;last=new_account;end;end;
end;
k.clear();
drop exist_account new_account rc kk rx;
run;
Hi @Amr_Alaaeldin if the proc sort is right as did by RW, I think this is pretty straight forward assuming my understanding is right
data have;
infile cards truncover;
input cust_id $3. exist_account $3. new_account $4.;
datalines;
003 CA CB
002 BB BC
002 BC
001 AA AB
003 DC DD
001 AC AD
001 AD AE
003 CB CC
003 DD
003 DB DC
003 DA DB
002 BA BB
001 AE
001 AB AC
003 CC
004 AA
;
run;
proc sort data=have out=_have;
by cust_id exist_account;
run;
data want;
set _have;
by cust_id exist_account;
array t(10) $ _temporary_;/*arbitrary subscript, should assign something large like 9999*/
retain First_Account;
if first.cust_id then do; First_Account=exist_account;Replacement_Count=0;end;
if first.cust_id and missing(new_account) then
do;
last_account=exist_account;
output;
end;
else if not missing(new_account) and (new_account not in t) then
do;
if Replacement_Count=0 then First_Account=exist_account;
Replacement_Count+1;
t(Replacement_Count)=new_account;
end;
else if missing(new_account) then
do;
last_account=exist_account;
output;
Replacement_Count=0;
call missing(of t(*));
end;
keep cust_id First_Account last_account Replacement_Count;
run;
SAS Innovate 2025 is scheduled for May 6-9 in Orlando, FL. Sign up to be first to learn about the agenda and registration!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.