Hi, My datasets like this. I wanna make sure that both datasets contains the same cusips with the same yymm_a. Thanks
data WORK.TEST2;
infile datalines dsd truncover;
input CUSIP:$8. fe:32. yymm_a:YYMMN6.;
format yymm_a YYMMN6.;
label CUSIP="CUSIP/SEDOL" fe="Mean Estimate";
datalines4;
00036020,0.06,199201
00036020,0.06,199201
00036020,0.06,199201
00036020,0.07,199201
00036110,0.91,199705
00036110,1.24,199705
00036110,1.47,199705
00036110,1.73,199705
00036110,1.55,199705
00036110,1.61,199705
00036110,1.54,199705
00036110,1.25,199705
00036110,1.19,199705
00036110,1.65,199705
00036110,1.71,199705
00036110,1.24,199705
00036110,1.46,199705
00036110,1.47,199810
00036110,1.04,199810
00036110,1.73,199810
;;;;
data WORK.TEST1;
infile datalines dsd truncover;
input CUSIP:$8. fe:32. yymm_a:YYMMN6.;
format yymm_a YYMMN6.;
label CUSIP="CUSIP/SEDOL" fe="Mean Estimate";
datalines4;
00036020,0.09,199801
00036020,0.05,199801
00036020,0.03,199801
00036020,0.03,199801
00036020,0.07,199801
00036020,0.03,199801
00036020,0.06,199801
00036020,0.03,199801
00036020,0.04,199801
00036110,0.88,199705
00036110,0.46,199705
00036110,0.57,199705
00036110,0.47,199705
00036110,0.91,199705
00036110,0.93,199705
00036110,0.41,199705
00036110,0.65,199705
00036110,0.51,199705
00036110,0.47,199705
00036110,0.88,199810
;;;;
The code I tried which worked well when the amount of samples is not very large.
data test11;
set test1;
run;
data test21;
set test2;
run;
proc sort data=test11 nodupkey;
by cusip yymm_a;
run;
proc sort data=test21 nodupkey;
by cusip yymm_e;
run;
proc sql;
create table wtest as select a.cusip,b.cusip as cusip1,a.yymm_a,b.yymm_e
from test11 as a,
test21 as b
where a.cusip=b.cusip and
a.yymm_a=b.yymm_a;
quit;
proc sql;
create table test13 as select *
from test1
where cusip in (select cusip from wtest) and
yymm_a in (select yymm_a from wtest);
quit;
proc sql;
create table test23 as select *
from test2
where cusip in (select cusip from wtest) and
yymm_e in (select yymm_e from wtest);
quit;
If I understand you, is this what you are after, a comparison:
data WORK.TEST1;
infile datalines dsd truncover;
input CUSIP:$8. fe:32. yymm_a:YYMMN6.;
format yymm_a YYMMN6.;
label CUSIP="CUSIP/SEDOL" fe="Mean Estimate";
datalines4;
00036020,0.06,199201
00036020,0.06,199201
00036020,0.06,199201
00036020,0.07,199201
00036110,0.91,199705
00036110,1.24,199705
00036110,1.47,199705
00036110,1.73,199705
00036110,1.55,199705
00036110,1.61,199705
00036110,1.54,199705
00036110,1.25,199705
00036110,1.19,199705
00036110,1.65,199705
00036110,1.71,199705
00036110,1.24,199705
00036110,1.46,199705
00036110,1.47,199810
00036110,1.04,199810
00036110,1.73,199810
;;;;
data WORK.TEST2;
infile datalines dsd truncover;
input CUSIP:$8. fe:32. yymm_a:YYMMN6.;
format yymm_a YYMMN6.;
label CUSIP="CUSIP/SEDOL" fe="Mean Estimate";
datalines4;
00036020,0.09,199801
00036020,0.05,199801
00036020,0.03,199801
00036020,0.03,199801
00036020,0.07,199801
00036020,0.03,199801
00036020,0.06,199801
00036020,0.03,199801
00036020,0.04,199801
00036110,0.88,199705
00036110,0.46,199705
00036110,0.57,199705
00036110,0.47,199705
00036110,0.91,199705
00036110,0.93,199705
00036110,0.41,199705
00036110,0.65,199705
00036110,0.51,199705
00036110,0.47,199705
00036110,0.88,199810
;;;;
proc sort data=test1 nodupkey;
by cusip yymm_a;
run;
proc sort data=test2 nodupkey;
by cusip yymm_a;
run;
data exist_only_test1 exist_only_test2 exist_test1_test2 ;
merge test1(in=a) test2(in=b );
by cusip yymm_a;
if a and b then output exist_test1_test2;
else if a and not b then output exist_only_test1;
else if b and not a then output exist_only_test2;
run;
The simplest way is to run two proc sorts and a merge:
proc sort data=test1 (keep=cusip yymm_a) out=need1 nodupkey;
by cusip yymm_a;
run;
proc sort data=test2 (keep=cusip yymm_a) out=need2 nodupkey;
by cusip yymm_a;
run;
data test1_only test2_only both;
merge need1 (in=in1) need2 (in=in2);
by cusip yymm_a;
if in1=in2 then output both;
else if in1 then output test1_only;
else output test2_only;
run;
Thank you ,your method is right, and my problem is how I can convert the result into my original datasets test1 and test2?
Hi, I posted the problem yesterday and got no answer. My datasets like this. I wanna make sure that both datasets contains the same cusips with the same yymm_a without adding observations. Thanks ahead
data WORK.TEST2;
infile datalines dsd truncover;
input CUSIP:$8. fe:32. yymm_a:YYMMN6.;
format yymm_a YYMMN6.;
label CUSIP="CUSIP/SEDOL" fe="Mean Estimate";
datalines4;00036020,0.06,199201
00036020,0.06,199201
00036020,0.06,199201
00036020,0.07,199201
00036110,0.91,199705
00036110,1.24,199705
00036110,1.47,199705
00036110,1.73,199705
00036110,1.55,199705
00036110,1.61,199705
00036110,1.54,199705
00036110,1.25,199705
00036110,1.19,199705
00036110,1.65,199705
00036110,1.71,199705
00036110,1.24,199705
00036110,1.46,199705
00036110,1.47,199810
00036110,1.04,199810
00036110,1.73,199810;;;;
data WORK.TEST1;
infile datalines dsd truncover;
input CUSIP:$8. fe:32. yymm_a:YYMMN6.;
format yymm_a YYMMN6.;
label CUSIP="CUSIP/SEDOL" fe="Mean Estimate";
datalines4;00036020,0.09,199801
00036020,0.05,199801
00036020,0.03,199801
00036020,0.03,199801
00036020,0.07,199801
00036020,0.03,199801
00036020,0.06,199801
00036020,0.03,199801
00036020,0.04,199801
00036110,0.88,199705
00036110,0.46,199705
00036110,0.57,199705
00036110,0.47,199705
00036110,0.91,199705
00036110,0.93,199705
00036110,0.41,199705
00036110,0.65,199705
00036110,0.51,199705
00036110,0.47,199705
00036110,0.88,199810;;;;
The codes I tried worked well when the amount of samples is not very large, but with the number of observations increasing, the diffrerence happened.
data test11;
set test1;
run;
data test21;
set test2;
run;
proc sort data=test11 nodupkey;
by cusip yymm_a;
run;
proc sort data=test21 nodupkey;
by cusip yymm_e;
run;
proc sql;
create table wtest as select a.cusip,b.cusip as cusip1,a.yymm_a,b.yymm_e
from test11 as a,
test21 as b
where a.cusip=b.cusip and
a.yymm_a=b.yymm_a;
quit;
proc sql;
create table test13 as select *
from test1
where cusip in (select cusip from wtest) and
yymm_a in (select yymm_a from wtest);
quit;
proc sql;
create table test23 as select *
from test2
where cusip in (select cusip from wtest) and
yymm_e in (select yymm_e from wtest);
quit;
Ok, scrub the below, I ahve just checked your other posts and this was already given to you, hence why its not a good idea to duplicate post. Please explain you request clearly, as I got to exactly the same thing as another member from what you posted, and this doesn't seem to cover what you want, hence explain what you want. I will merge this with your other post.
Can you not do it with proc compare:
proc compare base=test2 (kee=cusip yymm_a) comp=test1 (keep=cuspid yymm_a) listall; run;
Or a simple sort merge:
proc sort data=test2 out=b (keep=cusip yymm_a) nodupkey; by cusip yymm_a; run; proc sort data=test1 out=c (keep=cusip yymm_a) nodupkey; by cusip yymm_a; run; data want; merge b (in=a) c (in=b); by cusip yymm_a; if not(a and b); run;
@RW9Thanks ,sorry I didnt expaln my logic very clearly. I want the both datasets to be sperately so I can compare the change of variable g. As the result , the same cusip with same date is wanted.
Maybe show what the output should be from the given test data. Another guess, remove the not() function?
@RW9 I want the result is: As you can see , both table are seperated and cusip and yymm_a are same. I tried to merge your result with test1 and test2, but there are too many duplicated observations which I can't control in my result .
result of test1
cusip g yymm_a
00036110,0.91,199705
00036110,1.24,199705
00036110,1.47,199705
00036110,1.73,199705
00036110,1.55,199705
00036110,1.61,199705
00036110,1.54,199705
00036110,1.25,199705
00036110,1.19,199705
00036110,1.65,199705
00036110,1.71,199705
00036110,1.24,199705
00036110,1.46,199705
00036110,1.47,199810
00036110,1.04,199810
00036110,1.73,199810
result of test2
00036110,0.88,199705
00036110,0.46,199705
00036110,0.57,199705
00036110,0.47,199705
00036110,0.91,199705
00036110,0.93,199705
00036110,0.41,199705
00036110,0.65,199705
00036110,0.51,199705
00036110,0.47,199705
00036110,0.88,199810
Yes, still not really following. I am done for the day now, but got to here, maybe you can refine it with specific logic:
data WORK.TEST2; infile datalines dsd truncover; input CUSIP:$8. fe:32. yymm_a:YYMMN6.; format yymm_a YYMMN6.; label CUSIP="CUSIP/SEDOL" fe="Mean Estimate"; datalines; 00036020,0.06,199201 00036020,0.06,199201 00036020,0.06,199201 00036020,0.07,199201 00036110,0.91,199705 00036110,1.24,199705 00036110,1.47,199705 00036110,1.73,199705 00036110,1.55,199705 00036110,1.61,199705 00036110,1.54,199705 00036110,1.25,199705 00036110,1.19,199705 00036110,1.65,199705 00036110,1.71,199705 00036110,1.24,199705 00036110,1.46,199705 00036110,1.47,199810 00036110,1.04,199810 00036110,1.73,199810 ; run; data WORK.TEST1; infile datalines dsd truncover; input CUSIP:$8. fe:32. yymm_a:YYMMN6.; format yymm_a YYMMN6.; label CUSIP="CUSIP/SEDOL" fe="Mean Estimate"; datalines; 00036020,0.09,199801 00036020,0.05,199801 00036020,0.03,199801 00036020,0.03,199801 00036020,0.07,199801 00036020,0.03,199801 00036020,0.06,199801 00036020,0.03,199801 00036020,0.04,199801 00036110,0.88,199705 00036110,0.46,199705 00036110,0.57,199705 00036110,0.47,199705 00036110,0.91,199705 00036110,0.93,199705 00036110,0.41,199705 00036110,0.65,199705 00036110,0.51,199705 00036110,0.47,199705 00036110,0.88,199810 ; run; data res1 (keep=cusip fe yymm_a) res2 (keep=cusip fe_tmp yymm_tmp rename=(fe_tmp=fe yymm_tmp=yymm_a)); merge test1 (in=a) test2 (in=b rename=(fe=fe_tmp yymm_a=yymm_tmp)); by cusip; if a and b and yymm_a ne yymm_tmp then delete; if a then output res1; if b then output res2; run;
@RW9 I study your codes, and I try to explain as my understanding. First, you deleted the part of a.cusip=b.cusip& a.yymm_a ne b.yymm_a, then the left part includes a.cusip=b.cusip& a.yymm_a=b.yymm_a and a.cusip ne b.cusip; Second, you pick the left part in a. and left part in b. I wonder how your result looks like. Could you explain to me ?
My logic is very simple. you see cusip 00036110 whose date is 199705 in test1, and the date of cusip 00036110 in test2 is also 199705, so all the cusip 00036110 with date 199705 should be kept in test1 and test2.
Thanks
SAS Innovate 2025 is scheduled for May 6-9 in Orlando, FL. Sign up to be first to learn about the agenda and registration!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.