BookmarkSubscribeRSS Feed
lixuan
Obsidian | Level 7

Hi, My datasets like this. I wanna make sure that both datasets contains the same cusips with the same yymm_a. Thanks

data WORK.TEST2;
  infile datalines dsd truncover;
  input CUSIP:$8. fe:32. yymm_a:YYMMN6.;
  format yymm_a YYMMN6.;
  label CUSIP="CUSIP/SEDOL" fe="Mean Estimate";
datalines4;
00036020,0.06,199201
00036020,0.06,199201
00036020,0.06,199201
00036020,0.07,199201
00036110,0.91,199705
00036110,1.24,199705
00036110,1.47,199705
00036110,1.73,199705
00036110,1.55,199705
00036110,1.61,199705
00036110,1.54,199705
00036110,1.25,199705
00036110,1.19,199705
00036110,1.65,199705
00036110,1.71,199705
00036110,1.24,199705
00036110,1.46,199705
00036110,1.47,199810
00036110,1.04,199810
00036110,1.73,199810
;;;;
data WORK.TEST1;
  infile datalines dsd truncover;
  input CUSIP:$8. fe:32. yymm_a:YYMMN6.;
  format yymm_a YYMMN6.;
  label CUSIP="CUSIP/SEDOL" fe="Mean Estimate";
datalines4;
00036020,0.09,199801
00036020,0.05,199801
00036020,0.03,199801
00036020,0.03,199801
00036020,0.07,199801
00036020,0.03,199801
00036020,0.06,199801
00036020,0.03,199801
00036020,0.04,199801
00036110,0.88,199705
00036110,0.46,199705
00036110,0.57,199705
00036110,0.47,199705
00036110,0.91,199705
00036110,0.93,199705
00036110,0.41,199705
00036110,0.65,199705
00036110,0.51,199705
00036110,0.47,199705
00036110,0.88,199810
;;;;

The code I tried which worked  well when the amount of samples is not very large.

data test11;
set test1;
run; 
data test21;
set test2;
run;
proc sort data=test11 nodupkey;
by cusip yymm_a;
run;
proc sort data=test21 nodupkey;
by cusip yymm_e;
run;
proc sql;
create table wtest as select  a.cusip,b.cusip as cusip1,a.yymm_a,b.yymm_e
from test11  as a,
	 test21  as b
where a.cusip=b.cusip and 
	  a.yymm_a=b.yymm_a;
quit;
proc sql;
create table test13 as select *
from test1
where cusip in (select cusip from wtest) and 
	  yymm_a in (select yymm_a from wtest);
quit;
proc sql;
create table test23 as select *
from test2
where cusip in (select cusip from wtest) and 
	  yymm_e in (select yymm_e from wtest);
quit; 

 

10 REPLIES 10
novinosrin
Tourmaline | Level 20

If I understand you, is this what you are after, a comparison:

 

data WORK.TEST1;

  infile datalines dsd truncover;

  input CUSIP:$8. fe:32. yymm_a:YYMMN6.;

  format yymm_a YYMMN6.;

  label CUSIP="CUSIP/SEDOL" fe="Mean Estimate";

datalines4;

00036020,0.06,199201

00036020,0.06,199201

00036020,0.06,199201

00036020,0.07,199201

00036110,0.91,199705

00036110,1.24,199705

00036110,1.47,199705

00036110,1.73,199705

00036110,1.55,199705

00036110,1.61,199705

00036110,1.54,199705

00036110,1.25,199705

00036110,1.19,199705

00036110,1.65,199705

00036110,1.71,199705

00036110,1.24,199705

00036110,1.46,199705

00036110,1.47,199810

00036110,1.04,199810

00036110,1.73,199810

;;;;

data WORK.TEST2;

  infile datalines dsd truncover;

  input CUSIP:$8. fe:32. yymm_a:YYMMN6.;

  format yymm_a YYMMN6.;

  label CUSIP="CUSIP/SEDOL" fe="Mean Estimate";

datalines4;

00036020,0.09,199801

00036020,0.05,199801

00036020,0.03,199801

00036020,0.03,199801

00036020,0.07,199801

00036020,0.03,199801

00036020,0.06,199801

00036020,0.03,199801

00036020,0.04,199801

00036110,0.88,199705

00036110,0.46,199705

00036110,0.57,199705

00036110,0.47,199705

00036110,0.91,199705

00036110,0.93,199705

00036110,0.41,199705

00036110,0.65,199705

00036110,0.51,199705

00036110,0.47,199705

00036110,0.88,199810

;;;;

 

 


proc sort data=test1 nodupkey;
by cusip yymm_a;
run;
proc sort data=test2 nodupkey;
by cusip yymm_a;
run;

data exist_only_test1 exist_only_test2 exist_test1_test2 ;
merge test1(in=a) test2(in=b );
by cusip yymm_a;
if a and b then output exist_test1_test2;
else if a and not b then output exist_only_test1;
else if b and not a then output exist_only_test2;
run;

mkeintz
PROC Star

The simplest way is to run two proc sorts and a merge:

 

 

proc sort data=test1 (keep=cusip yymm_a) out=need1 nodupkey;

  by cusip yymm_a;

run;

 

proc sort data=test2 (keep=cusip yymm_a) out=need2 nodupkey;

  by cusip yymm_a;

run;

 

data test1_only test2_only both;

  merge need1 (in=in1) need2 (in=in2);

  by cusip yymm_a;

  if in1=in2 then output both;

  else if in1 then output test1_only;

  else output test2_only;

run;

 

--------------------------
The hash OUTPUT method will overwrite a SAS data set, but not append. That can be costly. Consider voting for Add a HASH object method which would append a hash object to an existing SAS data set

Would enabling PROC SORT to simultaneously output multiple datasets be useful? Then vote for
Allow PROC SORT to output multiple datasets

--------------------------
lixuan
Obsidian | Level 7

Thank you ,your method is right, and my problem is how I can convert the result into my original datasets test1 and test2?

lixuan
Obsidian | Level 7

Hi, I posted the problem yesterday and got no answer. My datasets like this. I wanna make sure that both datasets contains the same cusips with the same yymm_a without adding observations. Thanks ahead 

data WORK.TEST2;
  infile datalines dsd truncover;
  input CUSIP:$8. fe:32. yymm_a:YYMMN6.;
  format yymm_a YYMMN6.;
  label CUSIP="CUSIP/SEDOL" fe="Mean Estimate";
datalines4;00036020,0.06,199201
00036020,0.06,199201
00036020,0.06,199201
00036020,0.07,199201
00036110,0.91,199705
00036110,1.24,199705
00036110,1.47,199705
00036110,1.73,199705
00036110,1.55,199705
00036110,1.61,199705
00036110,1.54,199705
00036110,1.25,199705
00036110,1.19,199705
00036110,1.65,199705
00036110,1.71,199705
00036110,1.24,199705
00036110,1.46,199705
00036110,1.47,199810
00036110,1.04,199810
00036110,1.73,199810;;;;
data WORK.TEST1;
  infile datalines dsd truncover;
  input CUSIP:$8. fe:32. yymm_a:YYMMN6.;
  format yymm_a YYMMN6.;
  label CUSIP="CUSIP/SEDOL" fe="Mean Estimate";
datalines4;00036020,0.09,199801
00036020,0.05,199801
00036020,0.03,199801
00036020,0.03,199801
00036020,0.07,199801
00036020,0.03,199801
00036020,0.06,199801
00036020,0.03,199801
00036020,0.04,199801
00036110,0.88,199705
00036110,0.46,199705
00036110,0.57,199705
00036110,0.47,199705
00036110,0.91,199705
00036110,0.93,199705
00036110,0.41,199705
00036110,0.65,199705
00036110,0.51,199705
00036110,0.47,199705
00036110,0.88,199810;;;;

The codes I tried worked  well when the amount of samples is not very large, but with the number of observations increasing, the diffrerence happened.

data test11;
set test1;
run; 
data test21;
set test2;
run;
proc sort data=test11 nodupkey;
by cusip yymm_a;
run;
proc sort data=test21 nodupkey;
by cusip yymm_e;
run;
proc sql;
create table wtest as select  a.cusip,b.cusip as cusip1,a.yymm_a,b.yymm_e
from test11  as a,
	 test21  as b
where a.cusip=b.cusip and 
	  a.yymm_a=b.yymm_a;
quit;
proc sql;
create table test13 as select *
from test1
where cusip in (select cusip from wtest) and 
	  yymm_a in (select yymm_a from wtest);
quit;
proc sql;
create table test23 as select *
from test2
where cusip in (select cusip from wtest) and 
	  yymm_e in (select yymm_e from wtest);
quit; 

  

RW9
Diamond | Level 26 RW9
Diamond | Level 26

Ok, scrub the below, I ahve just checked your other posts and this was already given to you, hence why its not a good idea to duplicate post.  Please explain you request clearly, as I got to exactly the same thing as another member from what you posted, and this doesn't seem to cover what you want, hence explain what you want.  I will merge this with your other post.

 

 

Can you not do it with proc compare:

proc compare base=test2 (kee=cusip yymm_a) comp=test1 (keep=cuspid yymm_a) listall;
run;

Or a simple sort merge:

proc sort data=test2 out=b (keep=cusip yymm_a) nodupkey;
  by cusip yymm_a;
run;
proc sort data=test1 out=c (keep=cusip yymm_a) nodupkey;
  by cusip yymm_a;
run;
data want;
  merge b (in=a) c (in=b);
  by cusip yymm_a;
  if not(a and b);
run;

 

lixuan
Obsidian | Level 7

@RW9Thanks ,sorry I didnt expaln my logic very clearly. I want the  both datasets to be sperately so I can compare the change of variable g. As the result , the same cusip with same date is wanted.

RW9
Diamond | Level 26 RW9
Diamond | Level 26

Maybe show what the output should be from the given test data.  Another guess, remove the not() function?

lixuan
Obsidian | Level 7

@RW9 I want the result is: As you can see , both table are seperated and cusip and yymm_a are same. I tried to merge your result with test1 and test2, but there are too many duplicated observations which I can't control in my result .

result of test1
cusip      g  yymm_a
00036110,0.91,199705
00036110,1.24,199705
00036110,1.47,199705
00036110,1.73,199705
00036110,1.55,199705
00036110,1.61,199705
00036110,1.54,199705
00036110,1.25,199705
00036110,1.19,199705
00036110,1.65,199705
00036110,1.71,199705
00036110,1.24,199705
00036110,1.46,199705
00036110,1.47,199810
00036110,1.04,199810
00036110,1.73,199810
result of test2
00036110,0.88,199705
00036110,0.46,199705
00036110,0.57,199705
00036110,0.47,199705
00036110,0.91,199705
00036110,0.93,199705
00036110,0.41,199705
00036110,0.65,199705
00036110,0.51,199705
00036110,0.47,199705
00036110,0.88,199810

 

RW9
Diamond | Level 26 RW9
Diamond | Level 26

Yes, still not really following.  I am done for the day now, but got to here, maybe you can refine it with specific logic:

data WORK.TEST2;
  infile datalines dsd truncover;
  input CUSIP:$8. fe:32. yymm_a:YYMMN6.;
  format yymm_a YYMMN6.;
  label CUSIP="CUSIP/SEDOL" fe="Mean Estimate";
datalines;
00036020,0.06,199201
00036020,0.06,199201
00036020,0.06,199201
00036020,0.07,199201
00036110,0.91,199705
00036110,1.24,199705
00036110,1.47,199705
00036110,1.73,199705
00036110,1.55,199705
00036110,1.61,199705
00036110,1.54,199705
00036110,1.25,199705
00036110,1.19,199705
00036110,1.65,199705
00036110,1.71,199705
00036110,1.24,199705
00036110,1.46,199705
00036110,1.47,199810
00036110,1.04,199810
00036110,1.73,199810
;
run;

data WORK.TEST1;
  infile datalines dsd truncover;
  input CUSIP:$8. fe:32. yymm_a:YYMMN6.;
  format yymm_a YYMMN6.;
  label CUSIP="CUSIP/SEDOL" fe="Mean Estimate";
datalines;
00036020,0.09,199801
00036020,0.05,199801
00036020,0.03,199801
00036020,0.03,199801
00036020,0.07,199801
00036020,0.03,199801
00036020,0.06,199801
00036020,0.03,199801
00036020,0.04,199801
00036110,0.88,199705
00036110,0.46,199705
00036110,0.57,199705
00036110,0.47,199705
00036110,0.91,199705
00036110,0.93,199705
00036110,0.41,199705
00036110,0.65,199705
00036110,0.51,199705
00036110,0.47,199705
00036110,0.88,199810
;
run;

data res1 (keep=cusip fe yymm_a) res2 (keep=cusip fe_tmp yymm_tmp rename=(fe_tmp=fe yymm_tmp=yymm_a));
  merge test1 (in=a) test2 (in=b rename=(fe=fe_tmp yymm_a=yymm_tmp));
  by cusip;
  if a and b and yymm_a ne yymm_tmp then delete;
  if a then output res1;
  if b then output res2;
run;
  
lixuan
Obsidian | Level 7

@RW9 I study your codes, and I try to explain as my understanding. First, you deleted the part of a.cusip=b.cusip& a.yymm_a ne b.yymm_a, then the left part includes a.cusip=b.cusip& a.yymm_a=b.yymm_a  and a.cusip ne b.cusip; Second, you pick the left part in a. and left part in b. I wonder how your result looks like. Could you explain to me ?

My logic is very simple. you see cusip 00036110 whose date is 199705 in test1, and the date of cusip 00036110 in test2 is also 199705, so all the cusip 00036110  with date 199705 should be kept in test1 and test2.

Thanks 

SAS Innovate 2025: Save the Date

 SAS Innovate 2025 is scheduled for May 6-9 in Orlando, FL. Sign up to be first to learn about the agenda and registration!

Save the date!

How to Concatenate Values

Learn how use the CAT functions in SAS to join values from multiple variables into a single value.

Find more tutorials on the SAS Users YouTube channel.

SAS Training: Just a Click Away

 Ready to level-up your skills? Choose your own adventure.

Browse our catalog!

Discussion stats
  • 10 replies
  • 1688 views
  • 0 likes
  • 4 in conversation