Thanks for your reply that is helpful. I have a lot of data and perhaps there is a simpler way to deal with this. I will now combine everything into a single dataset rather than the four datasets I had intended and this will reduce the duplication and make it easier for analysis. There are also two smaller datasets with ethnicity /gender/age groups variables and they all have the linking variable VAR2.
I have only been able to use the macro above on the 18 single keywords. I have several other variables that are a combination of keywords (or excluding keywords) so I am extracting these separately. The main dataset (MYLIB2.DATA1) has about 3.5 million rows and it is down to 8 variables but if I was to also merge it with the outputs from the hash objects and the two ethnicity datasets (using the linking variable var2) it would have 60+ variables, which would be large. I’ve not been able to make the hash object add the output to a variable in a single dataset so the best I could do is below. The final merge statement will be a bit messy if there is an easier way to do this it would be helpful. But I will no longer need the merge a and b statement - this was causing a warning, which I think it was because of the multiple observations with the same VAR2 in MYLIB2.DATA1
data TEST.B1 ;
if _n_= 1 then
do;
declare hash CS (dataset:”MYLIB2.DATA1(where=(
find(cats(VAR1),’XXX’, ‘I’) AND
find(cats(VAR1),’AAA’, ‘I’)
))”);
CS.defineKey('VAR2');
CS.defineDone();
end;
set MYLIB.DATA1;
if CS.check()=0;
run;
proc sql;
select count(distinct VAR2) as distinct_B1_XA
from TEST.B1;
quit;
data TEST.B1_1;
set TEST.B1;
XA = 1;
run;
data TEST.B2 (compress=yes);
if _n_= 1 then
do;
dcl hash CS(dataset:"MYLIB.DATA1(where=(
find(cats(VAR1),'AAA', 'i') OR
find(cats(VAR1),'BBB', 'i')
))");
CS.defineKey('VAR2');
CS.defineDone();
end;
set MYLIB.DATA1;
if CS.check()=0;
run;
proc sql;
select count(distinct VAR2) as distinct_B2
from TEST.B2;
quit;
data TEST.B2_1;
set TEST.B2;
AB = 1;
run;
proc sort data=TEST.B2_1 out=TEST_B2_1;
by VAR2;
run;
proc sort data=TEST.B1_1 out=TEST_B1_1 ;
by VAR2;
run;
proc sort data=MYLIB.DATA1 out= MYLIB_DATA1;
by VAR2;
run;
data test.F1;
merge TEST_B2_1 (in=a) TEST_B1_1 (in=b) MYLIB_DATA1 (in=c);
by VAR2;
output;
run;
proc delete data=TEST.B1 TEST.B1_1 TEST_B1_1 TEST.B2 TEST.B2_1 TEST_B2_1 MYLIB_DATA1;
run;
... View more