Hello all;
I want a macro or program that allows retain those observations that meet the following conditions::
1- if the frequencies of "variable1" are greater than or equal to 3; and
2- if the frequencies of variable2 are greater than or equal to 3;
Frequency=Number (n) of records of that variable.
Here is an example of my data file:
data have;
input id $ variable1 $ variable2 $;
datalines;
9914089 100689 81999
9914096 9613166 81999
9914108 100671 91996
9914115 100690 91999
9914119 100690 91996
9914124 9512799 91996
9914130 100671 91996
9914136 9512799 91996
9914137 9512782 72005
9914148 9613166 72005
9914158 100690 91999
9914163 100671 91999
9914164 9512799 91999
;
run;
In this case, the observations to be retained would be:
data want;
input id $ variable1 $ variable2 $;
datalines;
9914108 100671 91996
9914115 100690 91999
9914119 100690 91996
9914124 9512799 91996
9914130 100671 91996
9914136 9512799 91996
9914158 100690 91999
9914163 100671 91999
9914164 9512799 91999
;
run;
I want a program that checks the two conditions simultaneously (loop?), But I do not know how to do it.
It is complex, because the frequencies are conditioned. That is, to delete or to retein by the 1st condition (by frequencies of variable1), the frequencies of variable2 change (condition 2).
How could I do it?
Thanks!
Alan
O.K. - So if I get this right then you have counts for two differen categories and you only want to keep records where the count in both categories is greater than or equal 3.
I was first thinking that this is a simple cross-tabulation but if I understand you right then that's not what you're after.
The challenge appears to be that there are two overlapping categories on the same level (and it's not about a simple cell count of cat1*catb) and when you remove a record because it doesn't meet the requirement for one category then this also impacts on the count for the other category - and vice-versa.
If I got this right then test below code if this gives you what you're after.
libname test 'C:\temp';
data have;
set test.temp2;
_const=1;
run;
proc sql;
create table freq1 as
select
identip,
count(animal) as _cnt1
from have
group by identip
having count(animal)>=3
;
create table freq2 as
select
cg,
count(animal) as _cnt2
from have
group by cg
having count(animal)>=3
;
quit;
data _null_;
/* define hashes */
if 0 then set have;
dcl hash have(dataset:'have', multidata:'y');
have.defineKey('_const');
have.defineData(all:'y');
have.defineDone();
if 0 then set freq1;
dcl hash h1(dataset:'freq1');
h1.defineKey('identip');
h1.defineData('_cnt1');
h1.defineDone();
if 0 then set freq2;
dcl hash h2(dataset:'freq2');
h2.defineKey('cg');
h2.definedata('_cnt2');
h2.defineDone();
/* set value for _const */
set have(keep=_const obs=1);
/* work out records which meet requirement */
_have_items=0;
/* iterate over hash have until all items meet requirement */
do while(_have_items ne have.num_items);
_have_items=have.num_items;
do while(have.do_over()=0);
call missing(of _cnt:);
/* condition true for records not meeting requirement */
if (h1.find() ne 0 OR _cnt1<3 or h2.find() ne 0 OR _cnt2<3) then
do;
/* amend counts */
_cnt1=sum(_cnt1,-1);
h1.replace();
_cnt2=sum(_cnt2,-1);
h2.replace();
/* remove record from hash have */
have.removedup();
end;
end;
end;
/* write result to SAS dataset */
have.output(dataset:'want(drop=_:)');
stop;
run;
proc freq data=want;
table identip /nocum nopercent;
table cg /nocum nopercent;
run;
What about calculating the number of variable1 and variable2 separately, merge them back into the data set and then apply your rule?
I'd usually say SQL but you'd still have to add them separately.
proc sql;
create table step1 as
select *, count(var1) as N_var1
from have
group by var1;
quit;
proc sql;
create table step2 as
select *, count(var2) as N_var2
from step1
group by var2;
quit;
data want;
set step2;
where N_var1>=3 and n_var2>=3;
run;
Hi, another idea that also use PROC SQL first (I see you have variable1 & 2 as caharcter data so the values are quoted in SQL)...
proc sql;
select quote(var1) into :v1 separated by ' ' from have group by var1 having count(*) ge 3;
select quote(var2) into :v2 separated by ' ' from have group by var2 having count(*) ge 3;
quit;
data want;
set have;
if var1 in (&v1) and var2 in(&v2);
run;
DATA SET: want
id var1 var2
9914108 100671 91996
9914115 100690 91999
9914119 100690 91996
9914124 9512799 91996
9914130 100671 91996
9914136 9512799 91996
9914158 100690 91999
9914163 100671 91999
9914164 9512799 91999
Hi, another idea if there are a lot of different values of VARIABLES 1 and 2 ...
proc sql;
create table fmt as
select "$v1_" as fmtname, var1 as start, "ok" as label from have
group by var1 having count(*) ge 3
union
select "$v2_" as fmtname, var2 as start, "ok" as label from have
group by var2 having count(*) ge 3;
quit;
* uncomment SELECT if you want to see the formats;
proc format cntlin=fmt;
*select $v1_ $v2_;
run;
data want;
set have;
if put(var1,$v1_.) eq "ok" and put(var2,$v2_.) eq "ok";
run;
And here yet another way how you could get to your result.
data have;
input id $ variable1 $ variable2 $;
datalines;
9914089 100689 81999
9914096 9613166 81999
9914108 100671 91996
9914115 100690 91999
9914119 100690 91996
9914124 9512799 91996
9914130 100671 91996
9914136 9512799 91996
9914137 9512782 72005
9914148 9613166 72005
9914158 100690 91999
9914163 100671 91999
9914164 9512799 91999
;
run;
proc freq data=have noprint;
table variable1 / out=freq1(rename=(count=_cnt1) drop=percent);
table variable2 / out=freq2(rename=(count=_cnt2) drop=percent);
run;
data want(drop=_:);
if _n_=1 then
do;
length _cnt1 _cnt2 8;
dcl hash h1(dataset:'freq1');
h1.defineKey('variable1');
h1.defineData('_cnt1');
h1.defineDone();
dcl hash h2(dataset:'freq2');
h2.defineKey('variable2');
h2.definedata('_cnt2');
h2.defineDone();
end;
call missing(of _cnt1:);
set have;
if h1.find()=0 and _cnt1<3 then delete;
else if h2.find()=0 and _cnt2<3 then delete;
run;
proc datasets lib=work nolist nowarn;
delete freq1 freq2;
run;
quit;
Or if your data resides in a database and you want to express this in a single SQL.
proc sql feedback;
create table want as
select
h.id,
h.variable1,
h.variable2
from
have as h
inner join
(select variable1, count(*) as cnt from have group by variable1) v1
on h.variable1=v1.variable1 and v1.cnt>-3
inner join
(select variable2, count(*) as cnt from have group by variable2) v2
on h.variable2=v2.variable2 and v2.cnt>3
order by h.id
;
quit;
The code does not work.
proc sql feedback;
create table want as
select
h.id,
h.variable1,
h.variable2
from
have as h
inner join
(select variable1, count(*) as cnt from have group by variable1) v1
on h.variable1=v1.variable1 and v1.cnt>-3
inner join
(select variable2, count(*) as cnt from have group by variable2) v2
on h.variable2=v2.variable2 and v2.cnt>3 /*----> the observations deleted here (v2.cnt> 3) affect the frequencies again in v1, and so on ...*/
order by h.id
;
quit;
If you have a big table, I would like to use Hash Table just as Patrick did .
data have;
input id $ variable1 $ variable2 $;
datalines;
9914089 100689 81999
9914096 9613166 81999
9914108 100671 91996
9914115 100690 91999
9914119 100690 91996
9914124 9512799 91996
9914130 100671 91996
9914136 9512799 91996
9914137 9512782 72005
9914148 9613166 72005
9914158 100690 91999
9914163 100671 91999
9914164 9512799 91999
;
run;
proc sql;
create table want as
select *
from have
where variable1 in
(select distinct variable1 from have group by variable1 having count(*) ge 3)
and variable2 in
(select distinct variable2 from have group by variable2 having count(*) ge 3) ;
quit;
I have tested with my data file. The codes do not work! none.
I give you another set of data to visualize the problem:
data have1;
input id $ var1 $ var2 $;
datalines;
9914119 100690 91996
9914124 9512799 91996
9914136 9512799 91996
9914163 100690 91999
9914158 100690 91999
9914164 9512799 91999
9914115 100690 72005
9914137 9512782 72005
9914108 100671 81999
9914130 100671 81999
9914089 100689 91996
9914096 9613166 91996
9914148 9613166 91999
;
run;
Is there any way to check the two conditions while the observations are eliminated?
thanks
Alan
Here I attach my data file..
Most - if not all - of the proposed solutions return the WANT data you've posted.
It appears that all of us haven't fully understood how your logic needs to work. I've got some idea and it looks to me you're after something which works sequentially.
Given that apparently no one understood what you're after I suggest that you explain your logic a bit more in detail.
Also provide a new set of sample data which only returns your desired result if we've got the logic "right" (and then also provide the correspondit WANT data for us to test).
If you're after something sequentially and in order then you should also add a sequence number column to your data.
Please also provide the number of rows / the volume of your real data so we can understand if any in-memory approach using hash tables is suitable.
Sorry for not understanding me; Yes it is correct that proposed solutions return the WANT data, but it did not work with my data file (I do not know why).
I attach my data file (temp2.sas7bdat); Has 1007 rows and 3 columns (animal identip cg).
To summarize, and I hope you understand me:
I want to exclude "identip" with less than three "animal", AND exclude "cg" with less than three "animal".
That is, when performing a PROC FREQ in the WANT data file, the frequencies by "identip" and by "cg" can not be less than 3.
Can you take a look at my data file attached?
Thanks!
O.K. - So if I get this right then you have counts for two differen categories and you only want to keep records where the count in both categories is greater than or equal 3.
I was first thinking that this is a simple cross-tabulation but if I understand you right then that's not what you're after.
The challenge appears to be that there are two overlapping categories on the same level (and it's not about a simple cell count of cat1*catb) and when you remove a record because it doesn't meet the requirement for one category then this also impacts on the count for the other category - and vice-versa.
If I got this right then test below code if this gives you what you're after.
libname test 'C:\temp';
data have;
set test.temp2;
_const=1;
run;
proc sql;
create table freq1 as
select
identip,
count(animal) as _cnt1
from have
group by identip
having count(animal)>=3
;
create table freq2 as
select
cg,
count(animal) as _cnt2
from have
group by cg
having count(animal)>=3
;
quit;
data _null_;
/* define hashes */
if 0 then set have;
dcl hash have(dataset:'have', multidata:'y');
have.defineKey('_const');
have.defineData(all:'y');
have.defineDone();
if 0 then set freq1;
dcl hash h1(dataset:'freq1');
h1.defineKey('identip');
h1.defineData('_cnt1');
h1.defineDone();
if 0 then set freq2;
dcl hash h2(dataset:'freq2');
h2.defineKey('cg');
h2.definedata('_cnt2');
h2.defineDone();
/* set value for _const */
set have(keep=_const obs=1);
/* work out records which meet requirement */
_have_items=0;
/* iterate over hash have until all items meet requirement */
do while(_have_items ne have.num_items);
_have_items=have.num_items;
do while(have.do_over()=0);
call missing(of _cnt:);
/* condition true for records not meeting requirement */
if (h1.find() ne 0 OR _cnt1<3 or h2.find() ne 0 OR _cnt2<3) then
do;
/* amend counts */
_cnt1=sum(_cnt1,-1);
h1.replace();
_cnt2=sum(_cnt2,-1);
h2.replace();
/* remove record from hash have */
have.removedup();
end;
end;
end;
/* write result to SAS dataset */
have.output(dataset:'want(drop=_:)');
stop;
run;
proc freq data=want;
table identip /nocum nopercent;
table cg /nocum nopercent;
run;
That is really really uneasy.
The simplest way is combing identip,cg as a same level and count them ,to see if this new level is greater than and equal 3.
OR YOU WANT MAKE IT VERY COMPLICATED ?
libname x v9 'c:\temp\';
data have;
set x.temp2;
run;
proc sql;
create table want as
select *
from have
group by identip,cg
having count(*) ge 3;
quit;
Here is how to get rid of overlaping animal as Patrick said,only 0 and 101997 left.
libname x v9 'c:\temp\';
data temp;
set x.temp2;
rename identip=from cg=to;
keep identip cg;
run;
data have;
length start end $ 40;
set temp temp(rename=(from=to to=from));
run;
data full;
set have end=last;
if _n_ eq 1 then do;
declare hash h();
h.definekey('node');
h.definedata('node');
h.definedone();
end;
output;
node=from; h.replace();
from=to; to=node;
output;
node=from; h.replace();
if last then h.output(dataset:'node');
drop node;
run;
data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();
if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
from_to.definekey('from');
from_to.definedata('to');
from_to.definedone();
if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
no.definekey('node');
no.definedata('node');
no.definedone();
do while(hi_no.next()=0);
household+1; output;
count=1;
key=node;_ha.add();
last=node;ha.add();
rc=hi.first();
do while(rc=0);
from=last;rx=from_to.find();
do while(rx=0);
key=to;ry=_ha.check();
if ry ne 0 then do;
node=to;output;rr=no.remove(key:node);
key=to;_ha.add();
count+1;
last=to;ha.add();
end;
rx=from_to.find_next();
end;
rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;
proc sql;
create table key as
select from as key
from temp
group by from
having count(*) lt 3
union
select to
from temp
group by to
having count(*) lt 3 ;
create table bad_id as
select distinct household
from want
where node in (select key from key);
create table good_id as
select * from want where household in (
select household from want
except
select household from bad_id );
create table final_want as
select * from x.temp2
where identip in (select node from good_id) or
cg in (select node from good_id);
quit;
Are you ready for the spotlight? We're accepting content ideas for SAS Innovate 2025 to be held May 6-9 in Orlando, FL. The call is open until September 25. Read more here about why you should contribute and what is in it for you!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.