Re: Count distinct with a hierarchy

catkat96 · Posted 03-15-2021 07:14 AM

Hello, I have a dataset that looks like this:

data Have;
input ID callDate :ddmmyy. cat1 cat2 cat3 cat4;
format callDate ddmmyy10.;
datalines;
1	15/02/2021	0	0	0	0
1	17/02/2021	0	0	0	0
1	19/02/2021	0	0	1	0
2	08/02/2021	0	0	0	1
2	09/02/2021	1	0	0	0
3	10/02/2021	0	0	0	0
3	11/02/2021	1	0	0	0
3	15/02/2021	1	0	0	0
4	01/02/2021	0	0	0	0
4	02/02/2021	0	0	0	0
4	03/02/2021	0	0	0	0
4	04/02/2021	0	1	0	0
4	05/02/2021	0	0	0	0
4	23/02/2021	0	1	0	0
5   10/02/2021  0   0   0   0
;
run;

I just grabbed a snippet of february's data but there is data for every month, and more data for each month. Basically, a client ID can be contacted many times a month and in each call they can be put in a category.

My objective is to find out how many UNIQUE ID's entered each category each month. To do this, I need to follow a hierarchy: cat1>cat2>cat3>cat4>no category. So if a client appears in both category 1 and 3 in a month, they should only be counted for category 1, not 3.

Here is how the output should look:

data Have;
input yearmonth :mmyys7. cat1 cat2 cat3 cat4 nocat;
format yearmonth mmyys7.;
datalines;
Feb-21	2	1	1	0	1
;
run;

As you can see:

-ID 1 was counted towards Cat 3

-ID 2 was counted towards Cat 1 only, not 4, because cat1>cat4

-ID 3 was counted towards Cat 1 once

-ID 4 was counted towards Cat 2 once

-ID 5 was counted towards nocat

I believe this is not very complicated but what I've done so far has not been working:

proc sql;
create table test as
select count(distinct id) as unique_id
, month(calldate) as month
, count ( case when cat1=1 then 1 else 0 end) as cat1
from have
group by month;
quit;

proc sql;
create table test2 as
select count(distinct id) as unique_id
, month(calldate) as month
, count (distinct case when cat2=1 then 1 else 0 end) as cat2
from have
group by month;
quit;

proc sql;
create table test3 as
select count(distinct id) as unique_id
, month(calldate) as month
, count (distinct case when cat3=1 then 1 else 0 end) as cat3
from have
group by month;
quit;

proc sql;
create table test4 as
select count(distinct id) as unique_id
, month(calldate) as month
, count (distinct case when cat4=1 then 1 else 0 end) as cat4
from have
group by month;
quit;

I couldn't make it work with year-month either.

If you could show me a way to solve it in PROC SQL it is very much appreciated, since the rest of my code is all in that format.

andreas_lds · Posted 03-15-2021 07:53 AM

The complexity is caused by suboptimal data-design.

data Have;
input ID callDate :ddmmyy. cat1 cat2 cat3 cat4;
format callDate ddmmyy10.;
datalines;
1  15/02/2021  0  0  0  0
1  17/02/2021  0  0  0  0
1  19/02/2021  0  0  1  0
2  08/02/2021  0  0  0  1
2  09/02/2021  1  0  0  0
3  10/02/2021  0  0  0  0
3  11/02/2021  1  0  0  0
3  15/02/2021  1  0  0  0
4  01/02/2021  0  0  0  0
4  02/02/2021  0  0  0  0
4  03/02/2021  0  0  0  0
4  04/02/2021  0  1  0  0
4  05/02/2021  0  0  0  0
4  23/02/2021  0  1  0  0
5   10/02/2021  0   0   0   0
;
run;

data HaveReduced;
   set have;

   Cat = whichn(1, of cat1-cat4, 1);
   
   drop cat1-cat4;
run;
   
proc summary data=HaveReduced nway;
   class CallDate Id;
   format CallDate mmyy7.;
   var Cat;
   output out=unique(drop=_:) min=;
run;

proc format;
   value Category 
      1 = 'Cat 1'
      2 = 'Cat 2'
      3 = 'Cat 3'
      4 = 'Cat 4'
      5 = 'No Cat'
   ;
run;
   
option missing='0';
proc report data=unique ;
   columns CallDate Cat;
   define CallDate / group 'yearmonth';
   define Cat / n across format= Category. preloadfmt;
run;
options missing='.';

catkat96 · Posted 03-15-2021 08:05 AM

That looks good, I have a question though, how could I do to get a dataset instead of a report table in the last step?

rudfaden · Posted 03-15-2021 08:17 AM

Maybe something like this

data Have;
	input ID callDate :ddmmyy. cat1 cat2 cat3 cat4;
	format callDate ddmmyy10.;
	datalines;
1 15/02/2021 0 0 0 0
1 17/02/2021 0 0 0 0
1 19/02/2021 0 0 1 0
2 08/02/2021 0 0 0 1
2 09/02/2021 1 0 0 0
3 10/02/2021 0 0 0 0
3 11/02/2021 1 0 0 0
3 15/02/2021 1 0 0 0
4 01/02/2021 0 0 0 0
4 02/02/2021 0 0 0 0
4 03/02/2021 0 0 0 0
4 04/02/2021 0 1 0 0
4 05/02/2021 0 0 0 0
4 23/02/2021 0 1 0 0
5 10/02/2021 0 0 0 0
;
run;

proc sort data=have;
	by id cat1 cat2 cat3 cat4;
run;

data _temp;
	set have;
	by id cat1 cat2 cat3 cat4;

	if last.id then
		output;
run;

proc sql;
	select month(callDate) as calldateMonth
		,sum(cat1) as cat1
		,sum(cat2) as cat2
		,sum(cat3) as cat3
		,sum(cat4) as cat4
	from _temp
		group by calldateMonth
	;
quit;

Ksharp · Posted 03-15-2021 08:49 AM

SQL is not right tool for this scenario .

data Have;
input ID callDate :ddmmyy. cat1 cat2 cat3 cat4;
format callDate ddmmyy10.;
datalines;
1	15/02/2021	0	0	0	0
1	17/02/2021	0	0	0	0
1	19/02/2021	0	0	1	0
2	08/02/2021	0	0	0	1
2	09/02/2021	1	0	0	0
3	10/02/2021	0	0	0	0
3	11/02/2021	1	0	0	0
3	15/02/2021	1	0	0	0
4	01/02/2021	0	0	0	0
4	02/02/2021	0	0	0	0
4	03/02/2021	0	0	0	0
4	04/02/2021	0	1	0	0
4	05/02/2021	0	0	0	0
4	23/02/2021	0	1	0	0
5   10/02/2021  0   0   0   0
;
run;
proc sql;
create table temp as
select id,intnx('month',callDate,0) as callDate format monyy7.,
sum(cat1) as cat1,sum(cat2) as cat2,sum(cat3) as cat3,sum(cat4) as cat4
 from have
  group by id,calculated callDate;

create table temp2 as
select id,callDate,
case when cat1 ne 0 then 1 else 0 end as cat1,
case when cat2 ne 0 and cat1=0 then 1 else 0 end as cat2,
case when cat3 ne 0 and cat1=0 and cat2=0 then 1 else 0 end as cat3,
case when cat4 ne 0 and cat1=0 and cat2=0 and cat3=0 then 1 else 0 end as cat4
 from temp;

create table want as
select distinct callDate,
(select count(distinct id) from temp2 where callDate=a.callDate and cat1=1) as cat1,
(select count(distinct id) from temp2 where callDate=a.callDate and cat2=1) as cat2,
(select count(distinct id) from temp2 where callDate=a.callDate and cat3=1) as cat3,
(select count(distinct id) from temp2 where callDate=a.callDate and cat4=1) as cat4,
(select count(distinct id) from temp2 where callDate=a.callDate 
 and cat1=0 and cat2=0 and cat3=0 and cat4=0 ) as nocat

 from temp2 as a ;
quit;

Count distinct with a hierarchy