topic Duplicate Count with Datastep in SAS Programming

Duplicate Count with Datastep

BrahmanandaRao — Fri, 20 Aug 2021 08:32:39 GMT

data customers;
input id name $ 12.  Age  Address $ 12. Salary ;
format Salary 10.2 ;
datalines;
1  Ramesh      32   Ahmedabad    2000.00  
2  Khilan      25   Delhi        1500.00  
3  kaushik     23   Kota         2000.00  
4  Chaitali    25   Mumbai       6500.00  
5  Hardik      27   Bhopal       8500.00  
6  Komal       22   MP           4500.00  
7  Muffy       24   Indore      10000.00 
;
proc print;
run;


proc sort data= Customers;
by salary;
run;



proc sql;
select salary from customers
order by salary;
quit;

proc sql;
select distinct salary from customers
order by salary;
quit;


/* Count same salary */


proc sort data=Customers;
by salary;
run;

data dup_count (Keep= Salary Count)   ;
set Customers ;
by salary;
if first.salary then Count=0;
count+1;
if last.salary;
proc print noobs;
run;


/* Keep only Duplicates obs using Datastep */
data dup_count    ;
set Customers ;
by salary;
if not(first.salary  and last.salary) then output ;
proc print noobs;
run;

/* Keep only Duplicates rows using  proc sql */

proc sql; 
    
    select Salary, count(*) as count 
    from customers
    group by Salary
    having COUNT > 1 
    ;
quit;


data dup_count;
set customers;
by salary;
if first.salary then Count=0;
count+1;
if last.salary;
proc print;
run;

Required output using datastep

Salary	count
2000.00	2

Re: Duplicate Count with Datastep

PeterClemmensen — Fri, 20 Aug 2021 08:35:50 GMT

What is the logic here? Do you simply want a frequency table only with salaries with a frequency > 1?

Re: Duplicate Count with Datastep

PeterClemmensen — Fri, 20 Aug 2021 09:40:19 GMT

If so, then the data step is not the right tool:

data customers;
input id name $ 12.  Age  Address $ 12. Salary ;
format Salary 10.2 ;
datalines;
1  Ramesh      32   Ahmedabad    2000.00  
2  Khilan      25   Delhi        1500.00  
3  kaushik     23   Kota         2000.00  
4  Chaitali    25   Mumbai       6500.00  
5  Hardik      27   Bhopal       8500.00  
6  Komal       22   MP           4500.00  
7  Muffy       24   Indore      10000.00 
;

proc summary data = customers nway;
   class salary;
   var salary;
   output out = want(where = (count > 1) drop = _:) n = count;
run;

Re: Duplicate Count with Datastep

andreas_lds — Fri, 20 Aug 2021 10:40:51 GMT

Fully agree to @PeterClemmensen : the data step is not the first choice if you have to count.

If you have to use a data step, using proc sort before is necessary to keep the data step as simple as possible.

proc sort data=customers out=sorted(keep= Salary);
   by salary;
run;

data counts;
   set sorted;
   by Salary;
   where Salary = 2000;

   if first.Salary then count = 0;

   count + 1;

   if last.Salary;
run;

Another alternative is proc freq:

proc freq data=customers noprint;
   table salary / out=freq_count(where=(salary=2000) drop= percent);
run;

Re: Duplicate Count with Datastep

PeterClemmensen — Fri, 20 Aug 2021 11:22:29 GMT

Also, if you have to do this in a data step, here is a hash approach

data customers;
input id name $ 12.  Age  Address $ 12. Salary ;
format Salary 10.2 ;
datalines;
1  Ramesh      32   Ahmedabad    2000.00  
2  Khilan      25   Delhi        1500.00  
3  kaushik     23   Kota         2000.00  
4  Chaitali    25   Mumbai       6500.00  
5  Hardik      27   Bhopal       8500.00  
6  Komal       22   MP           4500.00  
7  Muffy       24   Indore      10000.00 
;

data _null_;
   dcl hash h();
   h.definekey('Salary');
   h.definedata('Salary', 'count');
   h.definedone();
 
   do until (z);
      set customers end = z;
      if h.find() ne 0 then count = 0;
      count + 1;
      h.replace();
   end;
 
   h.output(dataset : 'want(where = (count > 1))');
run;