Solved: Re: Removing Duplicate Dates with conditions

sjm · Posted 05-09-2020 09:11 AM

Hi All,

The datafile below records annual data on four firm characteristics (A,B,C and D) that has duplicate values in some years (namely 2014 for firm 1 and 2016 for firm 2). There should however be only one annual observation per firm. I need to remove the duplicate observation that occurs in those years. The criteria is to accept the most recent date if there are at least two variables (A,B, C or D) with non-missing values for that observation. Otherwise, accept the duplicate firm-year observation instead. The example datafile is given below:



Obs	Firm	Date	A	B	C	D
1	1	30/6/2014	1.6		3.6
2	1	31/12/2014	1.2	3.1	3.0	5.6
3	1	31/12/2015	1.7	2.5	2.3	5.1
4	1	31/12/2016	1.4	2.3	1.9	6.0
5	2	31/12/2014	6.2	5.7	7.2	8.0
6	2	31/12/2015	5.8	6.2	7.0	8.2
7	2	31/3/2016	5.9	6.4	6.5	8.9
8	2	30/9/2016	6.5	7.0

Based on the critera, obs 2 should be chosen for firm 1 since it is the most recent date and there are more than two variables without missing values. For firm 2, obs 7 should be chosen since obs 8 with the most recent date, has values missing for more than 2 variables, while values for all four variables are available for obs 7. The resulting output should look like:

Firm	Date	A	B	C	D
1	31/12/2014	1.2	3.1	3.0	5.6
1	31/12/2015	1.7	2.5	2.3	5.1
1	31/12/2016	1.4	2.3	1.9	6.0
2	31/12/2014	6.2	5.7	7.2	8.0
2	31/12/2015	5.8	6.2	7.0	8.2
2	31/3/2016	5.9	6.4	6.5	8.9

I tried to adapt a code that I found (see below

data test1;
	set test;
	year=year(date);
run;
data test2;
	set test1;
	by firm year;
	retain pre_date; drop prev_date;
	if first.firm then do;
		prev_year=year;
		output;
		end; else;
	if year-prev_year>1 then do;
	output;
	end; else delete;
run;

but could not get the result I wanted. Would appreciate your help please. Thanks.

ed_sas_member · Posted 05-11-2020 10:57 AM

Hi @sjm

Thank you for the clarification 😊

I have adapted the program as follows. I have also edited the list of removed observations so that you can check that the program works according to your expectations.

Best,

data have;
	set mylib.testfile;	/*Input file*/
	obs_num+1; 			/*Observation number*/
run;

proc sql;
	
	create table want as

	/****************************/
	/*obs with 1 record per quarter per year*/
	select *
	from have
	group by GVKEY, DATADATE
	having count(DATADATE)=1
	/****************************/

	union all corr

	/*********************************************************************/
	/*obs with >1 record per year AND 1 record with the min number of missing values */	
	select *
	from (select *
		from 
			(select *, nmiss(ATQ,CHEQ,LTQ,REVTQ) as count_miss
			from have
			group by GVKEY, DATADATE
			having count(DATADATE)>1)
		group by GVKEY, DATADATE
		having count_miss = min(count_miss))
		group by GVKEY, DATADATE
		having count(DATADATE)=1
		
	/*********************************************************************/

	union all corr

	/*********************************************************************/
	/*obs with >1 record per year AND >1 records:
		- with the min number of missing values
		- 1 record having sames years in both DATADATE & DATAFQTR variables */

	select *
	from (select *
		  from (select *
				from 
					(select *, nmiss(ATQ,CHEQ,LTQ,REVTQ) as count_miss
					from have
					group by GVKEY, DATADATE
					having count(DATADATE)>1)
		 		group by GVKEY, DATADATE
				having count_miss = min(count_miss))
		  group by GVKEY, DATADATE
		  having count(DATADATE)>1)
	where year(DATADATE)=input(substr(DATAFQTR,1,4),4.)
	group by GVKEY, DATADATE
	having count(DATADATE)=1
	
	/*********************************************************************/

union all corr

	/*********************************************************************/
	/*obs with >1 record per year AND >1 records:
		- with the min number of missing values
		- >1 records having sames years in both DATADATE & DATAFQTR variables */
	select *
	from(select *
		from (select *
			  from (select *
					from 
						(select *, nmiss(ATQ,CHEQ,LTQ,REVTQ) as count_miss
						from have
						group by GVKEY, DATADATE
						having count(DATADATE)>1)
			 		group by GVKEY, DATADATE
					having count_miss = min(count_miss))
			  group by GVKEY, DATADATE
			  having count(DATADATE)>1)
		where year(DATADATE)=input(substr(DATAFQTR,1,4),4.)
		group by GVKEY, DATADATE
		having count(DATADATE)>1)
	group by GVKEY, DATADATE
	having input(substr(DATAFQTR,6,1),1.) = max(input(substr(DATAFQTR,6,1),1.))
	
	/*********************************************************************/
	
 	order by obs_num;
quit;

/******* Quality checks *******/

data check_delete;
	merge have(in=x) want(in=y);
	by obs_num;
	if y=0;
run;

proc print data=check_delete;
	title 'QUALITY CHECK: list of removed observations';
	id obs_num;
run;

View solution in original post

ed_sas_member · Posted 05-09-2020 09:22 AM

Hi @sjm

Just a clarification: for firm 2, you says that "obs 7 should be chosen since obs 8 with the most recent date, has values missing for more than 2 variables" -> but obs 7 has only 2 missing values.

Is the rule to consider the most recent date if 0 or 1 value max are missing?

Best,

sjm · Posted 05-09-2020 09:46 AM

Thanks for your question. Sorry about that! It should be missing more than one. But the main idea of the rule is to try to use the most recent observation as much as possible unless that observation has too many variables with missing values, in which case the duplicate is chosen.

novinosrin · Posted 05-09-2020 10:44 AM

Hi @sjm Assuming I understand your requirement, here is a solution


data have;
input Obs	Firm	Date :ddmmyy10.	A	B	C	D;
format date ddmmyy10.;
cards;
1	1	30/6/2014	1.6	.	3.6	.
2	1	31/12/2014	1.2	3.1	3	5.6
3	1	31/12/2015	1.7	2.5	2.3	5.1
4	1	31/12/2016	1.4	2.3	1.9	6
5	2	31/12/2014	6.2	5.7	7.2	8
6	2	31/12/2015	5.8	6.2	7	8.2
7	2	31/3/2016	5.9	6.4	6.5	8.9
8	2	30/9/2016	6.5	7	.	.
;


data want;
 do _n_=1 by 1 until(last.date);
  set have;
  by firm date groupformat;
  format date year.;
  array t A	B	C	D;
  if n(of t(*))>=_n then do;
    _iorc_=date;
	_n= n(of t(*));
  end;
 end;
 do _n_=1 to _n_;
  set have;
  if date=_iorc_ then output;
 end;
 _iorc_=.;
 drop _:;
run;
proc datasets lib=work noprint;
 modify want;
 format date ddmmyy10.;
run;

proc print noobs;run;

sjm · Posted 05-11-2020 08:00 AM

Thanks, Novinosrin.

ed_sas_member · Posted 05-09-2020 10:46 AM

Hi @sjm

Another question comes up to my mind: what if you have several observations for 1 company and a specific year, and all fo them have 2 or more missing values? Should we take the latest date?

I have added some use cases in the datalines accordingly to clarify my purpose.

Here is an SQL code that will give the result. For the mentioned use case, it takes the max value (cf. comment)

Best,

data have;
	infile datalines dlm="," dsd missover;
	input Obs Firm Date:DDMMYY10. A B C D;
	format date DDMMYY10.;
	datalines;
1,1,30/6/2014,1.6,.,3.6,.
2,1,31/12/2014,1.2,3.1,3.0,5.6
3,1,31/12/2015,.,4,2.3,5.1
4,1,31/12/2016,1.4,2.3,1.9,6.0
5,2,31/12/2014,6.2,5.7,7.2,8.0
6,2,30/12/2015,.,.,7.0,.
7,2,31/12/2015,.,.,7.0,8.2
8,2,31/3/2016,5.9,6.4,6.5,8.9
9,2,28/4/2016,5.9,6.4,6.5,8.9
10,2,30/9/2016,6.5,7.0,7,.
;
run;

proc sql;
	
	create table want as

	/****************************/
	/*obs with 1 record per year*/
	select *
	from have
	group by Firm, year(date)
	having count(year(date))=1
	/****************************/

	union all corr

	/*********************************************************************/
	/*obs with >1 record per year and all records with > 1 missing values*/	
	select a.*
	from have as a right join
		(select Firm, max(date) as Date2 format=DDMMYY10. /* Min Max ??*/
		from have
		group by Firm, year(date)
		having count(year(date))>1 and min(nmiss(A,B,C,D)) > 1) as b
		on a.Firm=b.Firm and a.Date=b.Date2
	/*********************************************************************/

	union all corr

	/*********************************************************************/
	/*obs with >1 record per year and at least one record with 0 or 1 missing value*/
	select a.*
	from have as a right join
		(select Firm, max(date) as Date2 format=DDMMYY10.
		from (select *
			  from have
			  group by Firm, year(date)
	     	  having count(year(date))>1 and min(nmiss(A,B,C,D)) <= 1)
	     where nmiss(A,B,C,D)<= 1
	     group by Firm, year(date)) as b
	     on a.Firm=b.Firm and a.Date=b.Date2
	/*********************************************************************/
	
	 order by obs;
	 
quit;

sjm · Posted 05-10-2020 12:32 AM

Thank you for your suggestion. Could I check whether the second part of the code that selects "obs with 1 record per year and at least one record with 0 or 1 missing value" also captures the situation where there are three same-year observations for the firm that all have no missing values. The rule in such a case would be to pick the observation with max(date).

Shmuel · Posted 05-10-2020 12:57 AM

Its not clear to me what exactly you want to check.

if it is to select the last date just change:

1) sort by date instead by yy

2) select by if first.date instead first.yy

or maybe you are looking for:

if first.yy or miss > 1;

sjm · Posted 05-10-2020 01:45 AM

Thank you for your suggestion, Shmuel. Regarding your question: what I wanted was to select one annual observation from two or more annual observations on the same firm. The criteria was based on, most importantly, the observation with the most recent date. However, if the observation did not have much information i.e. in terms of having variables with missing values, then I would use one of the other two dates within the same year which had more information i.e. more variables with non-missing values.

Shmuel · Posted 05-10-2020 01:58 AM

The posted code fits the results you asked.

In case there is some year not fitting what you want,

please point to that line or add new lines to the test example.

ed_sas_member · Posted 05-10-2020 04:00 AM

Hi @sjm

Thank you for the quick reply.

I have added such a case in the below data (Firm2, 2017-> records 11, 12, 13):

data have;
	infile datalines dlm="," dsd missover;
	input Obs Firm Date:DDMMYY10. A B C D;
	format date DDMMYY10.;
	datalines;
1,1,30/6/2014,1.6,.,3.6,.
2,1,31/12/2014,1.2,3.1,3.0,5.6
3,1,31/12/2015,.,4,2.3,5.1
4,1,31/12/2016,1.4,2.3,1.9,6.0
5,2,31/12/2014,6.2,5.7,7.2,8.0
6,2,30/12/2015,.,.,7.0,.
7,2,31/12/2015,.,.,7.0,8.2
8,2,31/3/2016,5.9,6.4,6.5,8.9
9,2,28/4/2016,5.9,6.4,6.5,8.9
10,2,30/9/2016,6.5,7.0,7,.
11,2,28/1/2017,5,6.4,6.5,8.9
12,2,28/2/2017,9,6.4,6.5,8.9
13,2,28/4/2017,8,6.4,6.5,8.9
;
run;

The programs correctly picks up record=13:

Best,

Shmuel · Posted 05-10-2020 06:17 AM

I suppose you want to select lines per year per farm, so I modified the program:

data have;
	infile datalines dlm="," dsd missover;
	input Obs Firm Date:DDMMYY10. A B C D;
	format date DDMMYY10.;
	yy = year(date);
	datalines;
1,1,30/6/2014,1.6,.,3.6,.
2,1,31/12/2014,1.2,3.1,3.0,5.6
3,1,31/12/2015,.,4,2.3,5.1
4,1,31/12/2016,1.4,2.3,1.9,6.0
5,2,31/12/2014,6.2,5.7,7.2,8.0
6,2,30/12/2015,.,.,7.0,.
7,2,31/12/2015,.,.,7.0,8.2
8,2,31/3/2016,5.9,6.4,6.5,8.9
9,2,28/4/2016,5.9,6.4,6.5,8.9
10,2,30/9/2016,6.5,7.0,7,.
11,2,28/1/2017,5,6.4,6.5,8.9
12,2,28/2/2017,9,6.4,6.5,8.9
13,2,28/4/2017,8,6.4,6.5,8.9
;
run;


data temp;
 set have;
     miss = nmiss(a,b,c,d);
run;
proc sort data=temp; by firm descending yy miss; run;

data want;
 set temp;
  by firm descending yy;
     if first.yy or miss=0;
     drop yy;
run;
proc sort data=want; by firm date; run;

I don't understand why you selected for firm=2 year=2016 - line 10 and not lines 8 and 9.

line 10 has one missing value while lines 8 and 9 are full.

Shmuel · Posted 05-10-2020 06:21 AM

In case of any issue please post the result lined and the expected lines marking the difference and explain why you selected the specific line(s) instead.

sjm · Posted 05-10-2020 09:22 AM

Line 10 was selected because it has the most recent date and it meets the criteria that there is at least more than 1 variable with a nonmissing value. More recent dates are preferred.

Shmuel · Posted 05-10-2020 10:24 AM

Next code results fit your expected:

data have;
	infile datalines dlm="," dsd missover;
	input Obs Firm Date:DDMMYY10. A B C D;
	format date DDMMYY10.;
	yy = year(date);
	datalines;
1,1,30/6/2014,1.6,.,3.6,.
2,1,31/12/2014,1.2,3.1,3.0,5.6
3,1,31/12/2015,.,4,2.3,5.1
4,1,31/12/2016,1.4,2.3,1.9,6.0
5,2,31/12/2014,6.2,5.7,7.2,8.0
6,2,30/12/2015,.,.,7.0,.
7,2,31/12/2015,.,.,7.0,8.2
8,2,31/3/2016,5.9,6.4,6.5,8.9
9,2,28/4/2016,5.9,6.4,6.5,8.9
10,2,30/9/2016,6.5,7.0,7,.
11,2,28/1/2017,5,6.4,6.5,8.9
12,2,28/2/2017,9,6.4,6.5,8.9
13,2,28/4/2017,8,6.4,6.5,8.9
;
run;


data temp1;
 set have;
     miss = nmiss(a,b,c,d);
run;
proc sort data=temp1; by firm descending yy miss; run;

data temp2;
 set temp1;
  by firm descending yy;
     if first.yy and last.yy or miss < 3 
        then flag_sel = 1;
        else flag_sel=0;
run;
proc sort data=temp2; by firm date flag_sel; run;

data want;
 set temp2 (where=(flag_sel=1));
  by firm yy;
     if last.yy;
     drop flag_sel yy;
run;

Registration is open

SAS Training: Just a Click Away