Solved: Fill missing values with other values depending on frequency

YellowBanana · Posted 10-06-2018 03:36 AM

Hello,

I have a dataset that looks like this:

APP_ID	QUARTER
APP1	2019Q1
APP2	2019Q1
APP3	2019Q1
APP4	2019Q1
APP5	2019Q1
APP6	2019Q1
APP7	2019Q1
APP8	2019Q1
APP9	2019Q1
APP10	2019Q1
APP11	2019Q2
APP12	2019Q2
APP13	2019Q2
APP14	2019Q2
APP15	2019Q2
APP16	2019Q2
APP17	2019Q2
APP18	2019Q2
APP19	2019Q2
APP20	2019Q2
APP21	2019Q2
APP22	2019Q2
APP23	2019Q2
APP24	2019Q2
APP25	2019Q2
APP26	2019Q2
APP27	2019Q2
APP28	2019Q3
APP29	2019Q3
APP30	2019Q3
APP31	2019Q4
APP32	2019Q4
APP33
APP34
APP35
APP36
APP37
APP38
APP39
APP40
APP41
APP42
APP43
APP44
APP45
APP46
APP47
APP48

And the distribution of frequencies:

QUARTER	COUNT	FREQ
2019Q1	10	20,8%
2019Q2	17	35,4%
2019Q3	3	6,3%
2019Q4	2	4,2%
	16	33,3%

I need to fill missing values with other quarters in such way that distribution of frequencies would be almost uniform (all values occur with almost the same frequency as much as possible).

The number of application can change. As time goes in year, as the number can increase. Now there may appear applications with quarters 2019Q2 or 2019Q3 or missing values, but for example in december they can be application with 2019Q3 or 2019Q4 or missing values.

Missing values can only be filled with quarters from the next year.

The non-missing values cannot be changed.

At this moment it could look like this:

APP_ID	QUARTER
APP1	2019Q1
APP2	2019Q1
APP3	2019Q1
APP4	2019Q1
APP5	2019Q1
APP6	2019Q1
APP7	2019Q1
APP8	2019Q1
APP9	2019Q1
APP10	2019Q1
APP11	2019Q2
APP12	2019Q2
APP13	2019Q2
APP14	2019Q2
APP15	2019Q2
APP16	2019Q2
APP17	2019Q2
APP18	2019Q2
APP19	2019Q2
APP20	2019Q2
APP21	2019Q2
APP22	2019Q2
APP23	2019Q2
APP24	2019Q2
APP25	2019Q2
APP26	2019Q2
APP27	2019Q2
APP28	2019Q3
APP29	2019Q3
APP30	2019Q3
APP31	2019Q4
APP32	2019Q4
APP33	2019Q3
APP34	2019Q3
APP35	2019Q3
APP36	2019Q3
APP37	2019Q3
APP38	2019Q3
APP39	2019Q3
APP40	2019Q3
APP41	2019Q4
APP42	2019Q4
APP43	2019Q4
APP44	2019Q4
APP45	2019Q4
APP46	2019Q4
APP47	2019Q4
APP48	2019Q4

And then distribution of frequencies may be:

QUARTER	COUNT	FREQ
2019Q1	10	20,8%
2019Q2	17	35,4%
2019Q3	11	22,9%
2019Q4	10	20,8%
	0	0,0%

I will be thankful for any suggestions.

P.S. Sorry for my english.

JohnHoughton · Posted 10-06-2018 02:03 PM

Hi. There was a flaw in the code of my previous post, but here is a fix. Clumsy in places but it works.

data have;
	input app_id  $8. quarter $6.;
	datalines;
APP1    2019Q1
APP2    2019Q1
APP3    2019Q1
APP4    2019Q1
APP5    2019Q1
APP6    2019Q1
APP7    2019Q1
APP8    2019Q1
APP9    2019Q1
APP10   2019Q1
APP11   2019Q2
APP12   2019Q2
APP13   2019Q2
APP14   2019Q2
APP15   2019Q2
APP16   2019Q2
APP17   2019Q2
APP18   2019Q2
APP19   2019Q2
APP20   2019Q2
APP21   2019Q2
APP22   2019Q2
APP23   2019Q2
APP24   2019Q2
APP25   2019Q2
APP26   2019Q2
APP27   2019Q2
APP28   2019Q3
APP29   2019Q3
APP30   2019Q3
APP31   2019Q4
APP32   2019Q4
APP33   
APP34   
APP35    
APP36    
APP37    
APP38    
APP39    
APP40    
APP41    
APP42    
APP43    
APP44    
APP45    
APP46    
APP47    
APP48    
;
run;

proc sql noprint;
	select count(distinct quarter), count(*) /count(distinct quarter) into :Nqtrs,:target from have;
quit;

proc sql;
	create table temp_a as 
	select quarter, count(*) as N
	from have 
	group by quarter
	having count(*)<=&target or quarter='';
	
	create table temp_b as 
	select quarter, 
		(select sum(N) from temp_a)/(select count (distinct quarter) from temp_a where quarter^='')-N as need , 
		(select N from temp_A where quarter='') as available 
		from temp_a 
		where quarter ^='';
quit;


data temp_c;set temp_b;
retain cumneed;
if _N_=1 then do; 
	cumneed=need; 
	need2=floor(need);
	end;
else do;
	cumneed=need+cumneed;
end;
cumneed2=floor(cumneed);
 need2=cumneed2-lag(cumneed2);
 if need2=. then need2=floor(need);
run;



data temp_d (keep=quarter);
	set temp_c end=eof;
	retain left ;
	if _N_=1 then left=available;
	do i=1 to need2;
		left=left-1;
		output;
	end;
run;

data temp_e;
	set have (where=(quarter=''));
	set temp_d;
run;

data want;
	set have (where=(quarter^='')) temp_e;
run;

*As a check , compare count from the have and want datasets;
proc sql;
	select a.*, b.want from (select h.quarter, count(*) as have from have h group 
		by h.quarter) a left join 
(select w.quarter, count(*) as want from want w group by w.quarter) b on 
		a.quarter=b.quarter;
quit;

View solution in original post

Ksharp · Posted 10-06-2018 06:41 AM

You should post it at Operation Research Forum.and calling @RobPratt

JohnHoughton · Posted 10-06-2018 10:49 AM

Hi. I've tested this with a few scenarios and it seems to give what you want.

I've edited to remove this code. Updated version is in another post

(Edit - this had errors so I removed the code)

JohnHoughton · Posted 10-06-2018 02:03 PM

Hi. There was a flaw in the code of my previous post, but here is a fix. Clumsy in places but it works.

data have;
	input app_id  $8. quarter $6.;
	datalines;
APP1    2019Q1
APP2    2019Q1
APP3    2019Q1
APP4    2019Q1
APP5    2019Q1
APP6    2019Q1
APP7    2019Q1
APP8    2019Q1
APP9    2019Q1
APP10   2019Q1
APP11   2019Q2
APP12   2019Q2
APP13   2019Q2
APP14   2019Q2
APP15   2019Q2
APP16   2019Q2
APP17   2019Q2
APP18   2019Q2
APP19   2019Q2
APP20   2019Q2
APP21   2019Q2
APP22   2019Q2
APP23   2019Q2
APP24   2019Q2
APP25   2019Q2
APP26   2019Q2
APP27   2019Q2
APP28   2019Q3
APP29   2019Q3
APP30   2019Q3
APP31   2019Q4
APP32   2019Q4
APP33   
APP34   
APP35    
APP36    
APP37    
APP38    
APP39    
APP40    
APP41    
APP42    
APP43    
APP44    
APP45    
APP46    
APP47    
APP48    
;
run;

proc sql noprint;
	select count(distinct quarter), count(*) /count(distinct quarter) into :Nqtrs,:target from have;
quit;

proc sql;
	create table temp_a as 
	select quarter, count(*) as N
	from have 
	group by quarter
	having count(*)<=&target or quarter='';
	
	create table temp_b as 
	select quarter, 
		(select sum(N) from temp_a)/(select count (distinct quarter) from temp_a where quarter^='')-N as need , 
		(select N from temp_A where quarter='') as available 
		from temp_a 
		where quarter ^='';
quit;


data temp_c;set temp_b;
retain cumneed;
if _N_=1 then do; 
	cumneed=need; 
	need2=floor(need);
	end;
else do;
	cumneed=need+cumneed;
end;
cumneed2=floor(cumneed);
 need2=cumneed2-lag(cumneed2);
 if need2=. then need2=floor(need);
run;



data temp_d (keep=quarter);
	set temp_c end=eof;
	retain left ;
	if _N_=1 then left=available;
	do i=1 to need2;
		left=left-1;
		output;
	end;
run;

data temp_e;
	set have (where=(quarter=''));
	set temp_d;
run;

data want;
	set have (where=(quarter^='')) temp_e;
run;

*As a check , compare count from the have and want datasets;
proc sql;
	select a.*, b.want from (select h.quarter, count(*) as have from have h group 
		by h.quarter) a left join 
(select w.quarter, count(*) as want from want w group by w.quarter) b on 
		a.quarter=b.quarter;
quit;

YellowBanana · Posted 10-08-2018 12:16 PM

It works. Thanks a lot for help! 🙂

Fill missing values with other values depending on frequency

Re: Fill missing values with other values depending on frequency

Re: Fill missing values with other values depending on frequency

Re: Fill missing values with other values depending on frequency

Re: Fill missing values with other values depending on frequency

Re: Fill missing values with other values depending on frequency

Fill missing values with other values depending on frequency

Re: Fill missing values with other values depending on frequency

Re: Fill missing values with other values depending on frequency

Re: Fill missing values with other values depending on frequency

Re: Fill missing values with other values depending on frequency

Re: Fill missing values with other values depending on frequency

Click image to register for webinar

Classroom Training Available!