Solved: Re: Calculate percent missing in long dataset

rfarmenta · Posted 09-28-2018 02:21 PM

I have a long dataset with HR data and time in seconds. Some participants have up to 8000 observations of heart rate. At times, the HR did not get collected because there was something wrong with the monitor. I need to identify how much missing data each participant has and if a participant has more than 5% missing heart rate values across time, I will exclude them. I know how to find missing data overall, but I am not sure sure how to calculate the % missing for a long data set by participant. Attached is data for 2 people that gives the structure of my data set. Any suggestion/ help would be greatly appreciated.

Reeza · Posted 09-28-2018 03:07 PM

*generate counts of N/NMISS;
proc means data=have stackods n nmiss NWAY;
class subjid;
var hr;
ods output summary=summary_stats;
run;

*calculate percent missing;
data percent_missing;
set summary_stats;
length exclude_flag $4.;
Pct_Missing = nmiss/sum(n, nmiss);
if pct_missing>0.05 then exclude_flag='Yes';
else exclude_flag='No';
run; 

*filtered original input removing those will more than 5% missing;
proc sql;
create table filtered as
select *
from have where subjid not in (select subjid from percent_missing where exclude_flag = 'Yes');
quit;

@rfarmenta wrote:

I have a long dataset with HR data and time in seconds. Some participants have up to 8000 observations of heart rate. At times, the HR did not get collected because there was something wrong with the monitor. I need to identify how much missing data each participant has and if a participant has more than 5% missing heart rate values across time, I will exclude them. I know how to find missing data overall, but I am not sure sure how to calculate the % missing for a long data set by participant. Attached is data for 2 people that gives the structure of my data set. Any suggestion/ help would be greatly appreciated.

EDIT: corrected if percent>5 to if percent>0.05 since percents are between 0 and 1 with this calculation.

View solution in original post

Reeza · Posted 09-28-2018 02:39 PM

PROC FREQ with the MISSING option.

*create sample data to work with;

data class;
	set sashelp.class;

	if age=14 then
		call missing(height, weight, sex);

	if name='Alfred' then
		call missing(sex, age, height);
	label age="Fancy Age Label";
run;

*create format for missing;

proc format ;
	value $ missfmt ' '="Missing" other="Not Missing";
	value nmissfmt .="Missing" other="Not Missing";
run;

*Proc freq to count missing/non missing;
ods table onewayfreqs=temp;

proc freq data=class;
	table _all_ / missing;
	format _numeric_ nmissfmt. _character_ $missfmt.;
run;

*Format output;

data long;
	length variable $32. variable_value $50.;
	set temp;
	Variable=scan(table, 2);
	Variable_Value=strip(trim(vvaluex(variable)));
	presentation=catt(frequency, " (", trim(put(percent/100, percent7.1)), ")");
	keep variable variable_value frequency percent cum: presentation;
	label variable='Variable' variable_value='Variable Value';
run;

proc sort data=long;
	by variable;
run;

proc transpose data=long out=want;
	by variable;
	id variable_value;
	var percent;
run;

AMSAS · Posted 09-28-2018 02:49 PM

Here's a way to do it with SQL, you could probably do it in fewer steps, but I prefer to break things up to keep them simple.

/* Create example data */
data source ;
	do study_id = 1 to 5 ;
		do time = 1 to 100 ;
			if ranuni(2)>0.05 then
				hr=int(ranuni(0)*10)+90 ;
			else
				hr=. ;
			output ;
		end ;
	end ;
run ;

proc sql ;
	/* Count the missing records by study_id */
	create table missing as
	select study_id, count(*) as missingCnt 
	from source
	where
		hr=.
	group by study_id 
	;
	/* Count all the records by study_id */
	create table all as
	select study_id, count(*) as total
	from source
	group by study_id 
	;
	/* join the 2 tables and calculate percentage by study_id */
	create table percent as 
	select 
		a.study_id,
		m.missingCnt,
		a.total,
		m.missingCnt/a.total as percent
	from
		missing as m, all as a
	where
		m.study_id=a.study_id 
	;
	/* join the percent table to the source data where the percent is > 0.05 (5%) */
	create table results as
	select 
		s.*,
		p.percent
	from
		source as s,
		percent as p
	where
		s.study_id=p.study_id and
		p.percent<0.05 
	;

	/* Check the results */
	proc sql ;
		select 
			unique(study_id)
		from
			results ;


quit ;

Reeza · Posted 09-28-2018 02:45 PM

Actually since you only have one variable proc means is easier. Use it to get NMISS and N (with STACKODS) and then do the calculation in a data step.

novinosrin · Posted 09-28-2018 02:54 PM

While you have got workable solutions, would be nice to include the expected output for the sample you provided

Reeza · Posted 09-28-2018 03:07 PM

*generate counts of N/NMISS;
proc means data=have stackods n nmiss NWAY;
class subjid;
var hr;
ods output summary=summary_stats;
run;

*calculate percent missing;
data percent_missing;
set summary_stats;
length exclude_flag $4.;
Pct_Missing = nmiss/sum(n, nmiss);
if pct_missing>0.05 then exclude_flag='Yes';
else exclude_flag='No';
run; 

*filtered original input removing those will more than 5% missing;
proc sql;
create table filtered as
select *
from have where subjid not in (select subjid from percent_missing where exclude_flag = 'Yes');
quit;

@rfarmenta wrote:

I have a long dataset with HR data and time in seconds. Some participants have up to 8000 observations of heart rate. At times, the HR did not get collected because there was something wrong with the monitor. I need to identify how much missing data each participant has and if a participant has more than 5% missing heart rate values across time, I will exclude them. I know how to find missing data overall, but I am not sure sure how to calculate the % missing for a long data set by participant. Attached is data for 2 people that gives the structure of my data set. Any suggestion/ help would be greatly appreciated.

EDIT: corrected if percent>5 to if percent>0.05 since percents are between 0 and 1 with this calculation.

rfarmenta · Posted 09-28-2018 05:32 PM

This solution worked very well. Thank you!

Classroom Training Available!