Hi,
I have a dataset with around 520 variable with many missing values. I want to delete variables with >90% missing values (I dont want to impute). Is there any macro or anything else to do it?
I tried ChatGPT, but I got an error that GPT could not solve.
Here is GPTcode:
/* Step 1: Calculate the percentage of missing values for each variable */
ods output onewayfreqs=missing;
proc freq data=dxccsr_transposed;
tables _all_ / missing;
run;
/* Step 2: Identify variables with 90% or more missing values */
data _null_;
set missing end=lastobs;
if _n_ = 1 then call symputx('numvars',_nobs_);
array pctmiss(*) _:;
array vars(*) $ _CHARACTER_;
do i = 1 to dim(pctmiss);
if pctmiss(i) >= 90 then vars(i) = scan(vlabel(pctmiss(i)), 1, ' ');
end;
if lastobs then call symputx('delvars',catx(' ', of vars(*)));
run;
/* Step 3: Define a macro to drop variables */
%macro dropvars;
data dxccsr_transposed(drop=&delvars);
set dxccsr_transposed;
run;
%mend;
/* Step 4: Call the macro to drop variables */
%dropvars;
The log error:
NOTE: Line generated by the macro variable "DELVARS".
1 Table DXCCSR_EAR006 3
-
214
23
ERROR 214-322: Variable name 3 is not valid.
ERROR 23-7: Invalid value for the DROP option.
Thanks
%macro drop_vars(dsn=,pct=);
proc transpose data=&dsn.(obs=0) out=vname;
var _all_;
run;
data _null_;
set vname end=last;
if _n_=1 then call execute('proc sql;create table n_miss as select ');
call execute(catx(' ','nmiss(',_name_,') as ',_name_));
if last then call execute("from &dsn.;quit;");
else call execute(',');
run;
proc transpose data=n_miss out=n_miss2;
var _all_;
run;
%let dsid=%sysfunc(open(&dsn.));
%let nobs=%sysfunc(attrn(&dsid.,nlobs));
%let dsid=%sysfunc(close(&dsid.));
data n_miss3;
set n_miss2;
per_missing=col1/&nobs.;
run;
proc sql noprint;
select _NAME_ into :drops separated by ','
from n_miss3
where per_missing > &pct. ;
alter table &dsn.
drop &drops.;
quit;
%mend;
data have;
set sashelp.heart;
run;
/*
dsn is the dataset you want modify
pct is the percent of missing you want to drop
*/
%drop_vars(dsn=have,pct=0.6)
Names of your original variables? I ask because you are using
array pctmiss(*) _:;
And the ODS output for onewayfreqs will only have variables with names that start with _ for the variables I have created, none of the Proc Freq variables. If that is the case and some of your variables of interest do not start with _ that could be the problem. Or did you mean to use F_: for the formatted variables that Ods output creates?
The proc freq code you show would have the variable PERCENT with values of 90 or greater as your value of interest, not any of the other variables. I don't see where you are checking for any missing values. I am afraid that your code checks to see if the original value was greater than 90, not the percent missing.
This will show the variables that have a value with a percent > 90 but not checking for missing.
data reduce; set missing; length var $ 32; var=scan(table,2); if percent>90; keep var percent; run;
You need a SAS programmer to help you, not a ChatGPT bull*hit, So, good you writing here 🙂
Try this:
data test;
output;
output;
output;
a=4;
output;
b=4;
output;
c=4;
output;
d=4;
output;
e=4;
output;
f=4;
output;
g=4;
output;
h=4;
output;
i=4;
output;
j=4;
output;
k=4;
output;
l=4;
m=4;
n=4;
o=4;
output;
run;
proc print data=test;
run;
data _null_;
if 0 then set test;
array _variables_ _NUMERIC_; /* list your variables here */
length _variableName_ $ 32 _numberOfMissing_ 8;
declare hash _H_(ordered:'a');
_H_.defineKey("_variableName_");
_H_.defineData("_variableName_");
_H_.defineData("_numberOfMissing_");
_H_.DefineDone();
do until(EOF);
set test NOBS=NOBS end=EOF;
do over _variables_;
_variableName_ = vname(_variables_);
_numberOfMissing_ = 0;
_RC_=_H_.find();
_numberOfMissing_ + nmiss(_variables_);
_H_.replace();
end;
end;
_H_.output(dataset:"missing_count");
stop;
run;
proc print data=missing_count;
run;
data missing_count;
if 0 then set test(drop=_ALL_) nobs=_nobs_;
set missing_count;
nobs=_nobs_;
_percentOfMissing_ = _numberOfMissing_/nobs;
format _percentOfMissing_ percent10.2;
run;
proc print data=missing_count;
run;
proc sql noprint;
select _variableName_
into :_variableNameMissing_ separated by " "
from missing_count
where _percentOfMissing_ > .90 /* <------------- 90% */
;
run;
options symbolgen;
data test2;
set test(drop=&_variableNameMissing_.);
run;
Bart
%macro drop_vars(dsn=,pct=);
proc transpose data=&dsn.(obs=0) out=vname;
var _all_;
run;
data _null_;
set vname end=last;
if _n_=1 then call execute('proc sql;create table n_miss as select ');
call execute(catx(' ','nmiss(',_name_,') as ',_name_));
if last then call execute("from &dsn.;quit;");
else call execute(',');
run;
proc transpose data=n_miss out=n_miss2;
var _all_;
run;
%let dsid=%sysfunc(open(&dsn.));
%let nobs=%sysfunc(attrn(&dsid.,nlobs));
%let dsid=%sysfunc(close(&dsid.));
data n_miss3;
set n_miss2;
per_missing=col1/&nobs.;
run;
proc sql noprint;
select _NAME_ into :drops separated by ','
from n_miss3
where per_missing > &pct. ;
alter table &dsn.
drop &drops.;
quit;
%mend;
data have;
set sashelp.heart;
run;
/*
dsn is the dataset you want modify
pct is the percent of missing you want to drop
*/
%drop_vars(dsn=have,pct=0.6)
Great. Thanks for your help.
good point,
I went back & ran a proc contents after the sas import,
and here's the python describe on the dataframe
both have read in 66907 rows and 170 columns.
the problem seems to be at the point of dropping columns with 100% missing values.
OK. Here is another way to check if a variable is all missing.
data have;
set sashelp.heart;
call missing(sex,weight);
run;
ods select none;
ods output nlevels=want(where=(NNonMissLevels=0));
proc freq data=have nlevels;
table _all_;
run;
ods select all;
You can compare it with my PROC SQL.
BTW, my code drop both character and numeric type variables , I am not sure your Python code would drop both of them.
I am unable to test it due to not access to that csv .
and also you could check the missing value in both sas and python to see if there are some difference.
As an aside, because I already see a couple answers posted...
That code that ChatGPT spit out is really horrific. You can imagine what it was trying to do (or I guess what it was predicting should be done), but it seems to me as so far off that it's not helpful. I wonder if ChatGPT is significantly better at predicting code for other languages, where it might have had more training data. I've seen plenty of programming folks saying they could use ChatGPT as a first draft of code, or to generate code for the basic/boring stuff. But I'm not at all convinced that the SAS code it can generate is helpful. Interesting, maybe, but I'm not sure it's helpful.
@Quentin wrote:
As an aside, because I already see a couple answers posted...
That code that ChatGPT spit out is really horrific. You can imagine what it was trying to do (or I guess what it was predicting should be done), but it seems to me as so far off that it's not helpful. I wonder if ChatGPT is significantly better at predicting code for other languages, where it might have had more training data. I've seen plenty of programming folks saying they could use ChatGPT as a first draft of code, or to generate code for the basic/boring stuff. But I'm not at all convinced that the SAS code it can generate is helpful. Interesting, maybe, but I'm not sure it's helpful.
Considering that SAS Proc Import for text files, which knows what you are attempting to do and does a fair job, has recurring issues with making many id type variables numeric when likely not the best choice I sort of cringe at what Chatgpt might suggest for some of the files I read.
Of course we have absolutely no idea what @lansoprazole asked ChatGPT to do in the first place. Considering the percentage of poorly formed questions we see on this forum I would be surprised if the question was phrased well to begin with and then the bot used the very few parts that it understood, >90 for example, to build from.
Not to mention the likelihood of it coming up with use of a custom informat to provide data validations on reading is extremely small.
Apparently I wrote a macro to do this a few years ago...
https://gist.github.com/statgeek/feedc3fc520cb0d2018ca2a8cab241d8
%macro drop_missing_pct(input_dsn = , output_dsn=, pct = , id_vars=);
*input_dsn = input data set name;
*output_dsn = output data set name;
*pct = missing percent, variables with a percentage of missing above this value are dropped;
*id_vars = space delimited list of variables that you do not want to include in the analysis such as ID variables;
*create format for missing;
proc format;
value $ missfmt ' '="Missing" other="Not Missing";
value nmissfmt .="Missing" other="Not Missing";
run;
*Proc freq to count missing/non missing;
ods select none;
*turns off the output so the results do not get too messy;
ods table onewayfreqs=temp;
proc freq data=&INPUT_DSN. (drop = &ID_Vars);
table _all_ / missing;
format _numeric_ nmissfmt. _character_ $missfmt.;
run;
ods select all;
*Format and organize output;
data long;
length variable $32. variable_value $50.;
set temp;
Variable=scan(table, 2);
Variable_Value=strip(trim(vvaluex(variable)));
presentation=catt(frequency, " (", trim(put(percent/100, percent7.1)), ")");
keep variable variable_value frequency percent cum: presentation;
label variable='Variable' variable_value='Variable Value';
run;
*not required for display purposes;
proc sort data=long;
by variable;
run;
*select variables more than x% missing;
proc sql noprint;
select variable into :drop_var_list separated by " "
from long where variable_value = 'Missing' and percent > &pct;
quit;
*Drop variables;
data &output_dsn;
set &input_dsn;
drop &drop_var_list;
run;
*clean up;
*uncomment after testing;
/* proc sql; */
/* drop table long; */
/* drop table temp; */
/* quit; */
%mend;
***************************************************************************************************
*Example Usage
***************************************************************************************************;
data class;
set sashelp.class;
if age=14 then
call missing(height, weight, sex);
if name='Alfred' then
call missing(sex, age, height);
label age="Fancy Age Label";
run;
%drop_missing_pct(input_dsn = class, output_dsn = want, pct = 20, id_vars = Name);
*check output;
proc contents data=want;
run;
Registration is now open for SAS Innovate 2025 , our biggest and most exciting global event of the year! Join us in Orlando, FL, May 6-9.
Sign up by Dec. 31 to get the 2024 rate of just $495.
Register now!
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.