BookmarkSubscribeRSS Feed
ldelly622
Calcite | Level 5

Hello! I am running a propensity score match and would like some help. My variables are multi-leveled as described below:

 

age_cat: less than 45, 45-54, 55-64, 65+
region: NE, MW, S, W, Unknown
gendercode: M, F
raceethnicitycode: W, A, B, O
cci_cat4: miss, 0, 1, 2, 3plus

 

I am using the code below to run PROC PSMATCH and then to calculate the SMDs based on the proportions. Is this a proper way to do so, or should I be using binomial (binary indicator) variables? If I should, do I need a binary variable for every level of each multi-level variable? And how do I know whether I want to do an exact match, and on which variables?

code:

/* 1:1 greedy nearest-neighbour propensity score match.
   Fits a logistic propensity model for treatment (treated level = '1')
   on the five categorical covariates, matches on the logit of the
   propensity score within a caliper of 0.2, and writes the scores,
   match weight and match id to matched_scsdx. */
proc psmatch data=redacted;
    /* Categorical covariates (the treatment variable must be listed too) */
    class treatment age_cat region gendercode raceethnicitycode cci_cat4;

    /* PS model: include the variables you want balanced */
    psmodel treatment(treated='1') =
        age_cat
        region
        gendercode
        raceethnicitycode
        cci_cat4
    ;

    /* Matching options */
    match
        method   = greedy(k=1)  /* 1:1 nearest neighbor */
        distance = lps          /* caliper applies on logit(PS) */
        caliper  = 0.2
    ;

    /* Version-friendly balance request */
    assess;

    /* Output diagnostics.
       The matching weight is requested with MATCHWGT=; WEIGHT= is not an
       OUTPUT keyword, so the original statement would not have created
       _matchwgt_, which every later step filters and weights on. */
    output out=matched_scsdx
        ps       = _ps_
        lps      = _lps_
        matchwgt = _matchwgt_
        matchid  = _matchid_;
run;  /* PSMATCH is a run-group step; it is ended with RUN, not QUIT */

 

/* Keep only the observations that received a positive match weight. */
data matched_only;
    set matched_scsdx;
    /* a missing weight compares as not greater than 0, so it is dropped too */
    if not (_matchwgt_ > 0) then delete;
run;

 

/* ---------------------------------------------------------------
Utility: guarantee that a numeric weight column exists
  in=     data set to read
  weight= source weight variable; leave blank for unit weights
  out=    data set to write (all input columns plus the weight)
  wname=  name of the weight column to create (default _w_)
With a blank weight= every row gets 1; otherwise the source weight
is copied and a missing value is replaced by 0.
--------------------------------------------------------------- */
%macro _ensure_weight(in=, weight=, out=, wname=_w_);
    %local _has_wt;
    %let _has_wt = %eval(%length(&weight.) > 0);

    data &out.;
        set &in.;
        %if &_has_wt. %then %do;
            if missing(&weight.) then &wname. = 0;
            else &wname. = &weight.;
        %end;
        %else %do;
            &wname. = 1;
        %end;
    run;
%mend;


/* -----------------------------------------------------
CATEGORICAL balance (multi-level) with per-level SMD
Produces per-level rows and a MAX_ABS_SMD summary row

Parameters
  data=          input data set
  treat=         treatment variable, compared numerically with
                 1 (treated) and 0 (control)
  weight=        optional weight variable; blank = weight 1 per row
  catvars=       blank-separated list of categorical variables
  matched_only=  Y keeps only rows with _matchwgt_ > 0 (that column
                 name is fixed and does not follow weight=)
  out=           result: varname, level, pt, pc, smd, abs_smd

For each level, pt and pc are the weighted shares of treated and
control rows that fall in the level, and
  smd = (pt - pc) / sqrt( (pt*(1-pt) + pc*(1-pc)) / 2 )
which is the same average-of-group-variances denominator that
balance_continuous uses.

Rows whose categorical value is missing are not counted in any
level (PROC SUMMARY drops missing CLASS values) but still
contribute to the arm totals.

Requires the _ensure_weight macro. All work tables are named
_catbase__* and are deleted before the macro returns.
----------------------------------------------------- */
%macro balance_categorical(data=, treat=treatment, weight=, catvars=, matched_only=N, out=cat_balance);
    /* _i and _var are declared local so the loop cannot overwrite
       same-named macro variables belonging to the caller */
    %local _base _w_ _i _var;
    %let _base = _catbase_;
    %let _w_ = _w_;

    /* Subset to matched-only if requested */
    %if %upcase(&matched_only.)=Y %then %do;
        data &_base._sub;
            set &data.;
            if _matchwgt_ > 0;
        run;
    %end;
    %else %do;
        data &_base._sub; set &data.; run;
    %end;

    /* Ensure weight */
    %_ensure_weight(in=&_base._sub, weight=&weight., out=&_base._w, wname=&_w_.);

    /* Loop over categorical variables */
    %let _i=1;
    %let _var=%scan(&catvars., &_i., %str( ));
    %do %while(%length(&_var.)>0);

        /* Weighted counts by level and treatment */
        proc summary data=&_base._w nway;
            class &treat. &_var.;
            var &_w_.;
            output out=&_base._cnt_&_i. (drop=_type_ _freq_) sum=wt_sum;
        run;

        /* Attach the formatted level text and the variable name */
        data &_base._props_&_i.;
            set &_base._cnt_&_i.;
            length level $200 varname $64;
            level = vvalue(&_var.);
            varname = "&_var.";
        run;

        /* One row per level. The arm totals nt and nc come from a
           single-row subquery over the whole weighted table, so they
           are present on every level row. A level that never occurs
           in one arm therefore gets weight 0 (proportion 0) in that
           arm instead of a missing proportion and a missing SMD. */
        proc sql noprint;
            create table &_base._wide_&_i. as
            select l.varname, l.level,
                   coalesce(l.wt_t, 0) as wt_t, d.nt,
                   coalesce(l.wt_c, 0) as wt_c, d.nc
            from (select varname, level,
                         sum(case when &treat.=1 then wt_sum end) as wt_t,
                         sum(case when &treat.=0 then wt_sum end) as wt_c
                  from &_base._props_&_i.
                  group by varname, level) as l,
                 (select sum(case when &treat.=1 then &_w_. end) as nt,
                         sum(case when &treat.=0 then &_w_. end) as nc
                  from &_base._w) as d
            order by l.level;
        quit;

        data &_base._smd_&_i.;
            set &_base._wide_&_i.;
            if nt>0 then pt=wt_t/nt; else pt=.;
            if nc>0 then pc=wt_c/nc; else pc=.;
            if n(pt,pc)=2 then do;
                /* average of the two binomial variances */
                _v=(pt*(1-pt) + pc*(1-pc))/2;
                if _v>0 then smd=(pt-pc)/sqrt(_v);
                else if pt=pc then smd=0;   /* identical degenerate shares */
                else smd=.;                 /* shares of 0 and 1: undefined */
            end;
            else smd=.;
            abs_smd=abs(smd);
            keep varname level pt pc smd abs_smd;
        run;

        /* Max |SMD| summary row */
        proc sql noprint;
            create table &_base._max_&_i. as
            select varname, 'MAX_ABS_SMD' as level length=12,
                   . as pt, . as pc, . as smd, max(abs_smd) as abs_smd
            from &_base._smd_&_i.
            group by varname;
        quit;

        data &_base._out_&_i.; set &_base._smd_&_i. &_base._max_&_i.; run;

        %let _i=%eval(&_i.+1);
        %let _var=%scan(&catvars., &_i., %str( ));
    %end;

    /* Stack the per-variable results, or emit an empty shell when
       catvars= was blank */
    %if %eval(&_i.-1) > 0 %then %do;
        data &out.; set &_base._out_:; run;
    %end;
    %else %do;
        data &out.; length varname $64 level $200 pt pc smd abs_smd 8; stop; run;
    %end;

    proc datasets library=work nolist;
        delete &_base._: ;
    quit;
%mend;

 

/* ----------------------------------------------
CONTINUOUS balance: SMD using means/pooled SD

Parameters
  data=          input data set
  treat=         treatment variable, compared numerically with
                 1 (treated) and 0 (control)
  weight=        optional weight variable; blank = weight 1 per row
  contvars=      blank-separated list of continuous variables
  matched_only=  Y keeps only rows with _matchwgt_ > 0 (that column
                 name is fixed and does not follow weight=)
  out=           result: varname, level (blank), mean1, mean0,
                 smd, abs_smd

  smd = (mean1 - mean0) / sqrt((var1 + var0) / 2)

Requires the _ensure_weight macro. All work tables are named
_contbase__* and are deleted before the macro returns.
---------------------------------------------- */
%macro balance_continuous(data=, treat=treatment, weight=, contvars=, matched_only=N, out=cont_balance);
    /* _i and _var are declared local so the loop cannot overwrite
       same-named macro variables belonging to the caller */
    %local _base _w_ _i _var;
    %let _base = _contbase_;
    %let _w_ = _w_;

    %if %upcase(&matched_only.)=Y %then %do;
        data &_base._sub;
            set &data.;
            if _matchwgt_ > 0;
        run;
    %end;
    %else %do;
        data &_base._sub; set &data.; run;
    %end;

    %_ensure_weight(in=&_base._sub, weight=&weight., out=&_base._w, wname=&_w_.);

    %let _i=1;
    %let _var=%scan(&contvars., &_i., %str( ));
    %do %while(%length(&_var.)>0);

        /* NOPRINT: only the output data set is wanted, not one
           printed table per variable.
           EXCLNPWGT: rows with a zero weight (including the missing
           weights that _ensure_weight recodes to 0) are left out, so
           they do not inflate the n used in the variance divisor. */
        proc means data=&_base._w nway noprint exclnpwgt;
            class &treat.;
            var &_var.;
            weight &_w_.;
            output out=&_base._stats_&_i. (drop=_type_ _freq_) mean=mean var=var;
        run;

        /* Pivot the two treatment rows into one row */
        proc sql noprint;
            create table &_base._smd_&_i. as
            select "&_var." as varname length=64,
                   max(case when &treat.=1 then mean end) as mean1,
                   max(case when &treat.=0 then mean end) as mean0,
                   max(case when &treat.=1 then var end) as var1,
                   max(case when &treat.=0 then var end) as var0
            from &_base._stats_&_i.;
        quit;

        data &_base._smd_&_i.;
            set &_base._smd_&_i.;
            length level $200;
            level = ''; /* no levels for continuous */
            /* both variances are required; with only one present the
               sum is missing and must not be passed to sqrt() */
            if n(var1,var0)=2 then do;
                denom=sqrt((var1+var0)/2);
                if denom>0 then smd=(mean1-mean0)/denom;
                else smd=.;
            end;
            else smd=.;
            abs_smd=abs(smd);
            keep varname level mean1 mean0 smd abs_smd;
        run;

        %let _i=%eval(&_i.+1);
        %let _var=%scan(&contvars., &_i., %str( ));
    %end;

    /* Stack the per-variable results, or emit an empty shell when
       contvars= was blank */
    %if %eval(&_i.-1) > 0 %then %do;
        data &out.; set &_base._smd_:; run;
    %end;
    %else %do;
        data &out.; length varname $64 level $200 mean1 mean0 smd abs_smd 8; stop; run;
    %end;

    proc datasets library=work nolist;
        delete &_base._: ;
    quit;
%mend;


/* ---------------------------------------------------
WRAPPER: one combined balance table per call
  inputdata=     data set to assess (pre- or post-match)
  treat=         treatment variable (default: treatment)
  weight=        blank for PRE; _matchwgt_ for POST
  matched_only=  Y restricts the assessment to matched rows
  catvars=       categorical variables, blank-separated
  contvars=      continuous variables, blank-separated
  out=           combined output table
Runs the categorical and the continuous balance macros, stacks
their results with a Category label, then removes the two
intermediate tables.
--------------------------------------------------- */
%macro create_balance(inputdata=,
treat=treatment,
weight=,
matched_only=N,
catvars=,
contvars=,
out=balance_out);

    /* categorical covariates -> _cat_bal_ */
    %balance_categorical(
        data=&inputdata.,
        treat=&treat.,
        weight=&weight.,
        catvars=&catvars.,
        matched_only=&matched_only.,
        out=_cat_bal_);

    /* continuous covariates -> _cont_bal_ */
    %balance_continuous(
        data=&inputdata.,
        treat=&treat.,
        weight=&weight.,
        contvars=&contvars.,
        matched_only=&matched_only.,
        out=_cont_bal_);

    /* stack both tables and tag each row with its origin */
    data &out.;
        length Category $20;
        set _cat_bal_(in=from_cat) _cont_bal_(in=from_cont);
        select;
            when (from_cat)  Category='Categorical';
            when (from_cont) Category='Continuous';
            otherwise;
        end;
    run;

    proc datasets library=work nolist;
        delete _cat_bal_ _cont_bal_;
    quit;
%mend;


/* PRE-match: unweighted */
%create_balance(
inputdata = combined_demog_cci_final_2_NOU,
treat = treatment,
weight = , /* none */
matched_only= N,
catvars = age_cat region gendercode raceethnicitycode cci_cat4,
contvars = ageatendyear CCI,
out = balance_pre
);

/* Print summary: per-level and max |SMD| rows included */
title "PRE-match Balance (SMDs)";
proc print data=balance_pre noobs;
var Category varname level pt pc mean1 mean0 smd abs_smd;
format pt pc percent8.2 smd abs_smd 6.3;
run;


/* POST-match: weighted & restricted to matched-only */
%create_balance(
inputdata = matched_only,
treat = treatment,
weight = _matchwgt_,
matched_only= Y,
catvars = age_cat region gendercode raceethnicitycode cci_cat4,
contvars = ageatendyear CCI,
out = balance_post
);

/* The post-match table was built but never displayed; print it in the
   same layout as the pre-match table so the two can be compared. */
title "POST-match Balance (SMDs)";
proc print data=balance_post noobs;
var Category varname level pt pc mean1 mean0 smd abs_smd;
format pt pc percent8.2 smd abs_smd 6.3;
run;

/* Clear the title so it does not carry over to later output */
title;

 

1 REPLY 1
quickbluefish
Barite | Level 11

I have to assume this giant pile of code was generated by an over-eager LLM. I would really suggest that you first ask it to give you something *very bare-bones* (a "one-off") and to explain what it did. Then, *as needed*, add on stuff bit by bit with the LLM's help.

 

Re: a couple of your questions, exact matching is up to your own judgement - basically, it's for, at most, a few key variables that are too sharply delineating to leave up to the black box of the propensity score to take care of.  For instance, if your study was on the effectiveness of treatment A vs. treatment B on breast cancer survival, you might think that menopause was such a strong confounder that, instead of just putting menopausal status in the propensity model, you actually matched pre-menopausal women on treatment A only to other pre-menopausal women on treatment B, and similarly for post-menopausal women.  Or if you were looking at the effectiveness of using some new bike helmet technology vs. standard bike helmets in reducing brain injuries, you might do exact matching on rurality, on the notion that you expect bike accidents in rural areas to carry fundamentally different risks than in dense urban areas.

What is ANOVA?

ANOVA, or Analysis Of Variance, is used to compare the averages or means of two or more populations to better understand how they differ. Watch this tutorial for more.

Find more tutorials on the SAS Users YouTube channel.

Discussion stats
  • 1 reply
  • 113 views
  • 0 likes
  • 2 in conversation