Match Pair Code Optimization : proc modeclus

superking · Posted 06-21-2022 03:23 PM

Below is proc modeclus match pair that I'm running.

I'm asking for help to help me optimize the following: % of control file and # of duplicated controls.

Test is 7K

Control is 12MM, if I select 1% of control, it would be 120K

The test is being matched to look for a matched pair / twin in the control files.
While the match rate is 100%, meaning I'm able to find a match paired control for every single test account.

However the problem that I'm getting is that 22% of the the match paired controls (matched to test) I found are being used as match paired controls for more than 1 test.

if I increase the control from 1% to 10%, it would reduce that 22% to much lower %.

Is there a more optimal way to help me to determine the % of control I should use or less of duplicated account in the match pair control accounts .

/***** Define macros - Run locally and remotely *****/
%let ym=202108; 

%let population = work.base_&ym.; /*input dataset name */
%let output = work.base_matched_pairs_&ym.; /*output dataset name */
%let id = account_id; /* account identifier */
%let class = segment ; /* list of categorical variables we wish to control for */
%let vars = 
var1
var2
var3
var4
var5
var6
var7
var8
var9
var10
var11
var12
var13
; /* list of numeric variables we wish to control for */
%let k = 1; /* number of "control" accounts we wish to find for each treatment */
%let j = 9; /* number of matches to find before saying an account has no good matches */





/******************************************************** Run matching code - No edits needed below this line **************************************************************/

/* Create negative account id for treatment accounts as a way to identify them */
/*rsubmit;*/
data population2;
set &population;
if treatment=1 then &id=-1*&id;
format &vars best8.;
run;



/* Standardize numeric variables */
proc standard data = population2 mean =0 std=1 out=z_data replace;
var &vars;
run;

proc sort data=z_data; by &class; run;



/* Run matching algorithm */
%let dk=%sysfunc(sum(&k,&j));
ods _all_ close;


proc modeclus data=z_data dk=&dk  neighbor;
by &class;
var &vars;
id &id;
ods output neighbor=matches;
run;



/* Format output to define a key between treatment and control accounts */
data matches2 (
/*where=(treatment_id>0) */
keep=treatment_id control_id distance match_rank);  /* Only keep accounts where treatment=1 from original population */
set matches;
/*if input(nbor,best16.)>0; */                                                     /* Do not use any treatment accounts as the control for another treatment account */
control_id = input(nbor,best16.);
if ^missing(id) then do;
	 treatment_id = -1*input(id,best16.);
     match_rank = 1;
	 output; 
end;
else do; 
	match_rank+1;
	treatment_id=lag_id;
	output;
end;
lag_id=treatment_id;
retain lag_id;
run;

proc sql;
create table matches3 as
select a.*,b.control_id from
(select distinct
treatment_id,
min(case when control_id>0 then match_rank else 99 end) as match_rank
from matches2 a
where treatment_id>0 
group by 1) a inner join
(select distinct
treatment_id,
case when control_id>0 then control_id else -1*control_id end as control_id,
case when control_id>0 then match_rank else 99 end as match_rank
from matches2) b on a.treatment_id=b.treatment_id and a.match_rank=b.match_rank
order by a.treatment_id, a.match_rank;
quit;

data matches4; set matches3;
by treatment_id match_rank;
if first.treatment_id;
run;

data matched_pairs; set matches4(where=(match_rank<99)); run;





/* Create treatment and control populations using matched pairs key */
proc sort data=matched_pairs; by treatment_id; run;
proc sort data=&population; by &id; run;
data treatment;
merge matched_pairs(in=a /*where=(match_rank le &k)*/ rename=(treatment_id=&id)) &population;
by &id;
if a;
treatment_id=&id;
run;

proc sort data=matched_pairs; by control_id; run;
data control;
merge matched_pairs(in=a /*where=(match_rank le &k)*/ rename=(control_id=&id)) &population;
by &id;
if a;
run;



/* Combine results and run summary statistics */
data &output;
set 
treatment
control;
run;

proc sort data=&output; by &id; run;
/*endrsubmit; */
/**/
/*proc means data=&output mean p10 p25 p50 p75 p90; class        treatment ; var &vars; run;*/
/*proc means data=&output mean p10 p25 p50 p75 p90; class &class treatment ; var &vars; run;*/

proc sort data=&output; by treatment_id; run;
data &output;
merge &output(in=a) work.max_txn_202106(rename=(account_id=treatment_id));
by treatment_id;
if a;
run;
proc sort data=&output; by &id; run;


/* Audit -identifying duplicate controls or controls being used multiple time */

proc sort data=work.base_matched_pairs_&ym. ; by account_id; run;


data single dup;     
set work.base_matched_pairs_&ym.;     
by account_id;     
if first.account_id and last.account_id  then output single;     
else output dup;
run;

data dup ;
set dup;
Dup=1;
run;

data single_t ;
set single;
where test=1;
run;

data single_c ;
set single;
dup=0;
where test=0;
run;

proc sql;
create table single_t2 as
select
a.*
,b.dup
from single_t  a
left join dup b on (a.control_id=b.account_id)
;
run;

proc sort data=single_t2 nodupkey ; by account_id; run;

data work.base_matched_pairs2_&ym. ;
set 
single_c
single_t2
dup 
;
if dup=. then dup=0;
run;

proc freq data=work.base_matched_pairs2_&ym.;
tables test*dup / missing list;
run;

sbxkoenk · Posted 06-25-2022 02:44 PM

Hello,

What is exactly the question you want us to answer?

Do you want an estimation algorithm that says :

With 1% of control group, 22% of the the match paired controls (matched to test) are being used as match paired controls for more than 1 test case.
And with X% of control group, Y% of the the match paired controls (matched to test) are being used as match paired controls for more than 1 test case.

??

Or do you just want to achieve that each control is used only once?

In the latter case, what is the problem with using all 12 MM controls? Is it taking too long?

Also, you can work in multiple iterations :

> If a control is used 5 times, ... then only retain the best match (minimal multivariate distance between test and control).

> Remove all controls that have already been used from the controls group
> Remove all test cases that already have a match from the test group

> do a second run of your matching with PROC MODECLUS

> repeat this until all test cases have a match with a control case

> ( you can report about the test cases that have not found a "good" match. )

The above way, every control that is matched to a test case is used only once !

Thanks,

Koen

Match Pair Code Optimization : proc modeclus

Re: Match Pair Code Optimization : proc modeclus