<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Match Pair Code Optimization : proc modeclus in Statistical Procedures</title>
    <link>https://communities.sas.com/t5/Statistical-Procedures/Match-Pair-Code-Optimization-proc-modeclus/m-p/819443#M40539</link>
    <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Below is&amp;nbsp;proc modeclus match pair that I'm running.&lt;/P&gt;&lt;P&gt;I'm asking for help to help me optimize the following: % of control file and # of duplicated controls.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Test is 7K&lt;/P&gt;&lt;P&gt;Control is 12MM, if I select 1% of control, it would be 120K&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;The test is being matched to look for a matched pair / twin in the control files.&lt;BR /&gt;While the match rate is 100%, meaning I'm able to find a match paired control for every single test account.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;However the problem that I'm getting is that 22% of the the match paired controls (matched to test) I found are being used as match paired controls for more than 1 test.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;if I increase the control from 1% to 10%, it would reduce that 22% to much lower %.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Is there a more optimal way to help me to determine the % of control I should use or less of duplicated account in the match pair control accounts .&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=""&gt;/***** Define macros - Run locally and remotely *****/
%let ym=202108; &lt;BR /&gt;
%let population = work.base_&amp;amp;ym.; /*input dataset name */
%let output = work.base_matched_pairs_&amp;amp;ym.; /*output dataset name */
%let id = account_id; /* account identifier */
%let class = segment ; /* list of categorical variables we wish to control for */
%let vars = 
var1
var2
var3
var4
var5
var6
var7
var8
var9
var10
var11
var12
var13
; /* list of numeric variables we wish to control for */
%let k = 1; /* number of "control" accounts we wish to find for each treatment */
%let j = 9; /* number of matches to find before saying an account has no good matches */





/******************************************************** Run matching code - No edits needed below this line **************************************************************/

/* Create negative account id for treatment accounts as a way to identify them */
/*rsubmit;*/
data population2;
set &amp;amp;population;
if treatment=1 then &amp;amp;id=-1*&amp;amp;id;
format &amp;amp;vars best8.;
run;



/* Standardize numeric variables */
proc standard data = population2 mean =0 std=1 out=z_data replace;
var &amp;amp;vars;
run;

proc sort data=z_data; by &amp;amp;class; run;



/* Run matching algorithm */
%let dk=%sysfunc(sum(&amp;amp;k,&amp;amp;j));
ods _all_ close;


proc modeclus data=z_data dk=&amp;amp;dk  neighbor;
by &amp;amp;class;
var &amp;amp;vars;
id &amp;amp;id;
ods output neighbor=matches;
run;



/* Format output to define a key between treatment and control accounts */
data matches2 (
/*where=(treatment_id&amp;gt;0) */
keep=treatment_id control_id distance match_rank);  /* Only keep accounts where treatment=1 from original population */
set matches;
/*if input(nbor,best16.)&amp;gt;0; */                                                     /* Do not use any treatment accounts as the control for another treatment account */
control_id = input(nbor,best16.);
if ^missing(id) then do;
	 treatment_id = -1*input(id,best16.);
     match_rank = 1;
	 output; 
end;
else do; 
	match_rank+1;
	treatment_id=lag_id;
	output;
end;
lag_id=treatment_id;
retain lag_id;
run;

proc sql;
create table matches3 as
select a.*,b.control_id from
(select distinct
treatment_id,
min(case when control_id&amp;gt;0 then match_rank else 99 end) as match_rank
from matches2 a
where treatment_id&amp;gt;0 
group by 1) a inner join
(select distinct
treatment_id,
case when control_id&amp;gt;0 then control_id else -1*control_id end as control_id,
case when control_id&amp;gt;0 then match_rank else 99 end as match_rank
from matches2) b on a.treatment_id=b.treatment_id and a.match_rank=b.match_rank
order by a.treatment_id, a.match_rank;
quit;

data matches4; set matches3;
by treatment_id match_rank;
if first.treatment_id;
run;

data matched_pairs; set matches4(where=(match_rank&amp;lt;99)); run;





/* Create treatment and control populations using matched pairs key */
proc sort data=matched_pairs; by treatment_id; run;
proc sort data=&amp;amp;population; by &amp;amp;id; run;
data treatment;
merge matched_pairs(in=a /*where=(match_rank le &amp;amp;k)*/ rename=(treatment_id=&amp;amp;id)) &amp;amp;population;
by &amp;amp;id;
if a;
treatment_id=&amp;amp;id;
run;

proc sort data=matched_pairs; by control_id; run;
data control;
merge matched_pairs(in=a /*where=(match_rank le &amp;amp;k)*/ rename=(control_id=&amp;amp;id)) &amp;amp;population;
by &amp;amp;id;
if a;
run;



/* Combine results and run summary statistics */
data &amp;amp;output;
set 
treatment
control;
run;

proc sort data=&amp;amp;output; by &amp;amp;id; run;
/*endrsubmit; */
/**/
/*proc means data=&amp;amp;output mean p10 p25 p50 p75 p90; class        treatment ; var &amp;amp;vars; run;*/
/*proc means data=&amp;amp;output mean p10 p25 p50 p75 p90; class &amp;amp;class treatment ; var &amp;amp;vars; run;*/

proc sort data=&amp;amp;output; by treatment_id; run;
data &amp;amp;output;
merge &amp;amp;output(in=a) work.max_txn_202106(rename=(account_id=treatment_id));
by treatment_id;
if a;
run;
proc sort data=&amp;amp;output; by &amp;amp;id; run;


/* Audit -identifying duplicate controls or controls being used multiple time */

proc sort data=work.base_matched_pairs_&amp;amp;ym. ; by account_id; run;


data single dup;     
set work.base_matched_pairs_&amp;amp;ym.;     
by account_id;     
if first.account_id and last.account_id  then output single;     
else output dup;
run;

data dup ;
set dup;
Dup=1;
run;

data single_t ;
set single;
where test=1;
run;

data single_c ;
set single;
dup=0;
where test=0;
run;

proc sql;
create table single_t2 as
select
a.*
,b.dup
from single_t  a
left join dup b on (a.control_id=b.account_id)
;
run;

proc sort data=single_t2 nodupkey ; by account_id; run;

data work.base_matched_pairs2_&amp;amp;ym. ;
set 
single_c
single_t2
dup 
;
if dup=. then dup=0;
run;

proc freq data=work.base_matched_pairs2_&amp;amp;ym.;
tables test*dup / missing list;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
    <pubDate>Tue, 21 Jun 2022 19:23:14 GMT</pubDate>
    <dc:creator>superking</dc:creator>
    <dc:date>2022-06-21T19:23:14Z</dc:date>
    <item>
      <title>Match Pair Code Optimization : proc modeclus</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Match-Pair-Code-Optimization-proc-modeclus/m-p/819443#M40539</link>
      <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Below is&amp;nbsp;proc modeclus match pair that I'm running.&lt;/P&gt;&lt;P&gt;I'm asking for help to help me optimize the following: % of control file and # of duplicated controls.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Test is 7K&lt;/P&gt;&lt;P&gt;Control is 12MM, if I select 1% of control, it would be 120K&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;The test is being matched to look for a matched pair / twin in the control files.&lt;BR /&gt;While the match rate is 100%, meaning I'm able to find a match paired control for every single test account.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;However the problem that I'm getting is that 22% of the the match paired controls (matched to test) I found are being used as match paired controls for more than 1 test.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;if I increase the control from 1% to 10%, it would reduce that 22% to much lower %.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Is there a more optimal way to help me to determine the % of control I should use or less of duplicated account in the match pair control accounts .&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=""&gt;/***** Define macros - Run locally and remotely *****/
%let ym=202108; &lt;BR /&gt;
%let population = work.base_&amp;amp;ym.; /*input dataset name */
%let output = work.base_matched_pairs_&amp;amp;ym.; /*output dataset name */
%let id = account_id; /* account identifier */
%let class = segment ; /* list of categorical variables we wish to control for */
%let vars = 
var1
var2
var3
var4
var5
var6
var7
var8
var9
var10
var11
var12
var13
; /* list of numeric variables we wish to control for */
%let k = 1; /* number of "control" accounts we wish to find for each treatment */
%let j = 9; /* number of matches to find before saying an account has no good matches */





/******************************************************** Run matching code - No edits needed below this line **************************************************************/

/* Create negative account id for treatment accounts as a way to identify them */
/*rsubmit;*/
data population2;
set &amp;amp;population;
if treatment=1 then &amp;amp;id=-1*&amp;amp;id;
format &amp;amp;vars best8.;
run;



/* Standardize numeric variables */
proc standard data = population2 mean =0 std=1 out=z_data replace;
var &amp;amp;vars;
run;

proc sort data=z_data; by &amp;amp;class; run;



/* Run matching algorithm */
%let dk=%sysfunc(sum(&amp;amp;k,&amp;amp;j));
ods _all_ close;


proc modeclus data=z_data dk=&amp;amp;dk  neighbor;
by &amp;amp;class;
var &amp;amp;vars;
id &amp;amp;id;
ods output neighbor=matches;
run;



/* Format output to define a key between treatment and control accounts */
data matches2 (
/*where=(treatment_id&amp;gt;0) */
keep=treatment_id control_id distance match_rank);  /* Only keep accounts where treatment=1 from original population */
set matches;
/*if input(nbor,best16.)&amp;gt;0; */                                                     /* Do not use any treatment accounts as the control for another treatment account */
control_id = input(nbor,best16.);
if ^missing(id) then do;
	 treatment_id = -1*input(id,best16.);
     match_rank = 1;
	 output; 
end;
else do; 
	match_rank+1;
	treatment_id=lag_id;
	output;
end;
lag_id=treatment_id;
retain lag_id;
run;

proc sql;
create table matches3 as
select a.*,b.control_id from
(select distinct
treatment_id,
min(case when control_id&amp;gt;0 then match_rank else 99 end) as match_rank
from matches2 a
where treatment_id&amp;gt;0 
group by 1) a inner join
(select distinct
treatment_id,
case when control_id&amp;gt;0 then control_id else -1*control_id end as control_id,
case when control_id&amp;gt;0 then match_rank else 99 end as match_rank
from matches2) b on a.treatment_id=b.treatment_id and a.match_rank=b.match_rank
order by a.treatment_id, a.match_rank;
quit;

data matches4; set matches3;
by treatment_id match_rank;
if first.treatment_id;
run;

data matched_pairs; set matches4(where=(match_rank&amp;lt;99)); run;





/* Create treatment and control populations using matched pairs key */
proc sort data=matched_pairs; by treatment_id; run;
proc sort data=&amp;amp;population; by &amp;amp;id; run;
data treatment;
merge matched_pairs(in=a /*where=(match_rank le &amp;amp;k)*/ rename=(treatment_id=&amp;amp;id)) &amp;amp;population;
by &amp;amp;id;
if a;
treatment_id=&amp;amp;id;
run;

proc sort data=matched_pairs; by control_id; run;
data control;
merge matched_pairs(in=a /*where=(match_rank le &amp;amp;k)*/ rename=(control_id=&amp;amp;id)) &amp;amp;population;
by &amp;amp;id;
if a;
run;



/* Combine results and run summary statistics */
data &amp;amp;output;
set 
treatment
control;
run;

proc sort data=&amp;amp;output; by &amp;amp;id; run;
/*endrsubmit; */
/**/
/*proc means data=&amp;amp;output mean p10 p25 p50 p75 p90; class        treatment ; var &amp;amp;vars; run;*/
/*proc means data=&amp;amp;output mean p10 p25 p50 p75 p90; class &amp;amp;class treatment ; var &amp;amp;vars; run;*/

proc sort data=&amp;amp;output; by treatment_id; run;
data &amp;amp;output;
merge &amp;amp;output(in=a) work.max_txn_202106(rename=(account_id=treatment_id));
by treatment_id;
if a;
run;
proc sort data=&amp;amp;output; by &amp;amp;id; run;


/* Audit -identifying duplicate controls or controls being used multiple time */

proc sort data=work.base_matched_pairs_&amp;amp;ym. ; by account_id; run;


data single dup;     
set work.base_matched_pairs_&amp;amp;ym.;     
by account_id;     
if first.account_id and last.account_id  then output single;     
else output dup;
run;

data dup ;
set dup;
Dup=1;
run;

data single_t ;
set single;
where test=1;
run;

data single_c ;
set single;
dup=0;
where test=0;
run;

proc sql;
create table single_t2 as
select
a.*
,b.dup
from single_t  a
left join dup b on (a.control_id=b.account_id)
;
run;

proc sort data=single_t2 nodupkey ; by account_id; run;

data work.base_matched_pairs2_&amp;amp;ym. ;
set 
single_c
single_t2
dup 
;
if dup=. then dup=0;
run;

proc freq data=work.base_matched_pairs2_&amp;amp;ym.;
tables test*dup / missing list;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Tue, 21 Jun 2022 19:23:14 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Match-Pair-Code-Optimization-proc-modeclus/m-p/819443#M40539</guid>
      <dc:creator>superking</dc:creator>
      <dc:date>2022-06-21T19:23:14Z</dc:date>
    </item>
    <item>
      <title>Re: Match Pair Code Optimization : proc modeclus</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Match-Pair-Code-Optimization-proc-modeclus/m-p/820393#M40572</link>
      <description>&lt;P&gt;Hello,&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;What is exactly the question you want us to answer?&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Do you want an estimation algorithm that says :&lt;/P&gt;
&lt;UL class="lia-list-style-type-square"&gt;
&lt;LI&gt;With 1% of control group, 22% of the the match paired controls (matched to test) are being used as match paired controls for more than 1 test case.&lt;/LI&gt;
&lt;LI&gt;And with &lt;STRONG&gt;&lt;FONT color="#008000"&gt;X&lt;/FONT&gt;&lt;/STRONG&gt;% of control group, &lt;STRONG&gt;&lt;FONT color="#FF0000"&gt;Y&lt;/FONT&gt;&lt;/STRONG&gt;% of the the match paired controls (matched to test) are being used as match paired controls for more than 1 test case.&lt;/LI&gt;
&lt;/UL&gt;
&lt;P&gt;??&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;FONT color="#0000FF"&gt;Or do you just want to achieve that each control is used only once?&lt;/FONT&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;In the latter case, what is the problem with using all 12 MM controls? Is it taking too long?&lt;/P&gt;
&lt;P&gt;Also, you can work in multiple iterations :&lt;/P&gt;
&lt;P&gt;&amp;gt; If a control is used 5 times, ... then only retain the best match (minimal multivariate distance between test and control).&lt;/P&gt;
&lt;P&gt;&amp;gt; Remove all controls that have already been used from the controls group &lt;BR /&gt;&amp;gt; Remove all test cases that already have a match from the test group&lt;/P&gt;
&lt;P&gt;&amp;gt; do a second run of your matching with PROC MODECLUS&lt;/P&gt;
&lt;P&gt;&amp;gt; repeat this until all test cases have a match with a control case&lt;/P&gt;
&lt;P&gt;&amp;gt; ( you can report about the test cases that have not found a "good" match. )&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;The above way, every control that is matched to a test case is used only once !&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Thanks,&lt;/P&gt;
&lt;P&gt;Koen&lt;/P&gt;
&lt;DIV id="ConnectiveDocSignExtentionInstalled" data-extension-version="1.0.4"&gt;&amp;nbsp;&lt;/DIV&gt;</description>
      <pubDate>Sat, 25 Jun 2022 18:44:08 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Match-Pair-Code-Optimization-proc-modeclus/m-p/820393#M40572</guid>
      <dc:creator>sbxkoenk</dc:creator>
      <dc:date>2022-06-25T18:44:08Z</dc:date>
    </item>
  </channel>
</rss>

