🔒 This topic is solved and locked.
MetinBulus
Quartz | Level 8

Hi everyone, I have a table similar to this:

 

g   p1    p2    p3    p4    p5     x
0   0.80  0.86  0.19  0.01  0.96   6
0   0.15  0.57  0.87  0.36  0.03   7
0   0.41  0.75  0.33  0.39  0.27   3
0   0.38  0.88  0.53  0.49  0.17   5
0   0.14  0.32  0.09  0.35  0.63   9
0   0.22  0.49  0.32  0.58  0.96   2
0   0.48  0.33  0.98  0.19  0.42   8
0   0.30  0.88  0.87  0.84  0.31   7
1   0.32  0.31  0.10  0.08  0.36  10
1   0.85  0.83  0.30  0.40  0.97   8
1   0.45  0.85  0.98  0.43  0.16   7
1   0.29  0.47  0.53  0.10  0.13  11
1   0.84  0.58  0.95  0.43  0.81   5
1   0.28  0.18  0.06  0.74  0.95  24
1   0.08  0.37  0.97  0.17  0.88   9
1   0.92  1.00  0.47  0.27  0.82   4

 

For each group I will have a weighted mean: the sum over its rows of (1/pi)*x/8. I would like to minimize the weighted-mean difference between group 1 and group 0 by selecting one pi from each row. Any idea or help will be appreciated.

Note: The highlighted numbers (bold in the original post) only illustrate the idea; they were not obtained with a minimization technique.
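To make the objective concrete: each group's weighted mean sums (1/pi)*x over its 8 rows and divides by 8, and the goal is to minimize the absolute difference of the two group means over all 5^16 possible column selections. Here is a quick Python sketch (an editorial illustration, not part of the original post; the thread's actual solutions use SAS) that evaluates one hypothetical selection, always taking column p1:

```python
# Rows of the example table: (group, [p1..p5], x)
rows = [
    (0, [0.80, 0.86, 0.19, 0.01, 0.96], 6),
    (0, [0.15, 0.57, 0.87, 0.36, 0.03], 7),
    (0, [0.41, 0.75, 0.33, 0.39, 0.27], 3),
    (0, [0.38, 0.88, 0.53, 0.49, 0.17], 5),
    (0, [0.14, 0.32, 0.09, 0.35, 0.63], 9),
    (0, [0.22, 0.49, 0.32, 0.58, 0.96], 2),
    (0, [0.48, 0.33, 0.98, 0.19, 0.42], 8),
    (0, [0.30, 0.88, 0.87, 0.84, 0.31], 7),
    (1, [0.32, 0.31, 0.10, 0.08, 0.36], 10),
    (1, [0.85, 0.83, 0.30, 0.40, 0.97], 8),
    (1, [0.45, 0.85, 0.98, 0.43, 0.16], 7),
    (1, [0.29, 0.47, 0.53, 0.10, 0.13], 11),
    (1, [0.84, 0.58, 0.95, 0.43, 0.81], 5),
    (1, [0.28, 0.18, 0.06, 0.74, 0.95], 24),
    (1, [0.08, 0.37, 0.97, 0.17, 0.88], 9),
    (1, [0.92, 1.00, 0.47, 0.27, 0.82], 4),
]

def weighted_mean(group, choice):
    """Sum of (1/pi)*x over the group's 8 rows, divided by 8.
    choice[i] is the 0-based column picked for the group's i-th row."""
    sub = [(ps, x) for g, ps, x in rows if g == group]
    return sum(x / ps[j] for (ps, x), j in zip(sub, choice)) / len(sub)

# Hypothetical selection: always take p1 (column 0) in both groups.
m0 = weighted_mean(0, [0] * 8)
m1 = weighted_mean(1, [0] * 8)
objective = abs(m0 - m1)   # the quantity to be minimized
```

For this naive all-p1 selection the objective is about 14.33; the answers below drive it to essentially zero.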

1 ACCEPTED SOLUTION

Accepted solution: RobPratt's PROC OPTMODEL reply, reproduced in the thread below.

8 REPLIES
Ksharp
Super User
How do you define a group's weighted mean (1/pi)*x/8?

For group 0:
(1/0.86)*x/8 + (1/0.87)*x/8 + (1/0.75)*x/8 + ... + (1/0.30)*x/8 ?

MetinBulus
Quartz | Level 8
Xia, yes, I forgot to put a summation sign.
weighted_mean_for_group_0 = (1/0.86)*6/8 + (1/0.87)*7/8 + (1/0.75)*3/8 + ... + (1/0.30)*7/8

 

Ksharp
Super User
OK.
Here is my GA code; you can run it under SAS University Edition. Maybe someone could give you some SAS/OR code. I really want to know how to do it with SAS/OR.
Good luck.
The minimized difference is 0.0000176.

g   selected p      x
0   p3 = 0.19       6
0   p4 = 0.36       7
0   p4 = 0.39       3
0   p2 = 0.88       5
0   p4 = 0.35       9
0   p1 = 0.22       2
0   p2 = 0.33       8
0   p4 = 0.84       7
1   p5 = 0.36      10
1   p1 = 0.85       8
1   p1 = 0.45       7
1   p2 = 0.47      11
1   p4 = 0.43       5
1   p5 = 0.95      24
1   p5 = 0.88       9
1   p3 = 0.47       4

data x;
input g   p1    p2    p3    p4    p5      x;
cards;
0   0.80 0.86 0.19 0.01 0.96   6
0   0.15 0.57 0.87 0.36 0.03   7
0   0.41 0.75 0.33 0.39 0.27   3
0   0.38 0.88 0.53 0.49 0.17   5
0   0.14 0.32 0.09 0.35 0.63   9
0   0.22 0.49 0.32 0.58 0.96   2
0   0.48 0.33 0.98 0.19 0.42   8
0   0.30 0.88 0.87 0.84 0.31   7
1   0.32 0.31 0.10 0.08 0.36  10
1   0.85 0.83 0.30 0.40 0.97   8
1   0.45 0.85 0.98 0.43 0.16   7
1   0.29 0.47 0.53 0.10 0.13  11
1   0.84 0.58 0.95 0.43 0.81   5
1   0.28 0.18 0.06 0.74 0.95  24
1   0.08 0.37 0.97 0.17 0.88   9
1   0.92 1.00 0.47 0.27 0.82   4
;
run;
data have;
 set x;
 length vname $ 40;
 array z{*} p1-p5 ;
 n+1;
 do i=1 to dim(z);
  vname=vname(z{i});v=z{i};output;
 end;
 drop i p1-p5;
run;


proc iml;
use have nobs nobs;
read all var {g n v x vname};
close have;

start_end=loc(n^=t({.}||remove(n,nobs))) //
              loc(n^=t(remove(n,1)||{.})) ;
              
mattrib start_end rowname={start end};                         
start function(z) global(v,x,g);
   idx0=z[loc(g[z]=0)] ;
   idx1=z[loc(g[z]=1)] ; 

   pi0=1/(8#v[idx0]); 
   pi1=1/(8#v[idx1]); 

   w_mean0=sum(pi0#x[idx0]);
   w_mean1=sum(pi1#x[idx1]);

   dif=abs(w_mean0-w_mean1); 
 return (dif);
finish;

encoding=start_end;
size=ncol(encoding);
id=gasetup(2,size,1234);
call gasetobj(id,0,"function");
call gasetsel(id,100,1,1);
call gainit(id,1000,encoding);
niter = 10000;
summary = j(niter,2);
do i = 1 to niter;
 call garegen(id);
 call gagetval(value, id);
 summary[i,1] = value[1];
 summary[i,2] = value[:];
end;
call gagetmem(mem, value, id, 1);
Members=x[mem];
obs=n[mem];
group=g[mem];
varname=vname[mem];
vv=v[mem];
print "Best Members:"  group obs varname vv[l="value"] Members[l="x"],
      "Min Difference:   " value[l = ""],
      (summary[niter-20:niter ,])[c = {"Min Difference"  "Avg Difference"} l="The last 20 iteration"]  ;
call gaend(id);
quit;

OUTPUT:

               group  obs  varname  value   x
Best Members:    0     1     p3     0.19    6
                 0     2     p4     0.36    7
                 0     3     p4     0.39    3
                 0     4     p2     0.88    5
                 0     5     p4     0.35    9
                 0     6     p1     0.22    2
                 0     7     p2     0.33    8
                 0     8     p4     0.84    7
                 1     9     p5     0.36   10
                 1    10     p1     0.85    8
                 1    11     p1     0.45    7
                 1    12     p2     0.47   11
                 1    13     p4     0.43    5
                 1    14     p5     0.95   24
                 1    15     p5     0.88    9
                 1    16     p3     0.47    4

Min Difference: 0.0000176
The last 20 iteration
Min Difference	Avg Difference
0.0000176	0.1089383
0.0000176	0.1679158
0.0000176	0.126787
0.0000176	0.0798593
0.0000176	0.1253699
0.0000176	0.1260094
0.0000176	0.0681392
0.0000176	0.1495233
0.0000176	0.082787
0.0000176	0.1652973
0.0000176	0.1008417
0.0000176	0.1230794
0.0000176	0.078061
0.0000176	0.1358966
0.0000176	0.1289343
0.0000176	0.0961847
0.0000176	0.1474286
0.0000176	0.1196352
0.0000176	0.0874685
0.0000176	0.1541393
0.0000176	0.1372184
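As a quick cross-check (an editorial addition in Python, not part of the thread's SAS code), plugging the GA's selected values straight into the weighted-mean formula reproduces the reported minimum difference of 0.0000176:

```python
# (p, x) pairs selected by the GA for each row, taken from the output above
group0 = [(0.19, 6), (0.36, 7), (0.39, 3), (0.88, 5),
          (0.35, 9), (0.22, 2), (0.33, 8), (0.84, 7)]
group1 = [(0.36, 10), (0.85, 8), (0.45, 7), (0.47, 11),
          (0.43, 5), (0.95, 24), (0.88, 9), (0.47, 4)]

# Weighted mean: sum of x/p over the group's rows, divided by 8
wm = lambda sel: sum(x / p for p, x in sel) / len(sel)
diff = abs(wm(group0) - wm(group1))
print(f"{diff:.7f}")  # 0.0000176
```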


RobPratt
SAS Super FREQ

The optimal objective value is smaller: 

6.9765E-07

 

You can solve this by using PROC OPTMODEL in SAS/OR as follows:

%let n = 5;

data x;
   input g p1-p&n x;
   cards;
0   0.80 0.86 0.19 0.01 0.96   6
0   0.15 0.57 0.87 0.36 0.03   7
0   0.41 0.75 0.33 0.39 0.27   3
0   0.38 0.88 0.53 0.49 0.17   5
0   0.14 0.32 0.09 0.35 0.63   9
0   0.22 0.49 0.32 0.58 0.96   2
0   0.48 0.33 0.98 0.19 0.42   8
0   0.30 0.88 0.87 0.84 0.31   7
1   0.32 0.31 0.10 0.08 0.36  10
1   0.85 0.83 0.30 0.40 0.97   8
1   0.45 0.85 0.98 0.43 0.16   7
1   0.29 0.47 0.53 0.10 0.13  11
1   0.84 0.58 0.95 0.43 0.81   5
1   0.28 0.18 0.06 0.74 0.95  24
1   0.08 0.37 0.97 0.17 0.88   9
1   0.92 1.00 0.47 0.27 0.82   4
;

proc optmodel;
   set OBS;
   set GROUPS = 0..1;
   set NSET = 1..&n;
   num group {OBS};
   num p {OBS, NSET};
   num x {OBS};
   read data x into OBS=[_N_] group=g {n in NSET} <p[_N_,n]=col('p'||n)> x;
   set OBS_g {g in GROUPS} = {i in OBS: group[i] = g};

   var Assign {OBS, NSET} binary;
   con AssignCon {i in OBS}:
      sum {n in NSET} Assign[i,n] = 1;
   impvar Mean {g in GROUPS} 
      = sum {i in OBS_g[g], n in NSET} (x[i]/p[i,n]) * Assign[i,n] / card(OBS_g[g]);
   var Surplus >= 0, Slack >= 0;
   min AbsDiff = Surplus + Slack;
   con AbsDiffCon:
      Mean[0] - Mean[1] = Surplus - Slack;

   solve with MILP / inttol=1e-9;
   print {i in OBS, n in NSET: Assign[i,n].sol > 0.5} p;
   for {i in OBS, n in NSET} fix Assign[i,n] = round(Assign[i,n]);
   print Mean (abs(Mean[0]-Mean[1]));
quit;
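One modeling detail worth calling out: MILP solvers cannot minimize an absolute value directly, so the model splits the signed difference into two nonnegative variables via the constraint Mean[0] - Mean[1] = Surplus - Slack and minimizes Surplus + Slack. At an optimum one of the two is zero, so the objective equals |Mean[0] - Mean[1]|. A tiny Python sketch of why this works (an editorial illustration, outside SAS):

```python
def abs_via_surplus_slack(d):
    """For a fixed difference d, minimizing surplus + slack subject to
    surplus - slack = d with surplus, slack >= 0 puts all of |d| on one
    side and zero on the other, so the optimal objective equals abs(d)."""
    surplus, slack = max(d, 0.0), max(-d, 0.0)  # the optimal split
    assert surplus - slack == d and surplus >= 0 and slack >= 0
    return surplus + slack

print(abs_via_surplus_slack(-3.5))  # 3.5
```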

 

The resulting output is:

[1] [2] p
1 2 0.86
2 5 0.03
3 5 0.27
4 3 0.53
5 3 0.09
6 3 0.32
7 5 0.42
8 2 0.88
9 1 0.32
10 3 0.30
11 5 0.16
12 5 0.13
13 3 0.95
14 1 0.28
15 1 0.08
16 1 0.92

 

[1] Mean
0 49.263
1 49.263

 

6.9765E-07
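Plugging the printed selection back into the weighted-mean formula (an editorial cross-check in Python, outside SAS) reproduces both the common mean of about 49.263 and the objective value 6.9765E-07:

```python
# (p, x) pairs from the MILP solution printed above
group0 = [(0.86, 6), (0.03, 7), (0.27, 3), (0.53, 5),
          (0.09, 9), (0.32, 2), (0.42, 8), (0.88, 7)]
group1 = [(0.32, 10), (0.30, 8), (0.16, 7), (0.13, 11),
          (0.95, 5), (0.28, 24), (0.08, 9), (0.92, 4)]

# Weighted mean: sum of x/p over the group's rows, divided by 8
wm = lambda sel: sum(x / p for p, x in sel) / len(sel)
m0, m1 = wm(group0), wm(group1)
print(round(m0, 3), round(m1, 3))  # 49.263 49.263
print(abs(m0 - m1))                # about 6.98e-07
```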
MetinBulus
Quartz | Level 8

Thanks a lot, Rob! I will try both algorithms on some simulated data and see if I can decode the algorithms' logic as well.

MetinBulus
Quartz | Level 8

thanks a lot Xia! 

FreelanceReinh
Jade | Level 19

Hello @MetinBulus,

 

Many thanks for posting this interesting challenge recently.

 

Two advanced solutions have been provided, using genetic algorithms (SAS/IML) and mixed integer linear programming (SAS/OR), respectively. I'm not familiar with either of these techniques and I don't have a license for those SAS modules.

 

I'd like to add a third solution using only Base SAS. It's a brute-force approach, so it may not scale well to a much larger input dataset (unlike the other two solutions, I assume, since neither @Ksharp nor @RobPratt asked about the size of the real dataset). With the sample data from your initial post it takes less than one second (on my workstation) to obtain five solutions with the same optimal (minimum) difference, which turns out to be exactly zero.

I have verified that in all five cases these zero differences are not just artifacts of rounding (I rounded the weighted means to a unit of 1E-13 to counter floating-point errors in the calculation). Of course, this assumes that the values p1, ..., p5 and x of the input dataset are exact.

 

/* Read test data */

data have;
input g p1-p5 x;
cards;
0   0.80 0.86 0.19 0.01 0.96   6
0   0.15 0.57 0.87 0.36 0.03   7
0   0.41 0.75 0.33 0.39 0.27   3
0   0.38 0.88 0.53 0.49 0.17   5
0   0.14 0.32 0.09 0.35 0.63   9
0   0.22 0.49 0.32 0.58 0.96   2
0   0.48 0.33 0.98 0.19 0.42   8
0   0.30 0.88 0.87 0.84 0.31   7
1   0.32 0.31 0.10 0.08 0.36  10
1   0.85 0.83 0.30 0.40 0.97   8
1   0.45 0.85 0.98 0.43 0.16   7
1   0.29 0.47 0.53 0.10 0.13  11
1   0.84 0.58 0.95 0.43 0.81   5
1   0.28 0.18 0.06 0.74 0.95  24
1   0.08 0.37 0.97 0.17 0.88   9
1   0.92 1.00 0.47 0.27 0.82   4
;

/* Determine group sizes */

proc sql noprint;
select count(*) into :n0-:n1
from have
group by g;
quit;

%let rdunit=1E-13; /* rounding unit for weighted means (data dependent, though) */

/* Calculate "weighted means" in group 0 and group 1 */

%macro calc(g=, nrows=, ncols=5);
  %local i;
  data wm&g;
  set have end=last;
  where g=&g;
  array p p:;
  array a[&nrows, &ncols] _temporary_;
  array k[&nrows];
  do j=1 to &ncols;
    a[_n_,j]=x/p[j]; 
  end;
  if last then do;
  %do i=1 %to &nrows;
    do k&i=1 to &ncols;
  %end;
      m=0;
      do i=1 to &nrows;
        m+a[i, k[i]];
      end;
      m=round(m/&nrows, &rdunit); 
      output;
  %do i=1 %to &nrows;
    end;
  %end;
  end;
  keep g k: m;
  run;

  proc sort data=wm&g;
  by m;
  run;
%mend calc;
%calc(g=0, nrows=&n0);
%calc(g=1, nrows=&n1);

/* Bring weighted means of similar size together */

data wm;
set wm0
    wm1;
by m;
run;

/* Compute between-group differences of weighted means and determine
   the minimum difference (taking rounding issues into consideration) */

data diff;
retain min_d;
set wm end=last;
by g notsorted;
if last.g | first.g & _n_>1;
d=dif(m);
if ~first.g then d=.;
if d>. then min_d=min(min_d, d);
if last then call symputx('smalldiff', min_d+&rdunit);
drop min_d;
run;

/* Select pairs of obs. with (approx.) minimum difference
   (If there were clusters of more than two observations, this code 
    could be amended to select all pairs from these clusters.) */
    
data sel;
set diff end=last;
c+1;
if .<d<=&smalldiff then do;
  s+1;
  do i=c-1 to c;
    set diff point=i;
    if i<c then d=.;
    output;
  end;
end;
if last then call symputx('s',s);
label g='Group'
      s='Solution no.'
      m='Weighted mean'
      d='Difference';
drop c;
run;

proc sort data=sel;
by s g;
run;

/* Prepare and create output dataset WANT */

data haves;
set have;
do s=1 to &s;
  output;
end;
proc sort;
by s;
run;

data want;
merge haves
      sel;
by s g;
array a p:;
array k k:;
if first.g then j=0;
j+1;
i=k[j];
p=a[i];
keep s g i p x;
label i='Column';
run;

/* Show results */

* options formdlim=' ' nodate nonumber;
* title;

proc print data=want label noobs;
by s;
var g i p x;
run;

proc print data=sel width=min label noobs;
format m d 17.13;
run;
* options formdlim='';
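For readers without SAS at hand, the same brute-force idea can be sketched in a few lines of Python (an editorial addition, not part of the post above). With 8 rows and 5 choices per row there are only 5^8 = 390,625 weighted means per group, so sorting both lists and sweeping them with two pointers finds the minimum between-group difference quickly; it confirms that the true optimum is numerically zero:

```python
from itertools import product

# Rows of the example table: (group, [p1..p5], x)
rows = [
    (0, [0.80, 0.86, 0.19, 0.01, 0.96], 6),
    (0, [0.15, 0.57, 0.87, 0.36, 0.03], 7),
    (0, [0.41, 0.75, 0.33, 0.39, 0.27], 3),
    (0, [0.38, 0.88, 0.53, 0.49, 0.17], 5),
    (0, [0.14, 0.32, 0.09, 0.35, 0.63], 9),
    (0, [0.22, 0.49, 0.32, 0.58, 0.96], 2),
    (0, [0.48, 0.33, 0.98, 0.19, 0.42], 8),
    (0, [0.30, 0.88, 0.87, 0.84, 0.31], 7),
    (1, [0.32, 0.31, 0.10, 0.08, 0.36], 10),
    (1, [0.85, 0.83, 0.30, 0.40, 0.97], 8),
    (1, [0.45, 0.85, 0.98, 0.43, 0.16], 7),
    (1, [0.29, 0.47, 0.53, 0.10, 0.13], 11),
    (1, [0.84, 0.58, 0.95, 0.43, 0.81], 5),
    (1, [0.28, 0.18, 0.06, 0.74, 0.95], 24),
    (1, [0.08, 0.37, 0.97, 0.17, 0.88], 9),
    (1, [0.92, 1.00, 0.47, 0.27, 0.82], 4),
]

def all_means(group):
    """All 5**8 weighted means for one group, sorted ascending."""
    ratios = [[x / p for p in ps] for g, ps, x in rows if g == group]
    return sorted(sum(c) / len(ratios) for c in product(*ratios))

def min_abs_diff(a, b):
    """Minimum |a[i] - b[j]| between two sorted lists (two-pointer sweep)."""
    i = j = 0
    best = float("inf")
    while i < len(a) and j < len(b):
        best = min(best, abs(a[i] - b[j]))
        if a[i] < b[j]:
            i += 1
        else:
            j += 1
    return best

best = min_abs_diff(all_means(0), all_means(1))
print(best)  # essentially zero (well below 1e-9)
```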

 

RobPratt
SAS Super FREQ

Nice job, @FreelanceReinh.  Yes, the MILP approach should scale better than brute force.  You can use the MILP solver's ABSOBJGAP= option to avoid terminating early with a solution that is "only" within 1e-6 of optimal.  Here's another near miss:

 

[1] [2] p
1 3 0.19
2 5 0.03
3 5 0.27
4 1 0.38
5 5 0.63
6 1 0.22
7 5 0.42
8 1 0.30
9 3 0.10
10 5 0.97
11 5 0.16
12 1 0.29
13 3 0.95
14 4 0.74
15 1 0.08
16 4 0.27


[1] Mean
0 44.367
1 44.367


2.6602E-10
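This near miss also checks out numerically (an editorial Python cross-check, outside SAS): the selection above gives both groups a weighted mean of about 44.367 and a difference of about 2.66E-10.

```python
# (p, x) pairs from the near-miss solution printed above
group0 = [(0.19, 6), (0.03, 7), (0.27, 3), (0.38, 5),
          (0.63, 9), (0.22, 2), (0.42, 8), (0.30, 7)]
group1 = [(0.10, 10), (0.97, 8), (0.16, 7), (0.29, 11),
          (0.95, 5), (0.74, 24), (0.08, 9), (0.27, 4)]

# Weighted mean: sum of x/p over the group's rows, divided by 8
wm = lambda sel: sum(x / p for p, x in sel) / len(sel)
m0, m1 = wm(group0), wm(group1)
print(round(m0, 3), abs(m0 - m1))  # 44.367 and about 2.66e-10
```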

 

 

@MetinBulus, what is the original motivation for this problem?

 


Discussion stats
  • 8 replies
  • 1475 views
  • 2 likes
  • 4 in conversation