Solved: Re: MACRO CLUSTPRO

ron_horne · Posted 03-11-2021 12:43 PM

dear members,

i would like to use the macro %clustpro:

https://support.sas.com/resources/papers/proceedings/proceedings/sugi23/Posters/p204.pdf

I am having an issue extracting the script from the PDF. Apparently the formatting in the PDF file has messed up the text in a way that produces many errors when I try to run it.

I would be very thankful if anyone can extract the script, and re-post it here in working order. perhaps as a text file.

I did correct all quotation marks that were not recognized and added some missing spaces. yet, this is not enough and I get different results than the example.

many thanks!

FreelanceReinh · Posted 03-12-2021 10:02 AM

Hello @ron_horne,

I think the error is here:

/* Sii’ calculated */
%do l=1 %to &num_t;
%do m=1 %to &num_t;
%let s&l&m=%sysevalf(&&tcmp&l&m*&part1&part3);
%end;
%end;

With the example data, &part1=1.05, &part3=2601, which are 21/20 and 51**2, respectively, computed from the integer constants 21 and 51 involved in the study data. Apparently an operator is missing between &part1 and &part3 in the calculation of s&l&m because concatenation doesn't make sense. The fact that many computations in the macro are done with macro variables (none of which is declared as local ...) is a weakness of the code anyway.

It seems that the division operator (/) does a good job here (but without the Obuchowski paper [behind a paywall] I'm only guessing).

%let s&l&m=%sysevalf(&&tcmp&l&m*&part1/&part3);

At least the results get much closer to those in the article:

    p_hat1 = 0.7843137255
    p_hat2 = 0.9019607843
    chi-square statistic = 2.85714285599953
    degrees of freedom = 1
    p-value =  0.090968948

    S
     0.00510488158443
                   0.00093340907387
     0.00093340907387
                   0.00160622722075

The only significant difference occurs in value &s22 where the paper has an additional zero after the decimal point: 0.00016062272. But this must be a typo in the paper because otherwise the relationship

chisq = (p_hat1 - p_hat2)**2 / (s11+s22-2*s12)

would be violated and this equality holds also in the example found in the NESUG 2006 paper "Statistical Analysis of Clustered Data using SAS® System" (see Fig. 10, p. 6).

Note that matrix S is described in the Lieber/Ashley paper as "the variance-covariance matrix for p₁ and p₂", i.e., the covariance matrix of two random variables with values between 0 and 1 (if I interpret this correctly). As such the diagonal entries s11 and s22 must be between 0 and 0.25 and the other two (equal) entries s12 must be between -0.25 and 0.25 (regardless of the input data). So, values like 13.31, 2.43 and 4.19 as obtained with the uncorrected code are definitely wrong.

View solution in original post

Reeza · Posted 03-11-2021 01:32 PM

If you post what you've extracted so far, maybe someone can help you debug it further?
I quick search (Google/Github) didn't show anything obvious to me.

I would also check if that methodology hasn't been added to SAS/Stat functionality, SUGI23 was a long time ago...23 years in fact.

ron_horne · Posted 03-11-2021 02:11 PM

thank you @Reeza

please see the code i have so far.

i do not think it was added to the program.

/* infil=input datafile -
should contain i, j, xij, and nj(NOTE: Path to file should be enclosed in single quotes.)
num_c=number of clusters(patients) */

%macro clustpro (infil=,num_c=);

/* Dataset created from data file */
data tmp;
infile &infil;
input i j xij nj;
run;

/* number of tests initialized to 2 */
%let num_t=2;

/* Input data set sorted by i */
proc sort data=tmp;
by i;
run;

/* Length of num_t and num_c computed (used later in formatting upper loop values */
%let lent=%length(&num_t);
%let lenc=%length(&num_c);

/* "." added to end of lengths (for formatting purposes) */
data _null_;
call symput('fort', &lent||'.');
call symput('forc', &lenc||'.');
run;

/* xij, nj put into macro variables and trimmed */
data _null_;
set tmp;

call symput ('x'||left(put(i, &fort))||left(put(j, &forc)), xij);
call symput ('n'||left(put(j, &forc)), nj);
run;

%do k=1 %to &num_c;
	%let n&k=%eval(&&n&k);

	%do l=1 %to &num_t;
		%let x&l&k=%eval(&&x&l&k);
	%end;
%end;

/* sums of xij and nj computed */
proc means data=tmp noprint;
by i;
var xij nj;
output out=a sum=sumx sumn;
run;

/* sum of njs put into macro variable (used later) */
data _null_;
set a;
call symput ('sumn', sumn);
run;

/* &sum=sum of p_hats. Will change with each iteration of loop. */
%let sum=0;

/* p_hat1, p_hat2 computed, put into macro variables */
%do k=1 %to &num_t;

data b&k;
set a;
if i=&k;
p_hat&k=sumx/sumn;
run;

data _null_;
set b&k;
tmp=&k;
call symput('p_hat'||left(put(tmp, 1.)), p_hat&k);
run;
%let sum=%sysevalf(&sum+&&p_hat&k);
%end;

/* p bar computed */
%let p_bar=%sysevalf (&sum/&num_t);

/* part1=j/(j-1) (Used in computing the matrix S) */
%let part1=%sysevalf (&num_c*(1/(&num_c-1)));

/* part3=(sum of njs)^2 (Used in computing the matrix S) */
%let part3=%eval (&sumn*&sumn);

/* Part 2 computed */
/* Each iteration of part 2 initialized to 0 */
%do l=1 %to &num_t;
%do m=1 %to &num_t;
%let tcmp&l&m=0;
%end;
%end;

/* l represents i, m represents i’ */
%do l=1 %to &num_t;
%do m=1 %to &num_t;

/* Since S is symmetric, tcmp&l&m is only calculated if tcmp&m&l hasn’t been calculated */
%if &l > &m %then %do;
%let tcmp&l&m=&&tcmp&m&l;
%end;

%else %do;

/* Each iteration of the loop represents an iteration of the summation from 1 to j */
				%do k=1 %to &num_c;
					%let comp&l&m&k=%sysevalf((&&x&l&k-(&&n&k*&p_bar))*(&&x&m&k-(&&n&k*&p_bar)));
					%let tcmp&l&m=%sysevalf(&&tcmp&l&m+&&comp&l&m&k);
			%end;
		%end;
	%end;
%end;

/* Sii’ calculated */
%do l=1 %to &num_t;
%do m=1 %to &num_t;
%let s&l&m=%sysevalf(&&tcmp&l&m*&part1&part3);
%end;
%end;

/* Chi-square value calculated, put into data set variable */
%let phatcalc=%sysevalf (&p_hat1 - &p_hat2);
%let numer=%sysevalf (&phatcalc*&phatcalc);
%let denom1=%sysevalf (&s11+&s22);
%let denom2=%sysevalf (2*&s12);
%let denom=%sysevalf (&denom1-&denom2);
%let chisq=%sysevalf (&numer/&denom);
data final;
chi=&chisq;
run;

/* p-value calculated with i-1 degrees of freedom, put into macro variable */
data _null_;
set final;
pvalue=1-probchi(chi, %eval(&num_t-1));
call symput ('pvalue', pvalue);
run;

/* Degrees of freedom put into macro variable */
%let dof=%eval(&num_t-1);

/* Calculations output to output window */
data _null_;
file print;
%do l=1 %to &num_t;
put @5 "p_hat&l = &&p_hat&l";
%end;
put;
put @5 "chi-square statistic = &chisq";
put;
put @5 "degrees of freedom = &dof";
put;
put @5 "p-value = &pvalue";
put;
put;
put @5 "S";
put;
%do l=1 %to &num_t;
%do m=1 %to &num_t;
%let fmt=%eval( %sysevalf (14*&m)-9);
put @&fmt "&&s&l&m" @;
%end;
put;
%end;
put;
put;
run;
%mend clustpro;

%clustpro (INFIL='C:\Users\Smart\Documents\sample.dat', NUM_C=21);

Reeza · Posted 03-11-2021 03:15 PM

This runs for me and generates the following:

  p_hat1 = 0.7843137255                                                                                                           
    p_hat2 = 0.9019607843                                                                                                           
    chi-square statistic = 0.00109576424212                                                                                         
    degrees of freedom = 1                                                                                                          
    p-value = 0.9735929848                                                                                                          
    S                                                                                                                               
     13.3106880011136                                                                                                               
                   2.43381100114915                                                                                                 
     2.43381100114915                                                                                                               
                   4.18814600118416

The values in Orange do not match the values in the paper using the sample data they provided.

I would personally find another example to verify my numbers as this is a user written paper - you have no certainty that this code is correct or accurate. Either the code is wrong or the data is wrong, but unless you exactly understand the process that would be very hard to know.

Full code to reproduce numbers, note that I modified the macro slightly to take an input data set instead of reading the data from a text file.

data exampleData;
infile cards ;
input i j xij nj;
cards;
1 1 0 3
1 2 2 3
1 3 3 3
1 4 1 1
1 5 2 3
1 6 4 4
1 7 3 3
1 8 2 2
1 9 2 2
1 10 1 1
1 11 2 3
1 12 2 2
1 13 3 3
1 14 2 2
1 15 0 2
1 16 2 3
1 17 2 3
1 18 2 3
1 19 2 2
1 20 1 1
1 21 2 2
2 1 2 3
2 2 3 3
2 3 3 3
2 4 1 1
2 5 3 3
2 6 4 4
2 7 3 3
2 8 2 2
2 9 1 2
2 10 1 1
2 11 2 3
2 12 2 2
2 13 3 3
2 14 2 2
2 15 2 2
2 16 2 3
2 17 2 3
2 18 3 3
2 19 2 2
2 20 1 1
2 21 2 2
;;;;
run;

/* infil=input datafile -
should contain i, j, xij, and nj(NOTE: Path to file should be enclosed in single quotes.)
num_c=number of clusters(patients) */

%macro clustpro (infil=, num_c=);

/* Dataset created from data file */
data tmp;
set &infil.;
run;

/* number of tests initialized to 2 */
%let num_t=2;

/* Input data set sorted by i */
proc sort data=tmp;
by i;
run;

/* Length of num_t and num_c computed (used later in formatting upper loop values */
%let lent=%length(&num_t);
%let lenc=%length(&num_c);

/* "." added to end of lengths (for formatting purposes) */
data _null_;
call symput('fort', &lent||'.');
call symput('forc', &lenc||'.');
run;

/* xij, nj put into macro variables and trimmed */
data _null_;
set tmp;

call symput ('x'||left(put(i, &fort))||left(put(j, &forc)), xij);
call symput ('n'||left(put(j, &forc)), nj);
run;

%do k=1 %to &num_c;
	%let n&k=%eval(&&n&k);

	%do l=1 %to &num_t;
		%let x&l&k=%eval(&&x&l&k);
	%end;
%end;

/* sums of xij and nj computed */
proc means data=tmp noprint;
by i;
var xij nj;
output out=a sum=sumx sumn;
run;

/* sum of njs put into macro variable (used later) */
data _null_;
set a;
call symput ('sumn', sumn);
run;

/* &sum=sum of p_hats. Will change with each iteration of loop. */
%let sum=0;

/* p_hat1, p_hat2 computed, put into macro variables */
%do k=1 %to &num_t;

data b&k;
set a;
if i=&k;
p_hat&k=sumx/sumn;
run;

data _null_;
set b&k;
tmp=&k;
call symput('p_hat'||left(put(tmp, 1.)), p_hat&k);
run;
%let sum=%sysevalf(&sum+&&p_hat&k);
%end;

/* p bar computed */
%let p_bar=%sysevalf (&sum/&num_t);

/* part1=j/(j-1) (Used in computing the matrix S) */
%let part1=%sysevalf (&num_c*(1/(&num_c-1)));

/* part3=(sum of njs)^2 (Used in computing the matrix S) */
%let part3=%eval (&sumn*&sumn);

/* Part 2 computed */
/* Each iteration of part 2 initialized to 0 */
%do l=1 %to &num_t;
%do m=1 %to &num_t;
%let tcmp&l&m=0;
%end;
%end;

/* l represents i, m represents i’ */
%do l=1 %to &num_t;
%do m=1 %to &num_t;

/* Since S is symmetric, tcmp&l&m is only calculated if tcmp&m&l hasn’t been calculated */
%if &l > &m %then %do;
%let tcmp&l&m=&&tcmp&m&l;
%end;

%else %do;

/* Each iteration of the loop represents an iteration of the summation from 1 to j */
				%do k=1 %to &num_c;
					%let comp&l&m&k=%sysevalf((&&x&l&k-(&&n&k*&p_bar))*(&&x&m&k-(&&n&k*&p_bar)));
					%let tcmp&l&m=%sysevalf(&&tcmp&l&m+&&comp&l&m&k);
			%end;
		%end;
	%end;
%end;

/* Sii’ calculated */
%do l=1 %to &num_t;
%do m=1 %to &num_t;
%let s&l&m=%sysevalf(&&tcmp&l&m*&part1&part3);
%end;
%end;

/* Chi-square value calculated, put into data set variable */
%let phatcalc=%sysevalf (&p_hat1 - &p_hat2);
%let numer=%sysevalf (&phatcalc*&phatcalc);
%let denom1=%sysevalf (&s11+&s22);
%let denom2=%sysevalf (2*&s12);
%let denom=%sysevalf (&denom1-&denom2);
%let chisq=%sysevalf (&numer/&denom);
data final;
chi=&chisq;
run;

/* p-value calculated with i-1 degrees of freedom, put into macro variable */
data _null_;
set final;
pvalue=1-probchi(chi, %eval(&num_t-1));
call symput ('pvalue', pvalue);
run;

/* Degrees of freedom put into macro variable */
%let dof=%eval(&num_t-1);

/* Calculations output to output window */
data _null_;
file print;
%do l=1 %to &num_t;
put @5 "p_hat&l = &&p_hat&l";
%end;
put;
put @5 "chi-square statistic = &chisq";
put;
put @5 "degrees of freedom = &dof";
put;
put @5 "p-value = &pvalue";
put;
put;
put @5 "S";
put;
%do l=1 %to &num_t;
%do m=1 %to &num_t;
%let fmt=%eval( %sysevalf (14*&m)-9);
put " ";
put @&fmt " &&s&l&m " @;
%end;
put;
%end;
put;
put;
run;
%mend clustpro;

%clustpro(INFIL=exampleData, NUM_C=21);

ron_horne · Posted 03-11-2021 03:39 PM

thank you @Reeza ,

this is what i get, but in the paper the results are as follows.

there is no error message in the log so i do not know how to fix it.

p_hat1 = 0.7843137255
p_hat2 = 0.9019607843
chi-square statistic = 2.8571429014
degrees of freedom = 1
p-value = 0.0909689455
S
0.0051048816 0.0009334091
0.0009334091 0.00016062272

FreelanceReinh · Posted 03-12-2021 10:02 AM

Hello @ron_horne,

I think the error is here:

/* Sii’ calculated */
%do l=1 %to &num_t;
%do m=1 %to &num_t;
%let s&l&m=%sysevalf(&&tcmp&l&m*&part1&part3);
%end;
%end;

With the example data, &part1=1.05, &part3=2601, which are 21/20 and 51**2, respectively, computed from the integer constants 21 and 51 involved in the study data. Apparently an operator is missing between &part1 and &part3 in the calculation of s&l&m because concatenation doesn't make sense. The fact that many computations in the macro are done with macro variables (none of which is declared as local ...) is a weakness of the code anyway.

It seems that the division operator (/) does a good job here (but without the Obuchowski paper [behind a paywall] I'm only guessing).

%let s&l&m=%sysevalf(&&tcmp&l&m*&part1/&part3);

At least the results get much closer to those in the article:

    p_hat1 = 0.7843137255
    p_hat2 = 0.9019607843
    chi-square statistic = 2.85714285599953
    degrees of freedom = 1
    p-value =  0.090968948

    S
     0.00510488158443
                   0.00093340907387
     0.00093340907387
                   0.00160622722075

The only significant difference occurs in value &s22 where the paper has an additional zero after the decimal point: 0.00016062272. But this must be a typo in the paper because otherwise the relationship

chisq = (p_hat1 - p_hat2)**2 / (s11+s22-2*s12)

would be violated and this equality holds also in the example found in the NESUG 2006 paper "Statistical Analysis of Clustered Data using SAS® System" (see Fig. 10, p. 6).

Note that matrix S is described in the Lieber/Ashley paper as "the variance-covariance matrix for p₁ and p₂", i.e., the covariance matrix of two random variables with values between 0 and 1 (if I interpret this correctly). As such the diagonal entries s11 and s22 must be between 0 and 0.25 and the other two (equal) entries s12 must be between -0.25 and 0.25 (regardless of the input data). So, values like 13.31, 2.43 and 4.19 as obtained with the uncorrected code are definitely wrong.

ron_horne · Posted 03-24-2021 11:53 AM

Dear @FreelanceReinh

THANK YOU! you must be correct.

sorry for the late response, the message went to my spam box.

your solution is correct and produces the right chiSqr as well as Pvalue.

this data is used in the following R package called: clust.bin.pair - https://cran.r-project.org/web/packages/clust.bin.pair/index.html also described here: https://cran.r-project.org/package=clust.bin.pair/clust.bin.pair.pdf
I must admit, this was not an easy task but you have hit the nail on the head.

All the best!

Ron

ron_horne · Posted 03-24-2021 12:10 PM

Just to conclude, thanks to @FreelanceReinh ,

this is a working version from a to z:

data exampleData;
infile cards ;
input i j xij nj;
cards;
1 1 0 3
1 2 2 3
1 3 3 3
1 4 1 1
1 5 2 3
1 6 4 4
1 7 3 3
1 8 2 2
1 9 2 2
1 10 1 1
1 11 2 3
1 12 2 2
1 13 3 3
1 14 2 2
1 15 0 2
1 16 2 3
1 17 2 3
1 18 2 3
1 19 2 2
1 20 1 1
1 21 2 2
2 1 2 3
2 2 3 3
2 3 3 3
2 4 1 1
2 5 3 3
2 6 4 4
2 7 3 3
2 8 2 2
2 9 1 2
2 10 1 1
2 11 2 3
2 12 2 2
2 13 3 3
2 14 2 2
2 15 2 2
2 16 2 3
2 17 2 3
2 18 3 3
2 19 2 2
2 20 1 1
2 21 2 2
;;;;
run;

/* infil=input datafile -
should contain i, j, xij, and nj(NOTE: Path to file should be enclosed in single quotes.)
num_c=number of clusters(patients) */

%macro clustpro (infil=, num_c=);

/* Dataset created from data file */
data tmp;
set &infil.;
run;

/* number of tests initialized to 2 */
%let num_t=2;

/* Input data set sorted by i */
proc sort data=tmp;
by i;
run;

/* Length of num_t and num_c computed (used later in formatting upper loop values */
%let lent=%length(&num_t);
%let lenc=%length(&num_c);

/* "." added to end of lengths (for formatting purposes) */
data _null_;
call symput('fort', &lent||'.');
call symput('forc', &lenc||'.');
run;

/* xij, nj put into macro variables and trimmed */
data _null_;
set tmp;

call symput ('x'||left(put(i, &fort))||left(put(j, &forc)), xij);
call symput ('n'||left(put(j, &forc)), nj);
run;

%do k=1 %to &num_c;
	%let n&k=%eval(&&n&k);

	%do l=1 %to &num_t;
		%let x&l&k=%eval(&&x&l&k);
	%end;
%end;

/* sums of xij and nj computed */
proc means data=tmp noprint;
by i;
var xij nj;
output out=a sum=sumx sumn;
run;

/* sum of njs put into macro variable (used later) */
data _null_;
set a;
call symput ('sumn', sumn);
run;

/* &sum=sum of p_hats. Will change with each iteration of loop. */
%let sum=0;

/* p_hat1, p_hat2 computed, put into macro variables */
%do k=1 %to &num_t;

data b&k;
set a;
if i=&k;
p_hat&k=sumx/sumn;
run;

data _null_;
set b&k;
tmp=&k;
call symput('p_hat'||left(put(tmp, 1.)), p_hat&k);
run;
%let sum=%sysevalf(&sum+&&p_hat&k);
%end;

/* p bar computed */
%let p_bar=%sysevalf (&sum/&num_t);

/* part1=j/(j-1) (Used in computing the matrix S) */
%let part1=%sysevalf (&num_c*(1/(&num_c-1)));

/* part3=(sum of njs)^2 (Used in computing the matrix S) */
%let part3=%eval (&sumn*&sumn);

/* Part 2 computed */
/* Each iteration of part 2 initialized to 0 */
%do l=1 %to &num_t;
%do m=1 %to &num_t;
%let tcmp&l&m=0;
%end;
%end;

/* l represents i, m represents i’ */
%do l=1 %to &num_t;
%do m=1 %to &num_t;

/* Since S is symmetric, tcmp&l&m is only calculated if tcmp&m&l hasn’t been calculated */
%if &l > &m %then %do;
%let tcmp&l&m=&&tcmp&m&l;
%end;

%else %do;

/* Each iteration of the loop represents an iteration of the summation from 1 to j */
				%do k=1 %to &num_c;
					%let comp&l&m&k=%sysevalf((&&x&l&k-(&&n&k*&p_bar))*(&&x&m&k-(&&n&k*&p_bar)));
					%let tcmp&l&m=%sysevalf(&&tcmp&l&m+&&comp&l&m&k);
			%end;
		%end;
	%end;
%end;

/* Sii’ calculated */
%do l=1 %to &num_t;
%do m=1 %to &num_t;
%let s&l&m=%sysevalf(&&tcmp&l&m*&part1/&part3);
%end;
%end;

/* Chi-square value calculated, put into data set variable */
%let phatcalc=%sysevalf (&p_hat1 - &p_hat2);
%let numer=%sysevalf (&phatcalc*&phatcalc);
%let denom1=%sysevalf (&s11+&s22);
%let denom2=%sysevalf (2*&s12);
%let denom=%sysevalf (&denom1-&denom2);
%let chisq=%sysevalf (&numer/&denom);
data final;
chi=&chisq;
run;

/* p-value calculated with i-1 degrees of freedom, put into macro variable */
data _null_;
set final;
pvalue=1-probchi(chi, %eval(&num_t-1));
call symput ('pvalue', pvalue);
run;

/* Degrees of freedom put into macro variable */
%let dof=%eval(&num_t-1);

/* Calculations output to output window */
data _null_;
file print;
%do l=1 %to &num_t;
put @5 "p_hat&l = &&p_hat&l";
%end;
put;
put @5 "chi-square statistic = &chisq";
put;
put @5 "degrees of freedom = &dof";
put;
put @5 "p-value = &pvalue";
put;
put;
put @5 "S";
put;
%do l=1 %to &num_t;
%do m=1 %to &num_t;
%let fmt=%eval( %sysevalf (14*&m)-9);
put " ";
put @&fmt " &&s&l&m " @;
%end;
put;
%end;
put;
put;
run;
%mend clustpro;

%clustpro(INFIL=exampleData, NUM_C=21);

Registration is open