I think I figured this out. Just posting the answer for others.
The /o apparently just like the o option in the SAS function compress. If you have something complicated in your arguments, the system will figure out that complication only once, rather than each time. It's good if you know the resolved argument isn't going to change. However, upon reading, people generally say that one should never use /o anymore in Perl, because it automatically does this kind of check itself and /o can introduce other logic issues.
Regarding \w, this matches any letter at all that is deemed a valid letter by your language selection. ô is a valid letter on my computer, as are é, ß, õ, ü, î, etc. So \w will select any of these, and not \w will select none of these. I had to use [_A-Za-z0-9] without the /i option or [_a-z0-9] with the /i option in order to truly select valid SAS name characters. For the formula suggested, it required the 'not' version so it became [^_a-z0-9] with the /i option.
Lastly, prxChange is very slow. I figured it would be faster than SAS statements, but after running tests on various size datasets (1 million - 100 million records), SAS statements seem 3-5x faster. Even when requiring multiple statements, SAS was still quicker than prxChange. I was quite surprised. Here are some times I measured from the log.
Time of simple substitution (s) Time of substitution to valid SAS name (s) Using prxChange Using SAS statements Using prxChange Using SAS statements 1M records 4.83 4.73 1.46 1.28 5.01 4.90 0.99 0.97
5M records 22.50 25.15 7.63 7.47
10M records 46.42 48.07 15.78 14.24
15M records 68.60 72.20 23.13 24.49
25M records 114.34 120.24 41.45 37.58
50M records 246.17 245.42 77.19 75.68
75M records 392.11 360.53 123.49 118.56
100M records 516.63 472.15 156.91 151.37 143.99 512.24 113.03 108.49 147.51
* Set up datasets of various sizes with random strings. *;
data test100 test075 test050 test025 test015 test010 test005 test001;
length i j length 8. string $ 32. _1-_32 $ 1.;
array __ {*} _1-_32;
do i = 1 to 100000000;
length = floor(32*rand('uniform')) + 1;
do j = 1 to 32;
if j <= length
then __[j] = byte(floor(256*rand('uniform')));
else __[j] = '';
end;
string = cat(of __{*});
output test100;
if i <= 75000000 then output test075;
if i <= 50000000 then output test050;
if i <= 25000000 then output test025;
if i <= 15000000 then output test015;
if i <= 10000000 then output test010;
if i <= 5000000 then output test005;
if i <= 1000000 then output test001;
end;
drop i j _:;
run;
* Test replacing without restriction on the first character not being a digit. *;
%macro repeat;
%do i = 1 %to 17;
data test001PRX; set test001; length valid $ 32.; valid = prxchange('s/[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test005PRX; set test005; length valid $ 32.; valid = prxchange('s/[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test010PRX; set test010; length valid $ 32.; valid = prxchange('s/[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test015PRX; set test015; length valid $ 32.; valid = prxchange('s/[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test025PRX; set test025; length valid $ 32.; valid = prxchange('s/[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test050PRX; set test050; length valid $ 32.; valid = prxchange('s/[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test075PRX; set test075; length valid $ 32.; valid = prxchange('s/[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test100PRX; set test100; length valid $ 32.; valid = prxchange('s/[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test001TRN; set test001; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); run;
data test005TRN; set test005; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); run;
data test010TRN; set test010; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); run;
data test015TRN; set test015; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); run;
data test025TRN; set test025; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); run;
data test050TRN; set test050; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); run;
data test075TRN; set test075; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); run;
data test100TRN; set test100; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); run;
%end;
%mend repeat;
%repeat;
proc compare base = test100TRN compare = test100PRX listAll; run;
* Test again with condition of making first character an underscore if a digit too. *;
%macro repeat2;
%do i = 1 %to 17;
data test001PRX; set test001; length valid $ 32.; valid = prxchange('s/^[^_A-Z]|[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test005PRX; set test005; length valid $ 32.; valid = prxchange('s/^[^_A-Z]|[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test010PRX; set test010; length valid $ 32.; valid = prxchange('s/^[^_A-Z]|[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test015PRX; set test015; length valid $ 32.; valid = prxchange('s/^[^_A-Z]|[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test025PRX; set test025; length valid $ 32.; valid = prxchange('s/^[^_A-Z]|[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test050PRX; set test050; length valid $ 32.; valid = prxchange('s/^[^_A-Z]|[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test075PRX; set test075; length valid $ 32.; valid = prxchange('s/^[^_A-Z]|[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test100PRX; set test100; length valid $ 32.; valid = prxchange('s/^[^_A-Z]|[^_A-Z0-9]/_/i',-1,strip(string)); run;
data test001TRN; set test001; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); if anyDigit(substr(valid, 1, 1)) then substr(valid, 1, 1) = '_'; run;
data test005TRN; set test005; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); if anyDigit(substr(valid, 1, 1)) then substr(valid, 1, 1) = '_'; run;
data test010TRN; set test010; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); if anyDigit(substr(valid, 1, 1)) then substr(valid, 1, 1) = '_'; run;
data test015TRN; set test015; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); if anyDigit(substr(valid, 1, 1)) then substr(valid, 1, 1) = '_'; run;
data test025TRN; set test025; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); if anyDigit(substr(valid, 1, 1)) then substr(valid, 1, 1) = '_'; run;
data test050TRN; set test050; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); if anyDigit(substr(valid, 1, 1)) then substr(valid, 1, 1) = '_'; run;
data test075TRN; set test075; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); if anyDigit(substr(valid, 1, 1)) then substr(valid, 1, 1) = '_'; run;
data test100TRN; set test100; length valid $ 32.; valid = translate(strip(string), repeat('_', length(strip(string))), compress(strip(string),'_abcdefghijklmnopqrstuvwxyz', 'di')); if anyDigit(substr(valid, 1, 1)) then substr(valid, 1, 1) = '_'; run;
%end;
%mend repeat2;
%repeat2;
proc compare base = test100TRN compare = test100PRX listAll; run;
... View more