Thanks Jan. Your suggestion to output to PDF helped me figure out the problem: I didn't include enough iterations in the do loop in the code below. I used 30 before (see the loop over i), but increasing it to 45 made all of the players show up. The strange thing was that with 30 iterations I got two blank rows after the last row containing player information, which made me think, mistakenly, that I had reached the end of the string. Coincidentally, when I did a proc print (ODS HTML...) of the HTML string I was using, the output stopped at the same player (perhaps because of a limit on the ODS HTML output?). When I output to PDF I was able to see all of the players.
/*
 * Stats_pre: split a scraped box-score HTML string into one row per player.
 *
 * Input : data set HTTP, character variable RECORD holding the page source.
 * Output: one observation per '@'-delimited player segment for which an
 *         ESPN id could be extracted, with the box-score columns as text.
 *
 * Approach: strip characters that would collide with the delimiters, replace
 * the marker "playercard" with '@' (one segment per player) and "class=" with
 * '~' (one field per table cell), then pull each field out with a
 * column-specific regular expression.
 */
data Stats_pre;
    set http;
    keep iso_player Player_name ESPNID ESPNNAME Position Minutes fg _3pt ft
         oreb dreb reb ast stl blk to pf plusminus pts;
    length iso_player $750 ESPNNAME $50
           Player_name Position Minutes fg _3pt ft oreb dreb reb ast stl blk to pf plusminus pts $25;

    /* Set pattern matching */
    replace_delim_re  = prxparse('s/(playercard)/@/i');
    replace_delim2_re = prxparse('s/(class\=)/~/i');
    delete3_re        = prxparse('s/[\<\>]//i');
    starters_re       = prxparse('/Starters/');
    ESPNID_re         = prxparse('@\/id\/\d+\/@');   /* /id/3970/demarre-carroll */
    ESPNNAME_re       = prxparse('%\/[a-z\-]+\>\<%');
    _null_re          = prxparse("/zzzzzzz/i");      /* placeholder pattern for field 1 */
    Player_name_re    = prxparse("/\>\w+\.\s*[a-z\-\s\.\']+\</i");
    Position_re       = prxparse("/\>\w+\</i");
    Minutes_re        = prxparse("/\>\d+\</i");
    fg_re             = prxparse("/\>\d+\-\d+\</i");
    num_re            = prxparse("/\>\d+\</i");
    plusminus_re      = prxparse("/\>[\+\-]\d+\</i");

    /* Field j of a player segment is matched with colre{j} and stored in cols{j}. */
    /* NOTE(review): _null is not declared in the LENGTH statement above, so its
       type comes from elsewhere (or defaults) - confirm it is character. */
    array cols [17] _null Player_name Position Minutes fg _3pt ft oreb dreb reb ast stl blk to pf plusminus pts;
    array colre[17] _null_re Player_name_re Position_re Minutes_re fg_re fg_re fg_re num_re num_re num_re num_re num_re num_re num_re num_re plusminus_re num_re;

    /* Remove double quotes and any pre-existing '@' or '~' so that the only
       '@' and '~' left in RECORD are the delimiters inserted below. (The former
       regex s/\"\@\~// matched only the literal sequence "@~ and, running after
       the quotes were already stripped, never removed anything.) */
    record = compress(record, '"@~');
    call prxchange(replace_delim_re,  -1, record);   /* replace "playercard" with @ as delimiter */
    call prxchange(replace_delim2_re, -1, record);   /* replace "class=" with ~ as delimiter */

    if prxmatch('/boxscore_tabs/i', record) then do;
        call prxsubstr(starters_re, record, pos, len);

        /* Only proceed when "Starters" was found; pos=0 would be an invalid
           SUBSTR argument. */
        if pos > 0 then do;
            /* Strip all text prior to boxscore data */
            record = substr(record, pos);

            /* Walk every '@' segment actually present. The count used to be
               hard-coded (30, later 45), which silently dropped players
               whenever the page held more segments than the constant. */
            n_segments = countw(record, '@');

            do i = 1 to n_segments;
                iso_player = scan(record, i, '@');

                do j = 1 to 17;
                    temp = scan(iso_player, j, '~');

                    /* The first field carries the player link: /id/<digits>/<name> */
                    if j eq 1 then do;
                        call prxsubstr(ESPNID_re, temp, pos1, len1);
                        /* Skip the leading "/id/" and the trailing "/" of the match */
                        if pos1 ne 0 then ESPNID = input(substr(temp, pos1+4, len1-5), 8.);
                        else ESPNID = .;

                        call prxsubstr(ESPNNAME_re, temp, pos2, len2);
                        /* Skip the leading "/" and the trailing "><" of the match */
                        if pos2 ne 0 then ESPNNAME = substr(temp, pos2+1, len2-3);
                        else ESPNNAME = '';
                    end;

                    call prxsubstr(colre{j}, temp, posJ, lenJ);
                    if posJ ne 0 then cols{j} = substr(temp, posJ, lenJ);
                    else cols{j} = '';

                    /* Drop the surrounding angle brackets captured by the pattern */
                    call prxchange(delete3_re, -1, cols{j});

                    /* For "DNP" segments keep only name and position */
                    if find(iso_player, 'DNP') and j not in (2 3) then cols{j} = '';

                    /* Emit the row once all 17 fields are filled, but only for
                       segments that yielded an ESPN id */
                    if j eq 17 and ESPNID ne . then output;
                end;
            end;
        end;
    end;
run;
... View more