Hello, I am trying to build a Macro program for simple text analysis following the methodology of the authors in Paper 2557-2018 "A simple approach to text analysis using SAS functions". (Available at <https://www.sas.com/content/dam/SAS/support/en/sas-global-forum-proceedings/2018/2557-2018.pdf>). However I have been unable to replicate the results that the authors achieved for my own data, including when using the data example the authors provided sashelp.adsmsg. I believe that this is due to how SAS is reading the selected variable in the INDEX and INDEXW functions in the MACRO, although I am still learning SAS and my debugging skills leave a bit to be desired. Any help or ideas would be greatly appreciated! I have included my full code use to make the program below. The first module works well: %LET ds_name = sashelp.adsmsg;
%LET Var_Data_Source = TEXT;
DATA Bag_of_words;
SET &ds_name;
Var_Data_Source=COMPBL(TRANSLATE(&Var_Data_Source, " " , "."","";"":""?""!""-""/""\""
""%""1""2""3""4""5""6""7""8""9""0""$""@""#"")""("));
N_words=COUNTW(&Var_Data_Source);
ARRAY word {1:1000} $50 _TEMPORARY_;
DO i=1 TO N_words;
RETAIN word_i;
word(i)=SCAN( Var_Data_Source, i, ' ' );
word_i=UPCASE(word(i));
IF (i>1) THEN word_2i= UPCASE(catx(" ",word(i-1), word_i));
IF (i>2) THEN word_3i= UPCASE(catx(" ",word(i-2), word(i-1), word_i));
IF (i>3) THEN word_4i= UPCASE(catx(" ",word(i-3), word(i-2), word(i-1), word_i));
IF (i>4) THEN word_5i= UPCASE(catx(" ",word(i-4), word(i-3), word(i-2), word(i-1), word_i));
OUTPUT;
END;
RUN;
PROC PRINT DATA = Bag_of_words; RUN;
PROC FREQ DATA = Bag_of_words; TABLE word_i word_2i word_3i word_4i/NOCUM; RUN;
Data Term_matrix;
INFILE datalines;
INPUT in_words $1-50;
in_words=TRANSLATE(UPCASE(in_words), "" , '.'','';'':''?''!''-''/''\');
out_words=SUBSTR(TRANSLATE(STRIP(in_words),"_"," "),1,32);
DATALINES;
cultion
calcultion
calculation
;
RUN;
PROC PRINT DATA= Term_matrix; RUN; The second module uses the Macro program (listed later): %INCLUDE "/folders/myfolders/EPG194/Macro_KW_search.sas";
DATA _NULL_;
SET Term_matrix END=eof;
in_words_1=STRIP("'"||in_words||"'");
CALL SYMPUT("Keyword_in",UPCASE(in_words_1));
CALL SYMPUT("Keyword_out",out_words);
IF (_N_=1) THEN DO;
CALL EXECUTE('%CERTAINTY_FACTOR (Data_IN=&ds_name, Data_OUT=Term_Doc_Matrix)');
END;
CALL EXECUTE('%KW_SEARCH (KW=&Keyword_in, Var_KW_out=&Keyword_out, Var_Target_doc=TEXT,
Data_IN=Term_Doc_Matrix, Data_OUT=Term_Doc_Matrix)');
RUN;
PROC SQL NOPRINT;
SELECT out_words INTO: v_list_comma_sep separated BY ',' FROM Term_matrix;
SELECT in_words INTO: v_list_blank_sep separated BY ' ' FROM Term_matrix;
QUIT;
PROC TABULATE DATA=Term_Doc_Matrix;
CLASS certainty_factor TEXT;
Var &v_list_blank_sep;
Table (&v_list_blank_sep), (certainty_factor='certainty factor' ALL='No of Terms found')*SUM='' N='No of Documents';
Table TEXT='Target Documents'*(&v_list_blank_sep), (certainty_factor='certainty factor' ALL='Total')*SUM='';
RUN; This is the Macro program that the authors use and where I believe that I have a problem, although I am not sure. You can see that if the INDEXW and INDEX functions return a positive value that this should then be recorded in the temporary array and then in a count variable, however I am unable to achieve this. I tried debugging the program directly by using the INDEX and INDEXW functions directly with the variable value that I was looking to match (I am not sure if this is very conventional) and had values returned for Target_truncated but not for KW variable - leading me to thing that the issue is with the KW variable. %MACRO CERTAINTY_FACTOR ( Data_OUT=, Data_IN=);
DATA &Data_OUT;
SET &Data_IN;
DO Certainty_factor=1 to 3;
OUTPUT;
END;
%MEND;
%MACRO KW_SEARCH(KW=, Var_KW_in=, Var_KW_out=, Data_IN=, Data_OUT=, Var_Target_Doc=);
DATA &Data_OUT Replace;
ATTRIB Flag_success length=3;
KW= UPCASE(&KW);
SET &Data_IN;
Target_string=UPCASE(COMPBL(TRANSLATE(&Var_Target_Doc, " " , ".%,;:?!-/\")));
KW_words = COUNTW(KW);
N_words = COUNTW(Target_string);
ARRAY word {1:1000} $50 _TEMPORARY_ ;
ARRAY IdW {1:100} _TEMPORARY_ ;
ARRAY Idx {1:100} _TEMPORARY_ ;
ARRAY Sdx {1:100} _TEMPORARY_ ;
Soundex_Count=0; Index_count=0; IndexW_count=0;
DO i=1 TO (N_words);
word(i)=SCAN( Target_string, i, ' ' );
IF i GE(KW_words) THEN DO;
length Target_truncated $50;
Target_truncated='';
DO j=1 TO KW_words;
Target_truncated= UPCASE(STRIP(CATX(" ", word(i-j+1) ,Target_truncated)));
END;
END;
IF (INDEXW(Target_truncated, STRIP(KW))>0) THEN IdW(i)=1; ELSE IdW(i)=0;
IF (INDEX(COMPRESS(Target_truncated), COMPRESS(KW))>0) THEN Idx(i) =1; ELSE Idx(i) =0;
IF (INDEX(SOUNDEX(Target_truncated), SOUNDEX(KW))>0) THEN Sdx(i) =1; ELSE Sdx(i) =0;
IndexW_count=IndexW_count + IdW(i);
Index_count=Index_count + Idx(i);
Soundex_Count=Soundex_Count + Sdx(i);
END;
IF (Certainty_factor=1) AND (IndexW_count>=0) THEN DO;
&Var_KW_out=IndexW_count; END;
IF (Certainty_factor=2) AND ((Index_count-IndexW_count)>=0) THEN DO;
&Var_KW_out=(Index_count-IndexW_count); END;
IF (Certainty_factor=3) AND ((Soundex_Count-IndexW_count)>=0) THEN DO;
&Var_KW_out=(Soundex_Count-IndexW_count); END;
IF &Var_KW_out>=0 THEN Flag_success=1;
DROP i j KW KW_words N_words Soundex_Count Index_count IndexW_count Target_truncated Target_string;
%MEND; If you have any ideas about how I can figure out what the problem might be for me I would really appreciate the help. Thanks in advance!
... View more