<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic TF-IDF with SAS Viya in SAS Data Science</title>
    <link>https://communities.sas.com/t5/SAS-Data-Science/TF-IDF-with-SAS-Viya/m-p/700408#M8500</link>
    <description>&lt;P&gt;Hello everyone,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Is it possible to build a TF-IDF Matrix in SAS Viya Model Studio? The Text Mining Node just creates topic columns.&lt;/P&gt;&lt;P&gt;Is there a way to build the "classic" TF-IDF?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Thanks a lot!&lt;/P&gt;&lt;P&gt;Melanie&lt;/P&gt;</description>
    <pubDate>Fri, 20 Nov 2020 07:28:29 GMT</pubDate>
    <dc:creator>Melanie3</dc:creator>
    <dc:date>2020-11-20T07:28:29Z</dc:date>
    <item>
      <title>TF-IDF with SAS Viya</title>
      <link>https://communities.sas.com/t5/SAS-Data-Science/TF-IDF-with-SAS-Viya/m-p/700408#M8500</link>
      <description>&lt;P&gt;Hello everyone,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Is it possible to build a TF-IDF Matrix in SAS Viya Model Studio? The Text Mining Node just creates topic columns.&lt;/P&gt;&lt;P&gt;Is there a way to build the "classic" TF-IDF?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Thanks a lot!&lt;/P&gt;&lt;P&gt;Melanie&lt;/P&gt;</description>
      <pubDate>Fri, 20 Nov 2020 07:28:29 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Data-Science/TF-IDF-with-SAS-Viya/m-p/700408#M8500</guid>
      <dc:creator>Melanie3</dc:creator>
      <dc:date>2020-11-20T07:28:29Z</dc:date>
    </item>
    <item>
      <title>Re: TF-IDF with SAS Viya</title>
      <link>https://communities.sas.com/t5/SAS-Data-Science/TF-IDF-with-SAS-Viya/m-p/835854#M10316</link>
      <description>&lt;P&gt;Hi Melanie,&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I was reading old messages and saw that yours had not been answered. I have used this approach with good results for a customer. Of course, it depends on your text data and the assumptions of your model, but for the general task of classifying a document into a class, it works fine. You can generate TF-IDF / SVD features and use them in a classification model.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;		/* TF-IDF / SVDD Parameters */
	/* Minimum term frequency handed to REDUCEF= in the PARSE statement. */
	%let min_freq_doc = 150;
	/* Terms must have Freq above this value to pass the WHERE filter on the terms table. */
	%let min_freq_overall = 150;
	/* Value passed to NUMLABELS= in the SVD statement. */
	%let numLabels = 10;
	/* Value passed to MAX_K= in the SVD statement. */
	%let max_k = 50;
	/* Row cap (OBS=) applied when the top-ranked terms are selected. */
	%let LimitTermQty = 750;

	/* Partition the base table using two 15 percent samples (SAMPPCT=, SAMPPCT2=),
	   processed BY DEFERIMENTO with SEED=8, and copy every input column to the output. */
	/* NOTE(review): later steps read partitioned_data_train and partitioned_data_test,
	   which this step does not create; the split of partitioned_data is presumably
	   done elsewhere - confirm. */
	PROC PARTITION
		DATA     = &amp;amp;caslibname..BASE_DEFERIMENTO_PROC_SS
		SAMPPCT  = 15
		SAMPPCT2 = 15
		SEED     = 8;
		BY DEFERIMENTO;
		OUTPUT OUT = &amp;amp;caslibname..partitioned_data COPYVARS = (_ALL_);
	RUN;

	/* Parse the training documents and compute the SVD projections. */
	/* The PARSE statement requests LOG cell weights and ENTROPY term weights.
	   NOTE(review): presumably this weighting is what ends up in _COUNT_ of the
	   OUTPARENT= table, so there is no separate TF x IDF multiplication step - confirm. */
	PROC TEXTMINE DATA = &amp;amp;caslibname..partitioned_data_train LANGUAGE = Portuguese;
		DOC_ID Document_ID;
		VAR texto;
		PARSE
			NONOUNGROUPS
			TERMWGT   = ENTROPY
			CELLWGT   = LOG
			ENTITIES  = NONE
			REDUCEF   = &amp;amp;min_freq_doc.
			/* START = RDMF.STARTLIST - enable this option if you have a start list */
			OUTPARENT = &amp;amp;caslibname..outparent_train
			OUTTERMS  = &amp;amp;caslibname..outterms_train
			OUTPOS    = &amp;amp;caslibname..outpos_train
			OUTCHILD  = &amp;amp;caslibname..outchild_train
			OUTCONFIG = &amp;amp;caslibname..outconfig_train;

		/* Ignore terms tagged with these roles. */
		SELECT "PPOS" "DET" "PN" "N" / IGNORE;

		SVD
			MAX_K     = &amp;amp;max_k.
			NUMLABELS = &amp;amp;numLabels.
			OUTDOCPRO = &amp;amp;caslibname..outdocpro_train
			SVDU      = &amp;amp;caslibname..svdu_train
			OUTTOPICS = &amp;amp;caslibname..outtopics_train;

		/* Store the fitted model state in an RSTORE table. */
		SAVESTATE RSTORE = &amp;amp;caslibname..outsvdmodel_train;
	RUN;

/* Copy the terms, config and SVD U tables into the RDMF library so they are available when the test data set is scored. */
	DATA RDMF.outterms_train;
		SET &amp;amp;caslibname..outterms_train;
	RUN;

	DATA RDMF.outconfig_train;
		SET &amp;amp;caslibname..outconfig_train;
	RUN;

	DATA RDMF.svdu_train;
		SET &amp;amp;caslibname..svdu_train;
	RUN;
	
	/* Sort the terms table by (Parent_id, Term) and keep one row per key. */
	PROC SORT data = &amp;amp;caslibname..outterms_train out=&amp;amp;caslibname..outterms_train_aux NODUPKEY;
		BY Parent_id Term;
	RUN;

	/* Limit the terms by their overall frequency across documents: keep only
	   non-blank terms longer than 3 and at most 25 characters whose Freq is
	   above the min_freq_overall macro variable. */
	/* The original script ran this exact step twice in a row; the second pass read
	   and rewrote the already filtered table and changed nothing, so it runs once. */
	PROC SORT data=&amp;amp;caslibname..outterms_train_aux out=&amp;amp;caslibname..outterms_train_aux NODUPKEY;
		BY Parent_id Term;
		WHERE ((Term not in (""," ")) and (LENGTH(Term) &amp;gt; 3) and (LENGTH(Term) &amp;lt;= 25) and (Freq &amp;gt; &amp;amp;min_freq_overall.));
	RUN;
	
	/* Copy the filtered terms into RDMF, exposing Parent_id under the name _TERMNUM_
	   and removing blanks from Term with COMPRESS. */
	data RDMF.outterms_train_aux;
		set &amp;amp;caslibname..outterms_train_aux (rename = (Parent_id = _TERMNUM_));
		Term = compress(Term);
	run;

	/* Order the RDMF copy by _TERMNUM_ in place, ready for the later BY-merge. */
	proc sort data = RDMF.outterms_train_aux;
		by _TERMNUM_;
	run;
	
	/* LIMITING TERMS QUANTITY - for algorithms that are costly to run with a large number of features. */
	/* Rank every _TERMNUM_ by the standard deviation of its _COUNT_ values, highest first.
	   NOTE(review): the column is named FREQ_TERM but it is computed with STD(), not a
	   frequency - confirm that this ranking criterion is intended. */
	proc sql;
		create table outparent_train_aux as
			select
				_TERMNUM_,
				std(_COUNT_) as FREQ_TERM
			from &amp;amp;caslibname..outparent_train
			group by _TERMNUM_
			order by FREQ_TERM desc;
	quit;

	/* Keep only the first LimitTermQty rows of the ranked term list. */
	data &amp;amp;caslibname..outparent_train_aux;
		set outparent_train_aux (obs = &amp;amp;LimitTermQty.);
	run;

	/* Order the selected terms by _TERMNUM_ for the BY-merge that follows. */
	proc sort data = &amp;amp;caslibname..outparent_train_aux;
		by _TERMNUM_;
	run;

	/* Keep a copy of the selected term list in RDMF. */
	data RDMF.outparent_train_aux;
		set &amp;amp;caslibname..outparent_train_aux;
	run;
	
	/* Restrict the parent table to the selected terms: a row is kept only when
	   its _TERMNUM_ is present in the selected term list. */
	data &amp;amp;caslibname..outparent_train_red;
		merge
			&amp;amp;caslibname..outparent_train     (in = in_parent)
			&amp;amp;caslibname..outparent_train_aux (in = in_selected keep = _TERMNUM_);
		by _TERMNUM_;
		if not in_selected then delete;
	run;

	/* Print the metadata of the reduced table as a quick check. */
	proc contents data = &amp;amp;caslibname..outparent_train_red;
	run;
	
	/* Load the renamed and sorted terms table from RDMF back into the CAS library,
	   replacing the earlier table of the same name. */
	DATA &amp;amp;caslibname..outterms_train_aux;
		SET RDMF.outterms_train_aux;
	RUN;

	/* Attach the Term text to each row of the reduced parent table; rows that
	   exist only in the terms table are dropped. */
	data RDMF.MATRIZ_TERMOS_DOCUMENTOS_AUX;
		merge
			&amp;amp;caslibname..outparent_train_red (in = in_parent)
			&amp;amp;caslibname..outterms_train_aux  (in = in_terms keep = Term _TERMNUM_);
		by _TERMNUM_;
		if not in_parent then delete;
	run;

	/* Order by document and term in place, keeping one row per (_DOCUMENT_, _TERMNUM_). */
	proc sort data = RDMF.MATRIZ_TERMOS_DOCUMENTOS_AUX nodupkey;
		by _DOCUMENT_ _TERMNUM_;
	run;

	/* Add two constant columns to the TF-IDF rows: TF_IDF_ (the text 'TF_IDF_')
	   and DATASET (the text 'Train'). */
	DATA RDMF.MATRIZ_TERMOS_DOCUMENTOS_AUX;
		SET RDMF.MATRIZ_TERMOS_DOCUMENTOS_AUX;
		/* Keep the FORMAT statement ahead of the DATASET assignment; statement
		   order can affect how the new variable is defined. */
		FORMAT DATASET $20.;
		TF_IDF_ = 'TF_IDF_';
		DATASET = 'Train';
	RUN;


	/* Pivot to the document-by-term matrix: one row per _DOCUMENT_, one column per
	   _TERMNUM_ (column names start with TF_IDF_), cells taken from _COUNT_. */
	PROC TRANSPOSE
		DATA   = RDMF.MATRIZ_TERMOS_DOCUMENTOS_AUX
		OUT    = RDMF.MATRIZ_TERMOS_DOCUMENTOS
		PREFIX = TF_IDF_;
		BY _DOCUMENT_;
		ID _TERMNUM_;
		VAR _COUNT_;
	RUN;
	
	/* Finally, score the test data set using the terms, config and SVD U tables
	   produced from the training data. */
	/* NOTE(review): the document id here is numero_processo, while the training
	   PROC TEXTMINE step uses Document_ID - confirm that this difference is intended. */
	PROC TMSCORE
		DATA      = &amp;amp;caslibname..partitioned_data_test
		TERMS     = &amp;amp;caslibname..outterms_train
		CONFIG    = &amp;amp;caslibname..outconfig_train
		OUTPARENT = &amp;amp;caslibname..outparent_test
		SVDU      = &amp;amp;caslibname..svdu_train
		SVDDOCPRO = &amp;amp;caslibname..outdocpro_test;
		DOC_ID numero_processo;
		VAR texto;
	RUN;
	
	/*Do the same previous process to test dataset*/
	/* Repeat the same process for the validation and test data sets, then join all of them to use as the input of your model. */
	
&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Thu, 29 Sep 2022 14:31:49 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Data-Science/TF-IDF-with-SAS-Viya/m-p/835854#M10316</guid>
      <dc:creator>BigRider</dc:creator>
      <dc:date>2022-09-29T14:31:49Z</dc:date>
    </item>
    <item>
      <title>Re: TF-IDF with SAS Viya</title>
      <link>https://communities.sas.com/t5/SAS-Data-Science/TF-IDF-with-SAS-Viya/m-p/907003#M10673</link>
      <description>&lt;P&gt;Hello there, &lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/300631"&gt;@BigRider&lt;/a&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Thank you for sharing your code.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;As I reviewed the SAS code provided, I couldn't locate the specific step where TF and IDF values are multiplied to calculate the TF-IDF scores. If you could kindly point me in the right direction or provide some tips on this, I'd greatly appreciate it.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Thanks in advance.&lt;/P&gt;</description>
      <pubDate>Fri, 08 Dec 2023 17:26:08 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Data-Science/TF-IDF-with-SAS-Viya/m-p/907003#M10673</guid>
      <dc:creator>alisio_meneses</dc:creator>
      <dc:date>2023-12-08T17:26:08Z</dc:date>
    </item>
    <item>
      <title>Re: TF-IDF with SAS Viya</title>
      <link>https://communities.sas.com/t5/SAS-Data-Science/TF-IDF-with-SAS-Viya/m-p/907441#M10676</link>
      <description>&lt;P&gt;Hello&amp;nbsp;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/396980"&gt;@alisio_meneses&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;In PROC TEXTMINE, these are the&lt;/P&gt;
&lt;P&gt;&lt;STRONG&gt;Term-by-Document Matrix Creation Options&lt;/STRONG&gt;&lt;/P&gt;
&lt;UL&gt;
&lt;LI&gt;CELLWGT=&amp;nbsp; &amp;nbsp; Specifies how cells are weighted&lt;/LI&gt;
&lt;LI&gt;REDUCEF=&amp;nbsp; &amp;nbsp; &amp;nbsp;Specifies the frequency for term filtering&lt;/LI&gt;
&lt;LI&gt;TERMWGT=&amp;nbsp; Specifies how terms are weighted&lt;/LI&gt;
&lt;/UL&gt;
&lt;P&gt;in the PARSE statement.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Koen&lt;/P&gt;</description>
      <pubDate>Mon, 11 Dec 2023 22:55:31 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Data-Science/TF-IDF-with-SAS-Viya/m-p/907441#M10676</guid>
      <dc:creator>sbxkoenk</dc:creator>
      <dc:date>2023-12-11T22:55:31Z</dc:date>
    </item>
  </channel>
</rss>

