<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: how to sum the values of similar words? in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/how-to-sum-the-values-of-similar-words/m-p/315170#M68700</link>
    <description>&lt;P&gt;Your example shows that the first 5 characters of WORDS looks to be good. You may choose 4 or 3 or 2 characters from any position using susbstr() function.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;In the following code, I have declared W as 5-character string to which &amp;nbsp;when I assign WORDS takes the first leftmost&lt;/P&gt;&lt;P&gt;5 characters. Then DOW loop is used to get the sums. You may pre-sort the HAVE data set by W. In this example sorting is not a must as words are already GROUPED(so NOTSORTED option is used).&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data have;
length w $5;
input words :$10. d1 d2 d3 d4;
w = words;
datalines;
beast .  5  4  8
beasts   1  7  3  6
apopt 2  .  .  3
apopto   4  8  7  5
succinate   5  9  .  3
succinates  6  6  .  2
;
run;

data need;
   do until(last.w);
      set have ;
      by w notsorted;

      array sum[4];
      array k[4] d1 - d4;
      if first.w then call missing(of sum[*]);
      do i = 1 to dim(k); sum[i] + k[i]; end;
   end;
keep words sum:;
run;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Tue, 29 Nov 2016 12:51:25 GMT</pubDate>
    <dc:creator>KachiM</dc:creator>
    <dc:date>2016-11-29T12:51:25Z</dc:date>
    <item>
      <title>how to sum the values of similar words?</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-sum-the-values-of-similar-words/m-p/314947#M68652</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;I have a dataset like&lt;/P&gt;&lt;TABLE&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD&gt;words&lt;/TD&gt;&lt;TD&gt;d1&lt;/TD&gt;&lt;TD&gt;d2&lt;/TD&gt;&lt;TD&gt;d3&lt;/TD&gt;&lt;TD&gt;d4&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;beast&lt;/TD&gt;&lt;TD&gt;.&lt;/TD&gt;&lt;TD&gt;5&lt;/TD&gt;&lt;TD&gt;4&lt;/TD&gt;&lt;TD&gt;8&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;beasts&lt;/TD&gt;&lt;TD&gt;1&lt;/TD&gt;&lt;TD&gt;7&lt;/TD&gt;&lt;TD&gt;3&lt;/TD&gt;&lt;TD&gt;6&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;apopt&lt;/TD&gt;&lt;TD&gt;2&lt;/TD&gt;&lt;TD&gt;.&lt;/TD&gt;&lt;TD&gt;.&lt;/TD&gt;&lt;TD&gt;3&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;apopto&lt;/TD&gt;&lt;TD&gt;4&lt;/TD&gt;&lt;TD&gt;8&lt;/TD&gt;&lt;TD&gt;7&lt;/TD&gt;&lt;TD&gt;5&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;succinate&lt;/TD&gt;&lt;TD&gt;5&lt;/TD&gt;&lt;TD&gt;9&lt;/TD&gt;&lt;TD&gt;.&lt;/TD&gt;&lt;TD&gt;3&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;succinates&lt;/TD&gt;&lt;TD&gt;6&lt;/TD&gt;&lt;TD&gt;6&lt;/TD&gt;&lt;TD&gt;.&lt;/TD&gt;&lt;TD&gt;2&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;I want my result looks like&lt;/P&gt;&lt;TABLE&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD&gt;words&lt;/TD&gt;&lt;TD&gt;d1&lt;/TD&gt;&lt;TD&gt;d2&lt;/TD&gt;&lt;TD&gt;d3&lt;/TD&gt;&lt;TD&gt;d4&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;beasts&lt;/TD&gt;&lt;TD&gt;1&lt;/TD&gt;&lt;TD&gt;12&lt;/TD&gt;&lt;TD&gt;7&lt;/TD&gt;&lt;TD&gt;14&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;apopto&lt;/TD&gt;&lt;TD&gt;6&lt;/TD&gt;&lt;TD&gt;8&lt;/TD&gt;&lt;TD&gt;7&lt;/TD&gt;&lt;TD&gt;8&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;succinates&lt;/TD&gt;&lt;TD&gt;11&lt;/TD&gt;&lt;TD&gt;15&lt;/TD&gt;&lt;TD&gt;.&lt;/TD&gt;&lt;TD&gt;5&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;The original dataset is much larger, there are thousands of values of words. And many of them have typos but I need to sum the values of similar words. I am thinking using something like 'sound like' operator to calculate the sum, but it seems only work in where statement. I really have no idea what to do, can some one help me out?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 28 Nov 2016 20:26:15 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-sum-the-values-of-similar-words/m-p/314947#M68652</guid>
      <dc:creator>hx</dc:creator>
      <dc:date>2016-11-28T20:26:15Z</dc:date>
    </item>
    <item>
      <title>Re: how to sum the values of similar words?</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-sum-the-values-of-similar-words/m-p/314959#M68653</link>
      <description>&lt;P&gt;You may check next links documentation on &lt;STRONG&gt;SOUNDEX&lt;/STRONG&gt; function:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;A href="http://support.sas.com/resources/papers/proceedings12/122-2012.pdf" target="_self"&gt;http://support.sas.com/resources/papers/proceedings12/122-2012.pdf&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;A href="http://support.sas.com/documentation/cdl/en/sqlproc/69822/HTML/default/viewer.htm#p0a62rd151ctown1x38ihdpjozyv.htm" target="_self"&gt;http://support.sas.com/documentation/cdl/en/sqlproc/69822/HTML/default/viewer.htm#p0a62rd151ctown1x38ihdpjozyv.htm&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I suggest you to run next code to check how soundex realy can help you:&lt;/P&gt;
&lt;P&gt;&amp;nbsp; &amp;nbsp;proc sql;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; create table test as select&lt;/P&gt;
&lt;P&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;words, soundex(words) as sounds&lt;/P&gt;
&lt;P&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;from have&lt;/P&gt;
&lt;P&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;sort by sounds;&lt;/P&gt;
&lt;P&gt;quit;&lt;/P&gt;</description>
      <pubDate>Mon, 28 Nov 2016 21:04:32 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-sum-the-values-of-similar-words/m-p/314959#M68653</guid>
      <dc:creator>Shmuel</dc:creator>
      <dc:date>2016-11-28T21:04:32Z</dc:date>
    </item>
    <item>
      <title>Re: how to sum the values of similar words?</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-sum-the-values-of-similar-words/m-p/314987#M68660</link>
      <description>Thank you Shmuel! the thing is I need to calculate the sum of d1 d2s according to the words. And I found that if I use soundex, it will have questions such that word like Acid,acute and aged would be treated as the same, but that's not what I want. So i'm thinking using like creating some kind of loop to get this done, yet still no progress..</description>
      <pubDate>Mon, 28 Nov 2016 22:19:41 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-sum-the-values-of-similar-words/m-p/314987#M68660</guid>
      <dc:creator>hx</dc:creator>
      <dc:date>2016-11-28T22:19:41Z</dc:date>
    </item>
    <item>
      <title>Re: how to sum the values of similar words?</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-sum-the-values-of-similar-words/m-p/315008#M68665</link>
      <description>&lt;P&gt;I guess the question is "how do you define similar?". It could be like soundex() returns the same value.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;You could also have a look into function complev() and compged()&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;In the end of the day: You need to cluster word into groups and then you sum by the groups.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I'm not sure if SAS offers tools for such word clustering as part of SAS Text Analytics but if you've got the license then sure worth checking.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;If you don't find anything in the SAS toolbox available to you then eventually consider using some other tool for clustering/standardizing your words. I'm sure there is a lot out there.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;A href="https://github.com/mattalcock/blog/blob/master/2012/12/5/python-spell-checker.rst" target="_blank"&gt;https://github.com/mattalcock/blog/blob/master/2012/12/5/python-spell-checker.rst&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;&lt;A href="https://lucene.apache.org/core/3_5_0/api/contrib-spellchecker/org/apache/lucene/search/spell/SpellChecker.html" target="_blank"&gt;https://lucene.apache.org/core/3_5_0/api/contrib-spellchecker/org/apache/lucene/search/spell/SpellChecker.html&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 28 Nov 2016 23:37:24 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-sum-the-values-of-similar-words/m-p/315008#M68665</guid>
      <dc:creator>Patrick</dc:creator>
      <dc:date>2016-11-28T23:37:24Z</dc:date>
    </item>
    <item>
      <title>Re: how to sum the values of similar words?</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-sum-the-values-of-similar-words/m-p/315059#M68667</link>
      <description>&lt;P&gt;As&amp;nbsp;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/12447"&gt;@Patrick﻿&lt;/a&gt;&amp;nbsp;said:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;In the end of the day: You need to cluster word into groups and then you sum by the groups.&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;In order to cluster words, I would check results of several methods by running:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt; proc sql;
          create table temp as select
             words, 
             &lt;STRONG&gt;&lt;EM&gt;function&lt;/EM&gt;&lt;/STRONG&gt;(words) as &lt;STRONG&gt;clusterx&lt;/STRONG&gt;
             from have
           sort by sounds;
quit;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;where&amp;nbsp;&lt;CODE class=" language-sas"&gt;&lt;STRONG&gt;&lt;EM&gt;function&amp;nbsp;&lt;/EM&gt;&lt;/STRONG&gt;is replaced by any suggested function or even a combination of them:&lt;/CODE&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;1) &amp;nbsp; compress(words,'aeiou') &amp;nbsp; /* suppressing voels */&lt;/P&gt;
&lt;P&gt;2) &amp;nbsp; substr(words,1,4) &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; /* check first 4 chatacters only */&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;in order to use&amp;nbsp;&lt;SPAN&gt;&lt;STRONG&gt;complev&lt;/STRONG&gt;(),&amp;nbsp;&lt;STRONG&gt;compged&lt;/STRONG&gt;() functions you need to gather at least two strings in order to compare.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;that can be done by:&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp;proc sort data=have; by words; run;
&amp;nbsp; &amp;nbsp; data temp;
&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;set have;
&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; length prev_word $20;
&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;retain prev_word;
&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;if _n_ &amp;nbsp;= 1 then prev_word = words;
&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; else do;&lt;BR /&gt;                 &lt;STRONG&gt;cluster1&lt;/STRONG&gt; = complev(words, prev_word);&lt;BR /&gt;                 &lt;STRONG&gt;cluster2&lt;/STRONG&gt; = compged(words, prev_word);
&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;output;
&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;prev_word = words;
&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; end;
&amp;nbsp; &amp;nbsp;run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp; &amp;nbsp;then check which one of the two - cluster1 or cluster2 - fits more to desired results.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp; &amp;nbsp;Finally you need to decide which clustering method gives the best results, then you can sum by&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc means data=temp;
     class &lt;STRONG&gt;clusterx&lt;/STRONG&gt;;
     var d1 - d4;
     output out=want sum=;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Tue, 29 Nov 2016 01:57:37 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-sum-the-values-of-similar-words/m-p/315059#M68667</guid>
      <dc:creator>Shmuel</dc:creator>
      <dc:date>2016-11-29T01:57:37Z</dc:date>
    </item>
    <item>
      <title>Re: how to sum the values of similar words?</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-sum-the-values-of-similar-words/m-p/315097#M68673</link>
      <description>&lt;PRE&gt;

Firstly you need find out which name belong to the same cluster.
Once you have done that (check WANT dataset), 
you can do the math to calculate SUM. I think it is easy for you.



data x;
infile cards truncover expandtabs;
input words	: $40. d1	d2	d3	d4;
id+1;
cards;
beast	.	5	4	8
beasts	1	7	3	6
apopt	2	.	.	3
apopto	4	8	7	5
succinate	5	9	.	3
succinates	6	6	.	2
;
run;

proc sql;
create table have as
select a.words as from,b.words as to,spedis(a.words,b.words) as distance
 from x as a,x as b
  where a.id ne b.id
   group by a.words
    having calculated distance=min(calculated distance);
quit;


data full;
  set have end=last;
  if _n_ eq 1 then do;
   declare hash h();
    h.definekey('node');
     h.definedata('node');
     h.definedone();
  end;
  output;
  node=from; h.replace();
  from=to; to=node;
  output;
  node=from; h.replace();
  if last then h.output(dataset:'node');
  drop node;
run;


data want(keep=node household);
declare hash ha(ordered:'a');
declare hiter hi('ha');
ha.definekey('count');
ha.definedata('last');
ha.definedone();
declare hash _ha(hashexp: 20);
_ha.definekey('key');
_ha.definedone();

if 0 then set full;
declare hash from_to(dataset:'full(where=(from is not missing and to is not missing))',hashexp:20,multidata:'y');
 from_to.definekey('from');
 from_to.definedata('to');
 from_to.definedone();

if 0 then set node;
declare hash no(dataset:'node');
declare hiter hi_no('no');
 no.definekey('node');
 no.definedata('node');
 no.definedone();
 

do while(hi_no.next()=0);
 household+1; output;
 count=1;
 key=node;_ha.add();
 last=node;ha.add();
 rc=hi.first();
 do while(rc=0);
   from=last;rx=from_to.find();
   do while(rx=0);
     key=to;ry=_ha.check();
      if ry ne 0 then do;
       node=to;output;rr=no.remove(key:node);
       key=to;_ha.add();
       count+1;
       last=to;ha.add();
      end;
      rx=from_to.find_next();
   end;
   rc=hi.next();
end;
ha.clear();_ha.clear();
end;
stop;
run;

&lt;/PRE&gt;</description>
      <pubDate>Tue, 29 Nov 2016 05:44:06 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-sum-the-values-of-similar-words/m-p/315097#M68673</guid>
      <dc:creator>Ksharp</dc:creator>
      <dc:date>2016-11-29T05:44:06Z</dc:date>
    </item>
    <item>
      <title>Re: how to sum the values of similar words?</title>
      <link>https://communities.sas.com/t5/SAS-Programming/how-to-sum-the-values-of-similar-words/m-p/315170#M68700</link>
      <description>&lt;P&gt;Your example shows that the first 5 characters of WORDS looks to be good. You may choose 4 or 3 or 2 characters from any position using susbstr() function.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;In the following code, I have declared W as 5-character string to which &amp;nbsp;when I assign WORDS takes the first leftmost&lt;/P&gt;&lt;P&gt;5 characters. Then DOW loop is used to get the sums. You may pre-sort the HAVE data set by W. In this example sorting is not a must as words are already GROUPED(so NOTSORTED option is used).&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data have;
length w $5;
input words :$10. d1 d2 d3 d4;
w = words;
datalines;
beast .  5  4  8
beasts   1  7  3  6
apopt 2  .  .  3
apopto   4  8  7  5
succinate   5  9  .  3
succinates  6  6  .  2
;
run;

data need;
   do until(last.w);
      set have ;
      by w notsorted;

      array sum[4];
      array k[4] d1 - d4;
      if first.w then call missing(of sum[*]);
      do i = 1 to dim(k); sum[i] + k[i]; end;
   end;
keep words sum:;
run;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 29 Nov 2016 12:51:25 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/how-to-sum-the-values-of-similar-words/m-p/315170#M68700</guid>
      <dc:creator>KachiM</dc:creator>
      <dc:date>2016-11-29T12:51:25Z</dc:date>
    </item>
  </channel>
</rss>

