<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Base SAS word Count with millions of rows of word in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698723#M213712</link>
    <description>&lt;P&gt;If you just want to count how many times each value appears (and don't need to generate any statistics) then there is no need to use PROC FREQ.&amp;nbsp; You could count with a simple SQL statement.&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/* Tally how many input rows carry each distinct WORD value. */
proc sql;
  create table outputdata as
    select word
         , count(*) as count
      from inputdata
     group by word;
quit;&lt;/CODE&gt;&lt;/PRE&gt;</description>
    <pubDate>Fri, 13 Nov 2020 17:20:03 GMT</pubDate>
    <dc:creator>Tom</dc:creator>
    <dc:date>2020-11-13T17:20:03Z</dc:date>
    <item>
      <title>Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698721#M213710</link>
      <description>&lt;P&gt;This is a challenge for me especially with insufficient memory running proc freq or proc sql counts. I split the text fields into individual words and just have one column called "word". It had 140mil words, I split the data into three datasets as 50+50+40 mil. But still I can't run proc freq even using order=freq option.&lt;/P&gt;
&lt;P&gt;Code is like below...&lt;/P&gt;
&lt;P&gt;&lt;FONT face="Arial" color="#000080"&gt;&lt;STRONG&gt;proc&lt;/STRONG&gt;&lt;/FONT&gt; &lt;STRONG&gt;&lt;FONT face="Arial" color="#000080"&gt;freq&lt;/FONT&gt;&lt;/STRONG&gt; &lt;FONT face="Arial" color="#0000ff"&gt;data&lt;/FONT&gt;&lt;FONT face="Arial"&gt;=inputdata &lt;/FONT&gt;&lt;FONT face="Arial" color="#0000ff"&gt;order&lt;/FONT&gt;&lt;FONT face="Arial"&gt;=freq;&lt;/FONT&gt;&lt;/P&gt;
&lt;P&gt;&lt;FONT face="Arial" color="#0000ff"&gt;tables&lt;/FONT&gt;&lt;FONT face="Arial"&gt; word / &lt;/FONT&gt;&lt;FONT face="Arial" color="#0000ff"&gt;noprint&lt;/FONT&gt; &lt;FONT face="Arial" color="#0000ff"&gt;nocum&lt;/FONT&gt; &lt;FONT face="Arial" color="#0000ff"&gt;nopercent&lt;/FONT&gt; &lt;FONT face="Arial" color="#0000ff"&gt;out&lt;/FONT&gt;&lt;FONT face="Arial"&gt;=outputdata;&lt;/FONT&gt;&lt;/P&gt;
&lt;P&gt;&lt;FONT face="Arial" color="#000080"&gt;&lt;STRONG&gt;run&lt;/STRONG&gt;&lt;/FONT&gt;&lt;FONT face="Arial"&gt;;&lt;/FONT&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I get the below...&lt;/P&gt;
&lt;P&gt;ERROR: The SAS System stopped processing this step because of insufficient memory.&lt;/P&gt;
&lt;P&gt;NOTE: There were 3347033 observations read from the data set &lt;FONT face="Arial"&gt;inputdata&lt;/FONT&gt;.&lt;/P&gt;
&lt;P&gt;&lt;LI-WRAPPER&gt;&lt;/LI-WRAPPER&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I wanted to get counts of each word and sort them in&amp;nbsp;descending order.&lt;/P&gt;
&lt;P&gt;I would be fine with just the top 500 words with their counts, or even the top 100 words with their frequencies. I tried nocum, nopercent, etc. to avoid a larger dataset, but I am still getting a lot of errors related to memory issues. I even tried sending the results to an output dataset, but similar memory issues show up. Please suggest whether I can use PROC IML or other effective procedures that save memory.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 13 Nov 2020 17:12:33 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698721#M213710</guid>
      <dc:creator>Venkat4</dc:creator>
      <dc:date>2020-11-13T17:12:33Z</dc:date>
    </item>
    <item>
      <title>Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698722#M213711</link>
      <description>&lt;P&gt;Add the option NOPRINT to the proc freq statement and see if that helps.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Most of the "memory" used was likely trying to create a results table with millions of rows.&lt;/P&gt;</description>
      <pubDate>Fri, 13 Nov 2020 17:16:18 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698722#M213711</guid>
      <dc:creator>ballardw</dc:creator>
      <dc:date>2020-11-13T17:16:18Z</dc:date>
    </item>
    <item>
      <title>Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698723#M213712</link>
      <description>&lt;P&gt;If you just want to count how many times each value appears (and don't need to generate any statistics) then there is no need to use PROC FREQ.&amp;nbsp; You could count with a simple SQL statement.&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/* Tally how many input rows carry each distinct WORD value. */
proc sql;
  create table outputdata as
    select word
         , count(*) as count
      from inputdata
     group by word;
quit;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Fri, 13 Nov 2020 17:20:03 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698723#M213712</guid>
      <dc:creator>Tom</dc:creator>
      <dc:date>2020-11-13T17:20:03Z</dc:date>
    </item>
    <item>
      <title>Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698725#M213713</link>
      <description>&lt;P&gt;If &lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/13884"&gt;@ballardw&lt;/a&gt;&amp;nbsp;'s suggestion doesn't work, then you could sort the dataset by word - then create a dataset with the word value and word frequency, using a data step or a proc using a "by word" statement.&amp;nbsp; You then would have a dataset with two vars: word and word_freq:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/* Sort a copy of the input by WORD so identical values are adjacent. */
proc sort data=inputdata out=need;
  by word;
run;

/* One output row per WORD: the loop below consumes a whole BY group per */
/* iteration of the DATA step, so WORD_FREQ ends up as the group's size. */
data outputdata;
  do word_freq=1 by 1 until (last.word);
    set need;
    by word;
  end;
  /* the implicit OUTPUT here writes the group's last row plus WORD_FREQ */
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;It might take a while to do the sort, but it is a simple program.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 13 Nov 2020 17:23:54 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698725#M213713</guid>
      <dc:creator>mkeintz</dc:creator>
      <dc:date>2020-11-13T17:23:54Z</dc:date>
    </item>
    <item>
      <title>Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698732#M213717</link>
      <description>&lt;P&gt;&lt;FONT style="background-color: #ffffff;"&gt;Thanks, even with the noprint option I got memory issues....and it stopped right after 5mil obs&lt;/FONT&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;FONT style="background-color: #ffffff;"&gt;ERROR: The SAS System stopped processing this step because of insufficient memory.&lt;BR /&gt;NOTE: There were 5242529 observations read from the data set xxxxx&lt;/FONT&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 13 Nov 2020 17:36:45 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698732#M213717</guid>
      <dc:creator>Venkat4</dc:creator>
      <dc:date>2020-11-13T17:36:45Z</dc:date>
    </item>
    <item>
      <title>Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698736#M213718</link>
      <description>&lt;P&gt;Thanks, this is what I tried in the beginning. It stops right away due to memory issues. I am using the grid and they have a 500GB limit for each user; the job automatically shuts down because it is taking more than 500GB, even though the dataset itself is only 2GB. I think it is using some kind of intermediate table for calculations, and that is taking too much space whether I use the WORK library or a permanent library.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 13 Nov 2020 17:40:10 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698736#M213718</guid>
      <dc:creator>Venkat4</dc:creator>
      <dc:date>2020-11-13T17:40:10Z</dc:date>
    </item>
    <item>
      <title>Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698737#M213719</link>
      <description>&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/43935"&gt;@Venkat4&lt;/a&gt;&amp;nbsp;wrote:&lt;BR /&gt;
&lt;P&gt;&lt;FONT style="background-color: #ffffff;"&gt;Thanks, even with the noprint option I got memory issues....and it stopped right after 5mil obs&lt;/FONT&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;FONT style="background-color: #ffffff;"&gt;ERROR: The SAS System stopped processing this step because of insufficient memory.&lt;BR /&gt;NOTE: There were 5242529 observations read from the data set xxxxx&lt;/FONT&gt;&lt;/P&gt;
&lt;HR /&gt;&lt;/BLOCKQUOTE&gt;
&lt;P&gt;I am sure PROC FREQ is trying to build some type of table of all of the values it finds in the dataset (probably a hash table).&amp;nbsp; So if you have 5 million different values of WORD then it needs at least 5 million * (length(word) +8 bytes for the count variable) bytes of memory.&amp;nbsp; If WORD is very long that quickly works out to a lot of memory.&lt;/P&gt;</description>
      <pubDate>Fri, 13 Nov 2020 17:40:41 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698737#M213719</guid>
      <dc:creator>Tom</dc:creator>
      <dc:date>2020-11-13T17:40:41Z</dc:date>
    </item>
    <item>
      <title>Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698742#M213722</link>
      <description>&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/43935"&gt;@Venkat4&lt;/a&gt;&amp;nbsp;wrote:&lt;BR /&gt;
&lt;P&gt;&lt;FONT style="background-color: #ffffff;"&gt;Thanks, even with the noprint option I got memory issues....and it stopped right after 5mil obs&lt;/FONT&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;FONT style="background-color: #ffffff;"&gt;ERROR: The SAS System stopped processing this step because of insufficient memory.&lt;BR /&gt;NOTE: There were 5242529 observations read from the data set xxxxx&lt;/FONT&gt;&lt;/P&gt;
&lt;HR /&gt;&lt;/BLOCKQUOTE&gt;
&lt;P&gt;Which version of SAS are you running? If you are connecting to a server you may be running into memory limits, i.e. disk space, because of your admin set limits.&lt;/P&gt;</description>
      <pubDate>Fri, 13 Nov 2020 17:56:30 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698742#M213722</guid>
      <dc:creator>ballardw</dc:creator>
      <dc:date>2020-11-13T17:56:30Z</dc:date>
    </item>
    <item>
      <title>Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698748#M213725</link>
      <description>&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/43935"&gt;@Venkat4&lt;/a&gt;&amp;nbsp;wrote:&lt;BR /&gt;
&lt;P&gt;Thanks, this is what I tried in the beginning. It stops right away due to memory issues. I am using the grid and they have a 500GB limit for each user; the job automatically shuts down because it is taking more than 500GB, even though the dataset itself is only 2GB. I think it is using some kind of intermediate table for calculations, and that is taking too much space whether I use the WORK library or a permanent library.&amp;nbsp;&lt;/P&gt;
&lt;HR /&gt;&lt;/BLOCKQUOTE&gt;
&lt;P&gt;I think you are confusing memory limits (which is what your error messages are referring to) with disk storage limit (which is almost certainly the 500GB you refer to).&amp;nbsp; If the SQL is actually reporting termination due to memory limits, it's not the 500GB - it's some other limit.&lt;BR /&gt;&lt;BR /&gt;To find out how much memory your SAS program typically has available, run this to get a report:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/* Report the current value of the MEMSIZE system option in the log. */
proc options option=memsize;
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;Then you can check with your sysadmins to see the maximum amount of RAM you can request when starting a SAS session - and how to do it in your environment.&lt;/P&gt;</description>
      <pubDate>Fri, 13 Nov 2020 18:27:03 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698748#M213725</guid>
      <dc:creator>mkeintz</dc:creator>
      <dc:date>2020-11-13T18:27:03Z</dc:date>
    </item>
    <item>
      <title>How long is your Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698749#M213726</link>
      <description>&lt;P&gt;How long is your "word" variable defined, and how long is the longest value in it?&lt;/P&gt;</description>
      <pubDate>Fri, 13 Nov 2020 18:28:10 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698749#M213726</guid>
      <dc:creator>Kurt_Bremser</dc:creator>
      <dc:date>2020-11-13T18:28:10Z</dc:date>
    </item>
    <item>
      <title>Re: How long is your Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698782#M213730</link>
      <description>It was assigned a length of 10000, not sure why; I don't have any word that long. I see that could be the reason this whole thing is blowing up. Thanks.</description>
      <pubDate>Fri, 13 Nov 2020 19:54:50 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698782#M213730</guid>
      <dc:creator>Venkat4</dc:creator>
      <dc:date>2020-11-13T19:54:50Z</dc:date>
    </item>
    <item>
      <title>Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698784#M213731</link>
      <description>Thank you, much. This worked with the 3 way split dataset fairly quickly (15-20min each) and didn't throw any issues. I am working on combining them and getting the final counts.</description>
      <pubDate>Fri, 13 Nov 2020 19:56:12 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698784#M213731</guid>
      <dc:creator>Venkat4</dc:creator>
      <dc:date>2020-11-13T19:56:12Z</dc:date>
    </item>
    <item>
      <title>Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698786#M213732</link>
      <description>It is SAS 9.4 M5 running as a client-server via EG 7.15</description>
      <pubDate>Fri, 13 Nov 2020 19:57:32 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698786#M213732</guid>
      <dc:creator>Venkat4</dc:creator>
      <dc:date>2020-11-13T19:57:32Z</dc:date>
    </item>
    <item>
      <title>Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698787#M213733</link>
      <description>Understood, thank you.</description>
      <pubDate>Fri, 13 Nov 2020 19:58:05 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698787#M213733</guid>
      <dc:creator>Venkat4</dc:creator>
      <dc:date>2020-11-13T19:58:05Z</dc:date>
    </item>
    <item>
      <title>Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698795#M213735</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/43935"&gt;@Venkat4&lt;/a&gt;,&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Glad to see that mkeintz's suggestion worked for you. So, my thoughts below might be of interest for later readers with a similar problem.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/43935"&gt;@Venkat4&lt;/a&gt;&amp;nbsp;wrote:&lt;BR /&gt;
&lt;P&gt;(...) I split the data into three datasets as 50+50+40 mil. But still I can't run proc freq (...)&lt;/P&gt;
&lt;HR /&gt;&lt;/BLOCKQUOTE&gt;
&lt;P&gt;Whether you use PROC FREQ or PROC SQL or a DATA step with a hash object, splitting the input dataset should help &lt;STRONG&gt;if&lt;/STRONG&gt; the subsets don't overlap. To ensure that, don't split by observation number, but by other criteria such as &lt;FONT face="courier new,courier"&gt;length(word)&lt;/FONT&gt;, &lt;FONT face="courier new,courier"&gt;lowcase(first(word))&lt;/FONT&gt;, etc.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Example:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/* Route each row to one of five datasets according to the length of WORD, */
/* so that no word value can occur in more than one subset.                */
data h1 h2 h3 h4 h5;
  set inputdata;
  if      length(word) in (1,2,3)  then output h1;
  else if length(word) in (4,5)    then output h2;
  else if length(word) in (6,7)    then output h3;
  else if length(word) in (8,9,10) then output h4;
  else                                  output h5;
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;(Consider using the &lt;A href="https://documentation.sas.com/?cdcId=pgmsascdc&amp;amp;cdcVersion=9.4_3.5&amp;amp;docsetId=ledsoptsref&amp;amp;docsetTarget=n014hy7167t2asn1j7qo99qv16wa.htm&amp;amp;locale=en" target="_blank" rel="noopener"&gt;COMPRESS=YES&lt;/A&gt; dataset option on h1, h2, ...)&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;If you're unsure about suitable cut points, you can draw a reasonably sized random sample ...&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/* Simple random sample of 100,000 rows; the fixed seed makes it repeatable. */
proc surveyselect
    data   = inputdata
    out    = samp
    method = srs
    n      = 100000
    seed   = 2718;
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;[edit: use a DATA step for this if you don't have SAS/STAT]&lt;/P&gt;
&lt;P&gt;and analyze that quickly to get an idea of the distributions of lengths, first letters, etc. and even relative word frequencies. A frequency count of the combinations of the first two letters like&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/* The $2. format groups WORD by its first two characters before counting. */
proc freq data=samp noprint;
  tables word / out=cnt;
  format word $2.;
run;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;might be feasible even on the full dataset.&lt;/P&gt;</description>
      <pubDate>Fri, 13 Nov 2020 20:08:48 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698795#M213735</guid>
      <dc:creator>FreelanceReinh</dc:creator>
      <dc:date>2020-11-13T20:08:48Z</dc:date>
    </item>
    <item>
      <title>Re: How long is your Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698813#M213741</link>
      <description>&lt;P&gt;Since any utility file is uncompressed (and the in-memory structures of procedures like FREQ also), this will invariably cause problems.&lt;/P&gt;
&lt;P&gt;Check for max(length(word)), and adjust your dataset accordingly. Also consider converting all words to lowercase, to avoid differences caused by words starting a sentence. That reduces the size of the in-memory tables.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;After all, in everyday language the number of words actually used is a few 1000, so this should not cause any memory problems.&lt;/P&gt;
&lt;P&gt;The &lt;A href="https://en.wikipedia.org/wiki/Oxford_Dictionary_of_English?wprov=sfla1" target="_blank" rel="noopener"&gt;Oxford Dictionary of English&lt;/A&gt; contains 355000 words, so even with a word length of 100 you'd need just 40 MB (50 including a search tree) for the freq table.&lt;/P&gt;</description>
      <pubDate>Fri, 13 Nov 2020 21:04:01 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698813#M213741</guid>
      <dc:creator>Kurt_Bremser</dc:creator>
      <dc:date>2020-11-13T21:04:01Z</dc:date>
    </item>
    <item>
      <title>Re: How long is your Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698870#M213771</link>
      <description>As Kurt mentioned, open the dataset, right-click the column and check its length. I suspect the length is far too big, which makes this error happen.</description>
      <pubDate>Sat, 14 Nov 2020 11:58:01 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698870#M213771</guid>
      <dc:creator>Ksharp</dc:creator>
      <dc:date>2020-11-14T11:58:01Z</dc:date>
    </item>
    <item>
      <title>Re: Base SAS word Count with millions of rows of word</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698933#M213793</link>
      <description>&lt;P&gt;Using SPDE should be faster than using proc sort.&lt;/P&gt;
&lt;P&gt;And SAS is not very efficient when dealing with 10k-char strings.&lt;/P&gt;
&lt;P&gt;Do you want to try this?&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data WORDS;  * Generate sample data;
  length WORD $10000;  * WORD is declared 10000 characters wide;
  do I=1 to 100e6;     * 100 million rows;
    * Value is an integer 0-99 written as text and repeated a random number of times;
    WORD=repeat( cats(int(ranuni(1)*100)), abs(int(rannor(1)*99)) );
    output;
  end;
run;
        
data _null_;  * find longest string;
  * END= flags the final row so the macro variable below is written exactly once;
  set WORDS end=LASTOBS;
  retain LEN;
  LEN=max(LEN,length(WORD));
  * Publish the running maximum as macro variable LEN after the last row is read;
  if LASTOBS then call symputx('len',LEN);
run;

* Resolve the physical path of the WORK library;
%let wdir=%sysfunc(pathname(WORK));
* Assign libref W to that path through the SPDE engine with binary compression;
libname W spde "&amp;amp;wdir" partsize=1T compress=binary;

data W.WORDS1;   * Create an empty SPDE table whose WORD has the trimmed length;
  length WORD $&amp;amp;len.;
  stop;          * no SET statement here - without STOP the step would write one blank row;
run;

* Copy data to SPDE - FORCE is needed because WORD is shorter in the base table and I exists only in WORDS;
proc append data=WORDS base=W.WORDS1 force;
run;

data COUNTS;  * Derive word cardinality;
  set W.WORDS1;
  * NOTE(review): no PROC SORT precedes this step - presumably the SPDE engine returns the rows in BY order, confirm;
  by WORD;
  CNT+1;     * sum statement - CNT is retained across rows;
  if last.WORD then do;
    output;  * one row per distinct WORD carrying its count;
    CNT=0;   * reset for the next BY group;
  end;
run;
&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;This takes 35 minutes:&lt;/P&gt;
&lt;PRE&gt;37         data _null_;
38           set WORDS end=LASTOBS;
39           retain LEN;
40           LEN=max(LEN,length(WORD));
41           if LASTOBS then call symputx('len',LEN);
42         run;
NOTE: DATA statement used (Total process time):&lt;BR /&gt;      real time 4:09.76&lt;BR /&gt;      user cpu time 3:53.46&lt;BR /&gt;      system cpu time 15.75 seconds
43         
44         data W.WORDS1;
45           length WORD $&amp;amp;len.;
46         proc append data=WORDS base=W.WORDS1 force;
47         run;
NOTE: PROCEDURE APPEND used (Total process time):
      real time           4:55.10
      user cpu time       7:17.37
      system cpu time     36.93 seconds&lt;BR /&gt;48         
49         data COUNTS;
51           set W.WORDS1;
52           by WORD;
53           CNT+1;
54           if last.WORD then do;
55             output;
56             CNT=0;
57           end;
58         run;
NOTE: DATA statement used (Total process time):
      real time           26:15.31
      user cpu time       13:07.46
      system cpu time     14:12.42
&lt;/PRE&gt;
&lt;P&gt;on my old server. So old I cannot use &lt;FONT face="courier new,courier"&gt;rand('uniform')&lt;/FONT&gt; !&amp;nbsp; &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Sun, 15 Nov 2020 04:11:23 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Base-SAS-word-Count-with-millions-of-rows-of-word/m-p/698933#M213793</guid>
      <dc:creator>ChrisNZ</dc:creator>
      <dc:date>2020-11-15T04:11:23Z</dc:date>
    </item>
  </channel>
</rss>

