<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Count each type of character in Statistical Procedures</title>
    <link>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810339#M39893</link>
    <description>&lt;P&gt;You should create a list of chars that don't cause any trouble, so that they can be removed with kcompress before variable text is analysed.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;This could solve your problem:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data _null_;
   set have end= jobDone;
   
   length 
      OneChar $ 2 /*  not sure if a length of 4 is required */
      Count 8
   ;
   
   if _n_ = 1 then do;
      declare hash h(ordered: 'yes');
      h.defineKey('OneChar');
      h.defineData('OneChar', 'Count');
      h.defineDone();
   end;
   
   do i = 1 to lengthn(Text);
      OneChar = ksubstr(Text, i, 1);
      
      if h.find() ^= 0 then do;
         Count = 1;
         h.add();
      end;
      else do;
         Count = Count + 1;
         h.replace();
      end;
   end;
   
   if jobDone then do;
      h.output(dataset: 'work.counter');
   end;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
    <pubDate>Thu, 28 Apr 2022 11:44:24 GMT</pubDate>
    <dc:creator>andreas_lds</dc:creator>
    <dc:date>2022-04-28T11:44:24Z</dc:date>
    <item>
      <title>Count each type of character</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810338#M39892</link>
      <description>&lt;P&gt;I have data like this (simplified version):&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;ID&amp;nbsp; &amp;nbsp; &amp;nbsp;Text&lt;/P&gt;&lt;P&gt;1&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;ABC&lt;/P&gt;&lt;P&gt;2&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;ABCD&lt;/P&gt;&lt;P&gt;3&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;ABCDE&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I wish to count how many times each type of character appears. But: the actual data is a big dataset and I don't know what types of characters may appear. In the example above, A, B, C, D and E appear, but in the actual data any Unicode character may appear. The purpose is to remove certain characters since they seem to cause trouble when I try to import it in SQL, but it seems that I can import it in SAS. Therefore I am trying to find out which characters I need to remove to be able to import it in SQL. I found countc which allows me to specify a particular character I wish to count, which would work fine for a limited number of characters, but in this case I expect to get a table of many hundreds of lines in the output since I have several million lines in the raw data.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Any help would be much appreciated&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Kind regards,&lt;/P&gt;&lt;P&gt;Jacob&lt;/P&gt;</description>
      <pubDate>Thu, 28 Apr 2022 11:26:15 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810338#M39892</guid>
      <dc:creator>Jbraun</dc:creator>
      <dc:date>2022-04-28T11:26:15Z</dc:date>
    </item>
    <item>
      <title>Re: Count each type of character</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810339#M39893</link>
      <description>&lt;P&gt;You should create a list of chars that don't cause any trouble, so that they can be removed with kcompress before variable text is analysed.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;This could solve your problem:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data _null_;
   set have end= jobDone;
   
   length 
      OneChar $ 2 /*  not sure if a length of 4 is required */
      Count 8
   ;
   
   if _n_ = 1 then do;
      declare hash h(ordered: 'yes');
      h.defineKey('OneChar');
      h.defineData('OneChar', 'Count');
      h.defineDone();
   end;
   
   do i = 1 to lengthn(Text);
      OneChar = ksubstr(Text, i, 1);
      
      if h.find() ^= 0 then do;
         Count = 1;
         h.add();
      end;
      else do;
         Count = Count + 1;
         h.replace();
      end;
   end;
   
   if jobDone then do;
      h.output(dataset: 'work.counter');
   end;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Thu, 28 Apr 2022 11:44:24 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810339#M39893</guid>
      <dc:creator>andreas_lds</dc:creator>
      <dc:date>2022-04-28T11:44:24Z</dc:date>
    </item>
    <item>
      <title>Re: Count each type of character</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810340#M39894</link>
      <description>Awesome, thanks!</description>
      <pubDate>Thu, 28 Apr 2022 11:48:11 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810340#M39894</guid>
      <dc:creator>Jbraun</dc:creator>
      <dc:date>2022-04-28T11:48:11Z</dc:date>
    </item>
    <item>
      <title>Re: Count each type of character</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810341#M39895</link>
      <description>&lt;P&gt;You can create a new data set of just single characters, then let PROC FREQ have all the fun&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data intermediate;
    set have;
    length char $ 1;
    do i=1 to length(text);
        char=substr(text,i,1);
        output;
    end;
    drop i text;
run;
proc freq data=intermediate;
    tables char/noprint out=want;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Thu, 28 Apr 2022 13:01:33 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810341#M39895</guid>
      <dc:creator>PaigeMiller</dc:creator>
      <dc:date>2022-04-28T13:01:33Z</dc:date>
    </item>
    <item>
      <title>Re: Count each type of character</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810342#M39896</link>
      <description>&lt;P&gt;Much appreciated!&lt;/P&gt;</description>
      <pubDate>Thu, 28 Apr 2022 11:51:32 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810342#M39896</guid>
      <dc:creator>Jbraun</dc:creator>
      <dc:date>2022-04-28T11:51:32Z</dc:date>
    </item>
    <item>
      <title>Re: Count each type of character</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810363#M39900</link>
      <description>&lt;P&gt;Wouldn't it be easier to just understand what is happening?&lt;/P&gt;
&lt;P&gt;Sounds like you have data in UTF-8 encoding and you are trying to write it to a database that is using some other encoding.&amp;nbsp; What encoding is it using? Is it a single byte encoding?&amp;nbsp; Like LATIN1?&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;If it is a single byte encoding then remove the characters that are not available in that encoding.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;You might start with getting the list of invalid characters.&lt;/P&gt;
&lt;P&gt;For example this will create a new variable named BAD with the characters that will not map to LATIN1 and only keep the observations that have at least one bad character.&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data test;
  set have;
  bad = kcompress(text, kcvt(collate(0,256),'LATIN1','UTF-8'),'k');
  if bad ne ' ';
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Thu, 28 Apr 2022 13:53:06 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810363#M39900</guid>
      <dc:creator>Tom</dc:creator>
      <dc:date>2022-04-28T13:53:06Z</dc:date>
    </item>
    <item>
      <title>Re: Count each type of character</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810461#M39901</link>
      <description>The data is undocumented so I don't know exactly what could hide in the file, but thanks for the suggestion</description>
      <pubDate>Thu, 28 Apr 2022 19:02:10 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810461#M39901</guid>
      <dc:creator>Jbraun</dc:creator>
      <dc:date>2022-04-28T19:02:10Z</dc:date>
    </item>
    <item>
      <title>Re: Count each type of character</title>
      <link>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810469#M39903</link>
      <description>&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/163039"&gt;@Jbraun&lt;/a&gt;&amp;nbsp;wrote:&lt;BR /&gt;The data is undocumented so I don't know exactly what could hide in the file, but thanks for the suggestion&lt;HR /&gt;&lt;/BLOCKQUOTE&gt;
&lt;P&gt;Your original description of the problem made it sound like the problem was not the source data, but the limitations of the target you were trying to push the data into.&amp;nbsp; &amp;nbsp;If the target database supported UTF-8 characters and your SAS session is running with UTF-8 support then reading files with any encoding should work fine.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;It is when you try to go the other way that you end up trying to stuff ten pounds of **** into a five pound bag.&lt;/P&gt;</description>
      <pubDate>Thu, 28 Apr 2022 19:45:01 GMT</pubDate>
      <guid>https://communities.sas.com/t5/Statistical-Procedures/Count-each-type-of-character/m-p/810469#M39903</guid>
      <dc:creator>Tom</dc:creator>
      <dc:date>2022-04-28T19:45:01Z</dc:date>
    </item>
  </channel>
</rss>

