<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Web scraping using SAS in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/Web-scraping-a-CDC-gov-site-using-SAS/m-p/417607#M280381</link>
    <description>&lt;P&gt;Thank you, I'll use this until I find a stable source.&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Thu, 30 Nov 2017 22:46:35 GMT</pubDate>
    <dc:creator>HabAM</dc:creator>
    <dc:date>2017-11-30T22:46:35Z</dc:date>
    <item>
      <title>Web scraping a CDC.gov site using SAS</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Web-scraping-a-CDC-gov-site-using-SAS/m-p/417569#M280378</link>
      <description>&lt;P&gt;Hello Community,&lt;/P&gt;
&lt;P&gt;I need to know what I am doing wrong with this code:&lt;/P&gt;
&lt;P&gt;I am interested in this page:&lt;/P&gt;
&lt;P&gt;&lt;A href="https://wwwn.cdc.gov/nndss/conditions/search/" target="_blank" rel="nofollow noopener noreferrer"&gt;https://wwwn.cdc.gov/nndss/conditions/search/&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Two&amp;nbsp;columns of interest:&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;&amp;nbsp;Name and Notifiable To&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;current code:&lt;/P&gt;
&lt;P&gt;filename CDC url "&lt;A href="https://wwwn.cdc.gov/nndss/conditions/search/" target="_blank" rel="nofollow noopener noreferrer"&gt;https://wwwn.cdc.gov/nndss/conditions/search/&lt;/A&gt;";&lt;BR /&gt;data rep;&lt;BR /&gt;infile CDC length=len lrecl=32767;&lt;BR /&gt;input line $varying32767. len dt $varying32767. len;&lt;BR /&gt;if find(line,"nndss/conditions/") then do;&lt;BR /&gt;list=scan(line,4,'/');&lt;BR /&gt;output;&lt;BR /&gt;end;&lt;/P&gt;
&lt;P&gt;run;&lt;BR /&gt;filename CDC clear;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Thank you&lt;/P&gt;</description>
      <pubDate>Thu, 30 Nov 2017 21:28:50 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Web-scraping-a-CDC-gov-site-using-SAS/m-p/417569#M280378</guid>
      <dc:creator>HabAM</dc:creator>
      <dc:date>2017-11-30T21:28:50Z</dc:date>
    </item>
    <item>
      <title>Re: Web scraping using SAS</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Web-scraping-a-CDC-gov-site-using-SAS/m-p/417578#M280379</link>
      <description>&lt;P&gt;To sum up: you want to use SAS to pull in this web page hosted by the CDC, which contains a large table (among other items), and save just a couple of columns from that table.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I still maintain (as I've said in other similar threads) -- this information&amp;nbsp;&lt;STRONG&gt;probably&lt;/STRONG&gt; exists somewhere else in a more consumable form, probably from the CDC -- a US government organization that serves up a tremendous amount of data to the public and to other agencies.&amp;nbsp; It might be worth reaching out to them.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;STRONG&gt;Update&lt;/STRONG&gt;: this thread inspired me to write a longer article about web scraping in general: &lt;A href="https://blogs.sas.com/content/sasdummy/2017/12/04/scrape-web-page-data/" target="_self"&gt;How to scrape data from a web page using SAS&lt;/A&gt;.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;The web table looks like this:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="cdc.png" style="width: 600px;"&gt;&lt;img src="https://communities.sas.com/t5/image/serverpage/image-id/16942iE5407E98F2A0C227/image-size/large?v=v2&amp;amp;px=999" role="button" title="cdc.png" alt="cdc.png" /&gt;&lt;/span&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Here's a brute force start at parsing it.&amp;nbsp; You'll have additional cleanup to do.&amp;nbsp;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;BR /&gt;Note that this relies on the web page's current scheme for line breaks and structure -- it's fragile in that sense.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/* Get all of the nonblank lines */
filename CDC url "https://wwwn.cdc.gov/nndss/conditions/search/";
data rep;
infile CDC length=len lrecl=32767;
input line $varying32767. len;
 line = strip(line);
 if len&amp;gt;0;
run;
filename CDC clear;

/* Parse the lines and keep just condition names */
/* When a condition code is found, grab the line following (full name of condition) */
/* and the 8th line following (Notification To date)                                */
/* Relies on this page's exact layout and line break scheme */
data parsed (keep=condition_code condition_full note_to);
 length condition_code $ 40 condition_full $ 60;
 set rep;
 if find(line,"/nndss/conditions/") then do;
   condition_code=scan(line,4,'/');
   pickup= _n_+1 ;
   pickup2 = _n_+8;
   set rep (rename=(line=condition_full)) point=pickup;
   set rep (rename=(line=note_to)) point=pickup2;
   output;
  end;
run;
&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Sample result -- HTML tags and other noise are yours to clean up.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="cdcres.png" style="width: 600px;"&gt;&lt;img src="https://communities.sas.com/t5/image/serverpage/image-id/16943i7CEAF395F30FC233/image-size/large?v=v2&amp;amp;px=999" role="button" title="cdcres.png" alt="cdcres.png" /&gt;&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 04 Dec 2017 17:02:33 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Web-scraping-a-CDC-gov-site-using-SAS/m-p/417578#M280379</guid>
      <dc:creator>ChrisHemedinger</dc:creator>
      <dc:date>2017-12-04T17:02:33Z</dc:date>
    </item>
    <item>
      <title>Re: Web scraping a CDC.gov site using SAS</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Web-scraping-a-CDC-gov-site-using-SAS/m-p/417592#M280380</link>
      <description>&lt;P&gt;To add on to&amp;nbsp;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/4"&gt;@ChrisHemedinger&lt;/a&gt;'s&amp;nbsp;comment, the CDC maintains their own data page here:&lt;/P&gt;
&lt;P&gt;&lt;A href="https://data.cdc.gov" target="_blank"&gt;https://data.cdc.gov&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;There's also an email contact.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Just a note that if you never ask, the data will never change format. We often change how we provide data solely based on requests we receive...if people scrape we don't know and can't make decisions on what to release publicly.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Note that I am not affiliated with the CDC in any manner.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 30 Nov 2017 22:06:07 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Web-scraping-a-CDC-gov-site-using-SAS/m-p/417592#M280380</guid>
      <dc:creator>Reeza</dc:creator>
      <dc:date>2017-11-30T22:06:07Z</dc:date>
    </item>
    <item>
      <title>Re: Web scraping using SAS</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Web-scraping-a-CDC-gov-site-using-SAS/m-p/417607#M280381</link>
      <description>&lt;P&gt;Thank you, I'll use this until I find a stable source.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 30 Nov 2017 22:46:35 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Web-scraping-a-CDC-gov-site-using-SAS/m-p/417607#M280381</guid>
      <dc:creator>HabAM</dc:creator>
      <dc:date>2017-11-30T22:46:35Z</dc:date>
    </item>
    <item>
      <title>Re: Web scraping a CDC.gov site using SAS</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Web-scraping-a-CDC-gov-site-using-SAS/m-p/417622#M280382</link>
      <description>&lt;P&gt;Thank you&lt;/P&gt;</description>
      <pubDate>Thu, 30 Nov 2017 23:42:51 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Web-scraping-a-CDC-gov-site-using-SAS/m-p/417622#M280382</guid>
      <dc:creator>HabAM</dc:creator>
      <dc:date>2017-11-30T23:42:51Z</dc:date>
    </item>
  </channel>
</rss>

