<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Web Scraping websites with load more button/infinite scrolling in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/Web-Scraping-websites-with-load-more-button-infinite-scrolling/m-p/738195#M230222</link>
    <description>&lt;P&gt;This appears to be a WordPress site, so the options for different categories might be there. However, it will require some exploration. So far I've seen only the one main feed.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;The RSS feed does have a category field. Example:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&amp;lt;category&amp;gt;South Africa&amp;lt;/category&amp;gt;&lt;/PRE&gt;
&lt;P&gt;You could use this in your SAS process to filter items from the data after you fetch it.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Revised code to capture the category:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;LI-CODE lang="sas"&gt;filename rssmap temp;
data _null_;
infile datalines;
file rssmap;
input;
put _infile_;
datalines;
&amp;lt;?xml version="1.0" encoding="windows-1252"?&amp;gt;
&amp;lt;SXLEMAP name="RSSMAP" version="2.1"&amp;gt;
    &amp;lt;NAMESPACES count="0"/&amp;gt;
    &amp;lt;!-- ############################################################ --&amp;gt;
    &amp;lt;TABLE name="item"&amp;gt;
        &amp;lt;TABLE-PATH syntax="XPath"&amp;gt;/rss/channel/item&amp;lt;/TABLE-PATH&amp;gt;
        &amp;lt;COLUMN name="title"&amp;gt;
            &amp;lt;PATH syntax="XPath"&amp;gt;/rss/channel/item/title&amp;lt;/PATH&amp;gt;
            &amp;lt;TYPE&amp;gt;character&amp;lt;/TYPE&amp;gt;
            &amp;lt;DATATYPE&amp;gt;string&amp;lt;/DATATYPE&amp;gt;
            &amp;lt;LENGTH&amp;gt;250&amp;lt;/LENGTH&amp;gt;
        &amp;lt;/COLUMN&amp;gt;
        &amp;lt;COLUMN name="link"&amp;gt;
            &amp;lt;PATH syntax="XPath"&amp;gt;/rss/channel/item/link&amp;lt;/PATH&amp;gt;
            &amp;lt;TYPE&amp;gt;character&amp;lt;/TYPE&amp;gt;
            &amp;lt;DATATYPE&amp;gt;string&amp;lt;/DATATYPE&amp;gt;
            &amp;lt;LENGTH&amp;gt;200&amp;lt;/LENGTH&amp;gt;
        &amp;lt;/COLUMN&amp;gt;
        &amp;lt;COLUMN name="pubDate"&amp;gt;
            &amp;lt;PATH syntax="XPath"&amp;gt;/rss/channel/item/pubDate&amp;lt;/PATH&amp;gt;
            &amp;lt;TYPE&amp;gt;character&amp;lt;/TYPE&amp;gt;
            &amp;lt;DATATYPE&amp;gt;string&amp;lt;/DATATYPE&amp;gt;
            &amp;lt;LENGTH&amp;gt;40&amp;lt;/LENGTH&amp;gt;
        &amp;lt;/COLUMN&amp;gt;
         &amp;lt;COLUMN name="category"&amp;gt;
            &amp;lt;PATH syntax="XPath"&amp;gt;/rss/channel/item/category&amp;lt;/PATH&amp;gt;
            &amp;lt;TYPE&amp;gt;character&amp;lt;/TYPE&amp;gt;
            &amp;lt;DATATYPE&amp;gt;string&amp;lt;/DATATYPE&amp;gt;
            &amp;lt;LENGTH&amp;gt;40&amp;lt;/LENGTH&amp;gt;
        &amp;lt;/COLUMN&amp;gt;
    &amp;lt;/TABLE&amp;gt;
&amp;lt;/SXLEMAP&amp;gt;
;
run;

/* WordPress feeds return data in pages, 25 entries at a time        */
/* So using a short macro to loop through past 5 pages, or 125 items */
%macro getItems;
  %do i = 1 %to 5;
  filename feed temp;
  proc http
   method="get"
   url="https://www.dailymaverick.co.za/dmrss?paged=&amp;amp;i."
   out=feed;
  run;
 
  libname result XMLv2 xmlfileref=feed xmlmap=rssmap;
 
  data posts_&amp;amp;i.;
   set result.item;
  run;
  %end;
%mend;
 
%getItems;
 
/* Assemble all pages of entries                       */
/* Cast the date field into a proper SAS date          */
/* Have to strip out the default day name abbreviation */
/* "Wed, 10 Apr 2019 17:36:27 +0000" -&amp;gt; 10APR2019      */
data allPosts ;
 set posts_:;
 length sasPubdate 8;
 sasPubdate = input( substr(pubDate,4),anydtdtm.);
 format sasPubdate dtdate9.;
 drop pubDate;
run;&lt;/LI-CODE&gt;
&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="rssfeed.jpg" style="width: 999px;"&gt;&lt;img src="https://communities.sas.com/t5/image/serverpage/image-id/58902iF0A6371990A23540/image-size/large?v=v2&amp;amp;px=999" role="button" title="rssfeed.jpg" alt="rssfeed.jpg" /&gt;&lt;/span&gt;&lt;/P&gt;</description>
    <pubDate>Fri, 30 Apr 2021 16:15:23 GMT</pubDate>
    <dc:creator>ChrisHemedinger</dc:creator>
    <dc:date>2021-04-30T16:15:23Z</dc:date>
    <item>
      <title>Web Scraping websites with load more button/infinite scrolling</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Web-Scraping-websites-with-load-more-button-infinite-scrolling/m-p/737890#M230087</link>
      <description>&lt;P&gt;Hi everyone, I am trying to scrape a website with a "load more" button. Initially the website shows around 20 articles; in order to see more articles, the user is required to press the "load more" button or scroll down.&lt;/P&gt;
&lt;P&gt;Is there any way we can bypass this and scrape the website?&lt;/P&gt;
&lt;P&gt;This is the website I am trying to scrape:&lt;/P&gt;
&lt;P&gt;&lt;A href="https://www.dailymaverick.co.za/section/world/" target="_blank" rel="noopener"&gt;https://www.dailymaverick.co.za/section/world/&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I just want to extract the main headlines from the articles.&lt;/P&gt;
&lt;P&gt;Thanks.&lt;/P&gt;</description>
      <pubDate>Thu, 29 Apr 2021 13:10:50 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Web-Scraping-websites-with-load-more-button-infinite-scrolling/m-p/737890#M230087</guid>
      <dc:creator>kaziumair</dc:creator>
      <dc:date>2021-04-29T13:10:50Z</dc:date>
    </item>
    <item>
      <title>Re: Web Scraping websites with load more button/infinite scrolling</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Web-Scraping-websites-with-load-more-button-infinite-scrolling/m-p/737921#M230096</link>
      <description>&lt;P&gt;For this site, it looks like you might be able to get an RSS feed (XML):&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;A href="https://www.dailymaverick.co.za/dmrss/" target="_blank"&gt;https://www.dailymaverick.co.za/dmrss/&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I've shared &lt;A href="https://blogs.sas.com/content/sasdummy/2019/04/11/read-rss-feeds/" target="_self"&gt;how to fetch/parse RSS feeds in this article&lt;/A&gt;.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;For fun, I applied the technique to this source:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;LI-CODE lang="sas"&gt;/* Copyright SAS Institute Inc. */

filename rssmap temp;
data _null_;
infile datalines;
file rssmap;
input;
put _infile_;
datalines;
&amp;lt;?xml version="1.0" encoding="windows-1252"?&amp;gt;
&amp;lt;SXLEMAP name="RSSMAP" version="2.1"&amp;gt;
    &amp;lt;NAMESPACES count="0"/&amp;gt;
    &amp;lt;!-- ############################################################ --&amp;gt;
    &amp;lt;TABLE name="item"&amp;gt;
        &amp;lt;TABLE-PATH syntax="XPath"&amp;gt;/rss/channel/item&amp;lt;/TABLE-PATH&amp;gt;
        &amp;lt;COLUMN name="title"&amp;gt;
            &amp;lt;PATH syntax="XPath"&amp;gt;/rss/channel/item/title&amp;lt;/PATH&amp;gt;
            &amp;lt;TYPE&amp;gt;character&amp;lt;/TYPE&amp;gt;
            &amp;lt;DATATYPE&amp;gt;string&amp;lt;/DATATYPE&amp;gt;
            &amp;lt;LENGTH&amp;gt;250&amp;lt;/LENGTH&amp;gt;
        &amp;lt;/COLUMN&amp;gt;
        &amp;lt;COLUMN name="link"&amp;gt;
            &amp;lt;PATH syntax="XPath"&amp;gt;/rss/channel/item/link&amp;lt;/PATH&amp;gt;
            &amp;lt;TYPE&amp;gt;character&amp;lt;/TYPE&amp;gt;
            &amp;lt;DATATYPE&amp;gt;string&amp;lt;/DATATYPE&amp;gt;
            &amp;lt;LENGTH&amp;gt;200&amp;lt;/LENGTH&amp;gt;
        &amp;lt;/COLUMN&amp;gt;
        &amp;lt;COLUMN name="pubDate"&amp;gt;
            &amp;lt;PATH syntax="XPath"&amp;gt;/rss/channel/item/pubDate&amp;lt;/PATH&amp;gt;
            &amp;lt;TYPE&amp;gt;character&amp;lt;/TYPE&amp;gt;
            &amp;lt;DATATYPE&amp;gt;string&amp;lt;/DATATYPE&amp;gt;
            &amp;lt;LENGTH&amp;gt;40&amp;lt;/LENGTH&amp;gt;
        &amp;lt;/COLUMN&amp;gt;
    &amp;lt;/TABLE&amp;gt;
&amp;lt;/SXLEMAP&amp;gt;
;
run;

/* WordPress feeds return data in pages, 25 entries at a time        */
/* So using a short macro to loop through past 5 pages, or 125 items */
%macro getItems;
  %do i = 1 %to 5;
  filename feed temp;
  proc http
   method="get"
   url="https://www.dailymaverick.co.za/dmrss?paged=&amp;amp;i."
   out=feed;
  run;
 
  libname result XMLv2 xmlfileref=feed xmlmap=rssmap;
 
  data posts_&amp;amp;i.;
   set result.item;
  run;
  %end;
%mend;
 
%getItems;
 
/* Assemble all pages of entries                       */
/* Cast the date field into a proper SAS date          */
/* Have to strip out the default day name abbreviation */
/* "Wed, 10 Apr 2019 17:36:27 +0000" -&amp;gt; 10APR2019      */
data allPosts ;
 set posts_:;
 length sasPubdate 8;
 sasPubdate = input( substr(pubDate,4),anydtdtm.);
 format sasPubdate dtdate9.;
 drop pubDate;
run;&lt;/LI-CODE&gt;
&lt;P&gt;Result:&lt;/P&gt;
&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="rssfeed.jpg" style="width: 742px;"&gt;&lt;img src="https://communities.sas.com/t5/image/serverpage/image-id/58866i7F2F1DE7DD663143/image-size/large?v=v2&amp;amp;px=999" role="button" title="rssfeed.jpg" alt="rssfeed.jpg" /&gt;&lt;/span&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 29 Apr 2021 14:00:38 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Web-Scraping-websites-with-load-more-button-infinite-scrolling/m-p/737921#M230096</guid>
      <dc:creator>ChrisHemedinger</dc:creator>
      <dc:date>2021-04-29T14:00:38Z</dc:date>
    </item>
    <item>
      <title>Re: Web Scraping websites with load more button/infinite scrolling</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Web-Scraping-websites-with-load-more-button-infinite-scrolling/m-p/738098#M230182</link>
      <description>Hi, thank you for your guidance. I just wanted to ask whether there is a way to fetch the RSS feed for only a particular section of the website, as this RSS feed seems to fetch data from all the sections available on the website.</description>
      <pubDate>Fri, 30 Apr 2021 05:54:39 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Web-Scraping-websites-with-load-more-button-infinite-scrolling/m-p/738098#M230182</guid>
      <dc:creator>kaziumair</dc:creator>
      <dc:date>2021-04-30T05:54:39Z</dc:date>
    </item>
    <item>
      <title>Re: Web Scraping websites with load more button/infinite scrolling</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Web-Scraping-websites-with-load-more-button-infinite-scrolling/m-p/738195#M230222</link>
      <description>&lt;P&gt;This appears to be a WordPress site, so the options for different categories might be there. However, it will require some exploration. So far I've seen only the one main feed.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;The RSS feed does have a category field. Example:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&amp;lt;category&amp;gt;South Africa&amp;lt;/category&amp;gt;&lt;/PRE&gt;
&lt;P&gt;You could use this in your SAS process to filter items from the data after you fetch it.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Revised code to capture the category:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;LI-CODE lang="sas"&gt;filename rssmap temp;
data _null_;
infile datalines;
file rssmap;
input;
put _infile_;
datalines;
&amp;lt;?xml version="1.0" encoding="windows-1252"?&amp;gt;
&amp;lt;SXLEMAP name="RSSMAP" version="2.1"&amp;gt;
    &amp;lt;NAMESPACES count="0"/&amp;gt;
    &amp;lt;!-- ############################################################ --&amp;gt;
    &amp;lt;TABLE name="item"&amp;gt;
        &amp;lt;TABLE-PATH syntax="XPath"&amp;gt;/rss/channel/item&amp;lt;/TABLE-PATH&amp;gt;
        &amp;lt;COLUMN name="title"&amp;gt;
            &amp;lt;PATH syntax="XPath"&amp;gt;/rss/channel/item/title&amp;lt;/PATH&amp;gt;
            &amp;lt;TYPE&amp;gt;character&amp;lt;/TYPE&amp;gt;
            &amp;lt;DATATYPE&amp;gt;string&amp;lt;/DATATYPE&amp;gt;
            &amp;lt;LENGTH&amp;gt;250&amp;lt;/LENGTH&amp;gt;
        &amp;lt;/COLUMN&amp;gt;
        &amp;lt;COLUMN name="link"&amp;gt;
            &amp;lt;PATH syntax="XPath"&amp;gt;/rss/channel/item/link&amp;lt;/PATH&amp;gt;
            &amp;lt;TYPE&amp;gt;character&amp;lt;/TYPE&amp;gt;
            &amp;lt;DATATYPE&amp;gt;string&amp;lt;/DATATYPE&amp;gt;
            &amp;lt;LENGTH&amp;gt;200&amp;lt;/LENGTH&amp;gt;
        &amp;lt;/COLUMN&amp;gt;
        &amp;lt;COLUMN name="pubDate"&amp;gt;
            &amp;lt;PATH syntax="XPath"&amp;gt;/rss/channel/item/pubDate&amp;lt;/PATH&amp;gt;
            &amp;lt;TYPE&amp;gt;character&amp;lt;/TYPE&amp;gt;
            &amp;lt;DATATYPE&amp;gt;string&amp;lt;/DATATYPE&amp;gt;
            &amp;lt;LENGTH&amp;gt;40&amp;lt;/LENGTH&amp;gt;
        &amp;lt;/COLUMN&amp;gt;
         &amp;lt;COLUMN name="category"&amp;gt;
            &amp;lt;PATH syntax="XPath"&amp;gt;/rss/channel/item/category&amp;lt;/PATH&amp;gt;
            &amp;lt;TYPE&amp;gt;character&amp;lt;/TYPE&amp;gt;
            &amp;lt;DATATYPE&amp;gt;string&amp;lt;/DATATYPE&amp;gt;
            &amp;lt;LENGTH&amp;gt;40&amp;lt;/LENGTH&amp;gt;
        &amp;lt;/COLUMN&amp;gt;
    &amp;lt;/TABLE&amp;gt;
&amp;lt;/SXLEMAP&amp;gt;
;
run;

/* WordPress feeds return data in pages, 25 entries at a time        */
/* So using a short macro to loop through past 5 pages, or 125 items */
%macro getItems;
  %do i = 1 %to 5;
  filename feed temp;
  proc http
   method="get"
   url="https://www.dailymaverick.co.za/dmrss?paged=&amp;amp;i."
   out=feed;
  run;
 
  libname result XMLv2 xmlfileref=feed xmlmap=rssmap;
 
  data posts_&amp;amp;i.;
   set result.item;
  run;
  %end;
%mend;
 
%getItems;
 
/* Assemble all pages of entries                       */
/* Cast the date field into a proper SAS date          */
/* Have to strip out the default day name abbreviation */
/* "Wed, 10 Apr 2019 17:36:27 +0000" -&amp;gt; 10APR2019      */
data allPosts ;
 set posts_:;
 length sasPubdate 8;
 sasPubdate = input( substr(pubDate,4),anydtdtm.);
 format sasPubdate dtdate9.;
 drop pubDate;
run;&lt;/LI-CODE&gt;
&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="rssfeed.jpg" style="width: 999px;"&gt;&lt;img src="https://communities.sas.com/t5/image/serverpage/image-id/58902iF0A6371990A23540/image-size/large?v=v2&amp;amp;px=999" role="button" title="rssfeed.jpg" alt="rssfeed.jpg" /&gt;&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 30 Apr 2021 16:15:23 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Web-Scraping-websites-with-load-more-button-infinite-scrolling/m-p/738195#M230222</guid>
      <dc:creator>ChrisHemedinger</dc:creator>
      <dc:date>2021-04-30T16:15:23Z</dc:date>
    </item>
    <item>
      <title>Re: Web Scraping websites with load more button/infinite scrolling</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Web-Scraping-websites-with-load-more-button-infinite-scrolling/m-p/738486#M230367</link>
      <description>Thanks a lot for your help and guidance</description>
      <pubDate>Mon, 03 May 2021 05:12:09 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Web-Scraping-websites-with-load-more-button-infinite-scrolling/m-p/738486#M230367</guid>
      <dc:creator>kaziumair</dc:creator>
      <dc:date>2021-05-03T05:12:09Z</dc:date>
    </item>
    <item>
      <title>Re: Web Scraping websites with load more button/infinite scrolling</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Web-Scraping-websites-with-load-more-button-infinite-scrolling/m-p/758798#M239663</link>
      <description>&lt;P&gt;Hi, how can I scrape a website with a "load more" button and no RSS feed? Is there a way to call JavaScript that executes the "load more" functionality in SAS?&lt;/P&gt;</description>
      <pubDate>Tue, 03 Aug 2021 13:14:10 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Web-Scraping-websites-with-load-more-button-infinite-scrolling/m-p/758798#M239663</guid>
      <dc:creator>kaziumair</dc:creator>
      <dc:date>2021-08-03T13:14:10Z</dc:date>
    </item>
  </channel>
</rss>

