<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Removing duplicate observations in-place in large datasets in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/Removing-duplicate-observations-in-place-in-large-datasets/m-p/444026#M111120</link>
    <description>&lt;P&gt;Using the index to get at the duplicates is a very good idea, you may try something like this:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data big;
  set dupes(keep=&amp;lt;key variables&amp;gt;);
  first=1;
  do until(0);
    modify big key=&amp;lt;your index name&amp;gt;;
    if _iorc_ then do;
       _error_=0; /* when _iorc_ is set, an error is provoked */
       leave;
       end;
    if not first then 
      remove;
    else 
      first=0; /* the next obs is not the first */
    end;
run;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Fri, 09 Mar 2018 08:22:05 GMT</pubDate>
    <dc:creator>s_lassen</dc:creator>
    <dc:date>2018-03-09T08:22:05Z</dc:date>
    <item>
      <title>Removing duplicate observations in-place in large datasets</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Removing-duplicate-observations-in-place-in-large-datasets/m-p/443851#M111084</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I am working with some large datasets (billions of obs) and found there are duplicates of the key values, of which I want to remove all but the first.&amp;nbsp; The file is sorted by key variables (say A and B) and is indexed by them as well (call it myndx).&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;For the first dataset I am working with, I have a list of the key values which have dupes.&amp;nbsp; I was thinking I could use a MODIFY statement with the key= option to step through it, removing dupes.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;The Statements Reference manual refers to using a DO loop to process successive copies but gives no examples.&amp;nbsp; Has anyone already done this who can share the details of the method used, or suggest an alternate approach?&amp;nbsp; The preference is to use the existing file(s) in-place rather than a conventional DATA step replacement.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Thanks!&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;--Ben&lt;/P&gt;</description>
      <pubDate>Thu, 08 Mar 2018 18:20:00 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Removing-duplicate-observations-in-place-in-large-datasets/m-p/443851#M111084</guid>
      <dc:creator>bconner</dc:creator>
      <dc:date>2018-03-08T18:20:00Z</dc:date>
    </item>
    <item>
      <title>Re: Removing duplicate observations in-place in large datasets</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Removing-duplicate-observations-in-place-in-large-datasets/m-p/443853#M111086</link>
      <description>&lt;P&gt;Does the table exist in a database, or is it a SAS table?&lt;/P&gt;&lt;P&gt;If it's a database table, then try using a pass-through query by sending your table, which contains the duplicate values, into the database.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 08 Mar 2018 18:32:08 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Removing-duplicate-observations-in-place-in-large-datasets/m-p/443853#M111086</guid>
      <dc:creator>SuryaKiran</dc:creator>
      <dc:date>2018-03-08T18:32:08Z</dc:date>
    </item>
    <item>
      <title>Re: Removing duplicate observations in-place in large datasets</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Removing-duplicate-observations-in-place-in-large-datasets/m-p/443864#M111087</link>
      <description>&lt;P&gt;Everything is in SAS...&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;--Ben&lt;/P&gt;</description>
      <pubDate>Thu, 08 Mar 2018 19:04:16 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Removing-duplicate-observations-in-place-in-large-datasets/m-p/443864#M111087</guid>
      <dc:creator>bconner</dc:creator>
      <dc:date>2018-03-08T19:04:16Z</dc:date>
    </item>
    <item>
      <title>Re: Removing duplicate observations in-place in large datasets</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Removing-duplicate-observations-in-place-in-large-datasets/m-p/443885#M111091</link>
      <description>&lt;P&gt;Something like the code below should work.&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data dupList;
  do id=2,5,7,9;
    output;
  end;
  stop;
run;

data big;
  var='some string';
  do id=2,7;
    output;
  end;
  do id=1 to 10;
    output;
  end;
  do id=2,5,7,9;
    output;
  end;
  stop;
run;

data big;
  if _n_=1 then
    do;
      dcl hash dupList(dataset:'dupList');
      _rc=dupList.defineKey('id');
      _rc=dupList.defineDone();
      dcl hash remList(dataset:'dupList(obs=0)');
      _rc=remList.defineKey('id');
      _rc=remList.defineDone();
    end;

  modify big;

  if remList.check()=0 then 
    do;
      remove;
    end;
  else if dupList.check()=0 then 
    do;
      _rc=remList.add();
      _rc=dupList.remove();
    end;

run;

proc print data=big;
run;

&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Thu, 08 Mar 2018 20:37:58 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Removing-duplicate-observations-in-place-in-large-datasets/m-p/443885#M111091</guid>
      <dc:creator>Patrick</dc:creator>
      <dc:date>2018-03-08T20:37:58Z</dc:date>
    </item>
    <item>
      <title>Re: Removing duplicate observations in-place in large datasets</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Removing-duplicate-observations-in-place-in-large-datasets/m-p/443890#M111092</link>
      <description>&lt;P&gt;And should you have sufficient memory to hold all the keys, then you could also take a more direct approach.&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data big;
  if _n_=1 then
    do;
      dcl hash dupList();
      _rc=dupList.defineKey('id');
      _rc=dupList.defineDone();
    end;

  modify big;

  if dupList.check()=0 then 
    do;
      remove;
    end;
  else dupList.add();

run;

proc print data=big;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Thu, 08 Mar 2018 20:45:31 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Removing-duplicate-observations-in-place-in-large-datasets/m-p/443890#M111092</guid>
      <dc:creator>Patrick</dc:creator>
      <dc:date>2018-03-08T20:45:31Z</dc:date>
    </item>
    <item>
      <title>Re: Removing duplicate observations in-place in large datasets</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Removing-duplicate-observations-in-place-in-large-datasets/m-p/444026#M111120</link>
      <description>&lt;P&gt;Using the index to get at the duplicates is a very good idea, you may try something like this:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data big;
  set dupes(keep=&amp;lt;key variables&amp;gt;);
  first=1;
  do until(0);
    modify big key=&amp;lt;your index name&amp;gt;;
    if _iorc_ then do;
       _error_=0; /* when _iorc_ is set, an error is provoked */
       leave;
       end;
    if not first then 
      remove;
    else 
      first=0; /* the next obs is not the first */
    end;
run;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 09 Mar 2018 08:22:05 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Removing-duplicate-observations-in-place-in-large-datasets/m-p/444026#M111120</guid>
      <dc:creator>s_lassen</dc:creator>
      <dc:date>2018-03-09T08:22:05Z</dc:date>
    </item>
  </channel>
</rss>

