<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Dealing with duplicate sets of rows in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/Dealing-with-duplicate-sets-of-rows/m-p/230203#M41733</link>
    <description>&lt;P&gt;This should give you what you need ...&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;DATA have;
  input ID Date :MMDDYY10. Event $ Score;
  format date DATE9.;
  datalines;
  1 10/1/2012 1    5
  1 10/1/2012 2    3
  1 10/1/2012 3    7
  1 10/1/2012 4    2
  1 10/1/2012 5    3
  1 10/1/2012 tot 17
  1 10/1/2012 tot 14
  2 9/7/2014  1    3
  2 9/7/2014  1    .
  2 9/7/2014  2    7
  2 9/7/2014  2    .
;
RUN;


PROC SQL;
  CREATE TABLE want AS 
  SELECT ID
       , Date
       , Event
       , MAX(Score)            AS Max_Score
         /* other helper variables are listed below ... */
       , NMISS(Score)          AS Num_Missing_Score
       , COUNT(DISTINCT Score) AS Num_Distinct_Score
  FROM have
  GROUP BY ID, Date, Event;
QUIT;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Here is the output:&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;IMG src="https://communities.sas.com/t5/image/serverpage/image-id/515i0CB92C42AC89DE87/image-size/original?v=mpbl-1&amp;amp;px=-1" border="0" alt="wanted_distinct.gif" title="wanted_distinct.gif" /&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Fri, 16 Oct 2015 00:35:22 GMT</pubDate>
    <dc:creator>hbi</dc:creator>
    <dc:date>2015-10-16T00:35:22Z</dc:date>
    <item>
      <title>Dealing with duplicate sets of rows</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Dealing-with-duplicate-sets-of-rows/m-p/230091#M41707</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I have a dataset in which each row is a person's score for a given event on a given date. People have 5 events in each day, as well as a total score which incorporates the 5 event scores but is not a straight sum. People can be in the dataset on multiple days.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;ID &amp;nbsp; &amp;nbsp; &amp;nbsp;Date &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;Event &amp;nbsp; &amp;nbsp; &amp;nbsp;Score&lt;/P&gt;&lt;P&gt;1 &amp;nbsp; &amp;nbsp;10/1/2012 &amp;nbsp; &amp;nbsp; 1 &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;5&lt;/P&gt;&lt;P&gt;1 &amp;nbsp; &amp;nbsp;10/1/2012 &amp;nbsp; &amp;nbsp; 2 &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;3&lt;/P&gt;&lt;P&gt;1 &amp;nbsp; &amp;nbsp;10/1/2012 &amp;nbsp; &amp;nbsp; 3 &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;7&lt;/P&gt;&lt;P&gt;1 &amp;nbsp; &amp;nbsp;10/1/2012 &amp;nbsp; &amp;nbsp; 4 &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;2&lt;/P&gt;&lt;P&gt;1 &amp;nbsp; &amp;nbsp;10/1/2012 &amp;nbsp; &amp;nbsp; 5 &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;3&lt;/P&gt;&lt;P&gt;1 &amp;nbsp; &amp;nbsp;10/1/2012 &amp;nbsp; &amp;nbsp;tot &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;17&lt;/P&gt;&lt;P&gt;1 &amp;nbsp; &amp;nbsp;10/1/2012 &amp;nbsp; &amp;nbsp;tot &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;14&lt;/P&gt;&lt;P&gt;2 &amp;nbsp; &amp;nbsp;9/7/2014 &amp;nbsp; &amp;nbsp; &amp;nbsp; 1 &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;3&lt;/P&gt;&lt;P&gt;2 &amp;nbsp; &amp;nbsp;9/7/2014 &amp;nbsp; &amp;nbsp; &amp;nbsp; 1 &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; null&lt;/P&gt;&lt;P&gt;2 &amp;nbsp; &amp;nbsp;9/7/2014 &amp;nbsp; &amp;nbsp; &amp;nbsp; 2 &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; 7&lt;/P&gt;&lt;P&gt;2 &amp;nbsp; &amp;nbsp;9/7/2014 &amp;nbsp; &amp;nbsp; &amp;nbsp; 2 &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; null&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I'm trying to clean the data but it has 3 problems:&lt;/P&gt;&lt;P&gt;&amp;nbsp; 1. One person will have multiple sets of events on one day, in which one set has values and one set is null/missing&lt;/P&gt;&lt;P&gt;&amp;nbsp; 2. One person will have multiple sets of events on one day, in which both sets have identical values&lt;/P&gt;&lt;P&gt;&amp;nbsp; 3. One person will have multiple sets of events in one day, in which both sets have different values&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;This is my solution to 1 and 2:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Proc sort data=have;&lt;/P&gt;&lt;P&gt;By ID date&amp;nbsp;event&amp;nbsp;descending score; Run;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Data want;&lt;/P&gt;&lt;P&gt;Set have;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;By ID &lt;/SPAN&gt;&lt;SPAN&gt;date&lt;/SPAN&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;SPAN&gt;event&amp;nbsp;&lt;/SPAN&gt;&lt;SPAN&gt;descending &lt;/SPAN&gt;&lt;SPAN&gt;score&lt;/SPAN&gt;&lt;SPAN&gt;;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;If first.event&amp;nbsp;output want;&lt;/P&gt;&lt;P&gt;Run;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;That should output the first of two identical rows or a non-missing row if one set of responses has values and the other set is missing. However, it does not address the third problem. I know I can't get rid of a row at random so if I could maybe create a flag or some sort of indicator that would show that the person had two distinct scores&amp;nbsp;for that event, that would be helpful.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Additionally, when I was going through the data, the ones that I noticed people having multiple scores on the same day for were the total scores. If event=tot were the only ones for which a person had two distinct values of score, I could just delete the event=tot rows altogether, transpose the data, then calculate the total scores myself. However, I would need to be able to verify that it was only the event=tot rows that had two distinct responses, and I'm not sure how to do that.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Any help is much appreciated.&lt;/P&gt;</description>
      <pubDate>Thu, 15 Oct 2015 14:18:24 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Dealing-with-duplicate-sets-of-rows/m-p/230091#M41707</guid>
      <dc:creator>Walternate</dc:creator>
      <dc:date>2015-10-15T14:18:24Z</dc:date>
    </item>
    <item>
      <title>Re: Dealing with duplicate sets of rows</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Dealing-with-duplicate-sets-of-rows/m-p/230098#M41709</link>
      <description>&lt;P&gt;Are you saying that the stored total that is not a&amp;nbsp;straight sum is wrong, and your&amp;nbsp;calculation would be better?&lt;/P&gt;&lt;P&gt;First, why store totals, if they are, totals.&amp;nbsp;That's the job for the reporting procedure.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Second, go back to your source to get better data, or at least more details, Maybe ther's a timestamp for each record, and you could use that for a cleansing rule.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Other than that, there is no silver&amp;nbsp;bullet - you need to create a rule, and communicate that very clear to any person looking at this data/report. If you chose the first/last/medium, or just set it to missing/null -&amp;nbsp;pretty much up to business requirements.&lt;/P&gt;</description>
      <pubDate>Thu, 15 Oct 2015 15:06:46 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Dealing-with-duplicate-sets-of-rows/m-p/230098#M41709</guid>
      <dc:creator>LinusH</dc:creator>
      <dc:date>2015-10-15T15:06:46Z</dc:date>
    </item>
    <item>
      <title>Re: Dealing with duplicate sets of rows</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Dealing-with-duplicate-sets-of-rows/m-p/230203#M41733</link>
      <description>&lt;P&gt;This should give you what you need ...&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;DATA have;
  input ID Date :MMDDYY10. Event $ Score;
  format date DATE9.;
  datalines;
  1 10/1/2012 1    5
  1 10/1/2012 2    3
  1 10/1/2012 3    7
  1 10/1/2012 4    2
  1 10/1/2012 5    3
  1 10/1/2012 tot 17
  1 10/1/2012 tot 14
  2 9/7/2014  1    3
  2 9/7/2014  1    .
  2 9/7/2014  2    7
  2 9/7/2014  2    .
;
RUN;


PROC SQL;
  CREATE TABLE want AS 
  SELECT ID
       , Date
       , Event
       , MAX(Score)            AS Max_Score
         /* other helper variables are listed below ... */
       , NMISS(Score)          AS Num_Missing_Score
       , COUNT(DISTINCT Score) AS Num_Distinct_Score
  FROM have
  GROUP BY ID, Date, Event;
QUIT;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Here is the output:&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;IMG src="https://communities.sas.com/t5/image/serverpage/image-id/515i0CB92C42AC89DE87/image-size/original?v=mpbl-1&amp;amp;px=-1" border="0" alt="wanted_distinct.gif" title="wanted_distinct.gif" /&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 16 Oct 2015 00:35:22 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Dealing-with-duplicate-sets-of-rows/m-p/230203#M41733</guid>
      <dc:creator>hbi</dc:creator>
      <dc:date>2015-10-16T00:35:22Z</dc:date>
    </item>
  </channel>
</rss>

