<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: delete entries for which the spread is 50 times the median spread in SAS Data Management</title>
    <link>https://communities.sas.com/t5/SAS-Data-Management/delete-entries-for-which-the-spread-is-50-times-the-median/m-p/457313#M14147</link>
    <description>&lt;P&gt;As an update,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I seem to have figured out my first issue:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sql;
create table want as
select *, median(RateBid) as median_bid, median(RateAsk) as median_ask
from work.example
group by Time;
quit;


data want;
  set want;
  by Time;
  if last.Time;   
run;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;so calculate the median for all observations (which is just the observation if there are no duplicates) and then I select the last one of each observation. So for all time stamps only one observation (either the observation itself or the median) remains. Feel free to correct this, if I overlook something!&lt;/P&gt;</description>
    <pubDate>Wed, 25 Apr 2018 14:37:25 GMT</pubDate>
    <dc:creator>NewSASuser2018</dc:creator>
    <dc:date>2018-04-25T14:37:25Z</dc:date>
    <item>
      <title>delete entries for which the spread is 50 times the median spread</title>
      <link>https://communities.sas.com/t5/SAS-Data-Management/delete-entries-for-which-the-spread-is-50-times-the-median/m-p/457221#M14142</link>
      <description>&lt;P&gt;Hello sas community!&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Many thanks again for beeing very helpful regarding my last problem. The solution worked marvelously!&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;My issue is the following. I have a large dataset containing ultra high frequency data (tick data):&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;DIV&gt;Time&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; RateBid&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; RateAsk .....&lt;/DIV&gt;&lt;DIV&gt;01.01.2015:17:12:12.445&amp;nbsp;&amp;nbsp; xxxxxxxxxx&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; xxxxxxxxx&lt;BR /&gt;01.01.2015:17:13:32.565&amp;nbsp;&amp;nbsp; xxxxxxxxxx&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; xxxxxxxxx&lt;BR /&gt;01.01.2015:17:13:40.685&amp;nbsp;&amp;nbsp; xxxxxxxxxx&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; xxxxxxxxx&lt;BR /&gt;01.01.2015:17:14:59.895&amp;nbsp;&amp;nbsp; 1.32473 &amp;nbsp; &amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 1.32487&lt;BR /&gt;01.01.2015:17:14:59.895 &amp;nbsp; 1.86743 &amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 1.97473&lt;/DIV&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;An example.csv 
is attached below. I have already removed many obvious data anomalies and now want to filter for outliers as suggested in the literature.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;1) For quotes that have the same time stamp (in HH:MM: SS : SSS precision) I want to detect duplicates (see the last to lines of my illustration above). For each of these duplicates I want to keep the median of RateBid and RateAsk for the respective timestamps respectively. So something like that:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;DIV&gt;Time&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; RateBid&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; RateAsk .....&lt;/DIV&gt;&lt;DIV&gt;01.01.2015:17:12:12.445&amp;nbsp;&amp;nbsp; xxxxxxxxxx&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; xxxxxxxxx&lt;BR /&gt;01.01.2015:17:13:32.565&amp;nbsp;&amp;nbsp; xxxxxxxxxx&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; xxxxxxxxx&lt;BR /&gt;01.01.2015:17:13:40.685&amp;nbsp;&amp;nbsp; xxxxxxxxxx&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; xxxxxxxxx&lt;BR /&gt;01.01.2015:17:14:59.895&amp;nbsp;&amp;nbsp; medRateBid&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; medRateAsk&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;P&gt;&lt;FONT face="arial,helvetica,sans-serif"&gt;My attempt was 
this:&lt;/FONT&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;FONT face="arial,helvetica,sans-serif"&gt;&lt;CODE class=" language-sas"&gt;proc sql;
   title 'Duplicate Rows in DUPLICATES Table';
   select *, count(*) as Count
      from &lt;FONT face="arial,helvetica,sans-serif"&gt;example&lt;/FONT&gt;
      group by Time
      having count(*) &amp;gt; 1;
run;&lt;/CODE&gt;&lt;/FONT&gt;&lt;/PRE&gt;&lt;P&gt;&lt;FONT face="arial,helvetica,sans-serif"&gt;So I was able to at least see what timestamps feature duplicates. Online I found many solutions for detecting and deleting duplicates using proc sort and e.g. the nodupkey option, however, I did not find any that suits my purpose. &lt;/FONT&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;FONT face="arial,helvetica,sans-serif"&gt;As a second step I want to perform a manipulation regarding which I struggle understanding sas's syntax.&lt;/FONT&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;FONT face="arial,helvetica,sans-serif"&gt;For each day delete the entry for which the spread is more than 50 times the median spread on that day. My attempt is this:&lt;/FONT&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data example;
set example;
spread = RateAsk - RateBid;
run;

data example;
set example;
t=datepart(Time);
t1=timepart(Time);
format t date9. t1 time.;
run;

data example;
set example;
by t;
medspread= median(spread);
run;

data example;
set example;
if spread &amp;gt; 50*medspread then delete;
run;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;This approach is erroneous, however. I don't get an error message but sas does not compute the median of the variable "spread" for each day. I must have got the indexing wrong.&lt;/P&gt;&lt;P&gt;After this step, to make sure, I want to delete all entries for which the so-called mid-quote deviated by more than 10 mean absolute deviations from a rolling centered median (excluding the observation under consideration) of the 50 observations around the one considered.&amp;nbsp; Here to be honest, I cannot figure out how to construct such a measure in sas. Generally I am used to working with matlab where I would just construct a loop and use the running index to index into the matrix elements. I know, however, that this is not the way to go in sas (that I know).&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I hope you can help me with my issue. Thank you very much in advance!&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Kind regards&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 25 Apr 2018 11:38:50 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Data-Management/delete-entries-for-which-the-spread-is-50-times-the-median/m-p/457221#M14142</guid>
      <dc:creator>NewSASuser2018</dc:creator>
      <dc:date>2018-04-25T11:38:50Z</dc:date>
    </item>
    <item>
      <title>Re: delete entries for which the spread is 50 times the median spread</title>
      <link>https://communities.sas.com/t5/SAS-Data-Management/delete-entries-for-which-the-spread-is-50-times-the-median/m-p/457294#M14144</link>
      <description>Any help with any of the problems (technically there are 3) is greatly appreciated!&lt;BR /&gt;&lt;BR /&gt;Best</description>
      <pubDate>Wed, 25 Apr 2018 14:05:07 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Data-Management/delete-entries-for-which-the-spread-is-50-times-the-median/m-p/457294#M14144</guid>
      <dc:creator>NewSASuser2018</dc:creator>
      <dc:date>2018-04-25T14:05:07Z</dc:date>
    </item>
    <item>
      <title>Re: delete entries for which the spread is 50 times the median spread</title>
      <link>https://communities.sas.com/t5/SAS-Data-Management/delete-entries-for-which-the-spread-is-50-times-the-median/m-p/457313#M14147</link>
      <description>&lt;P&gt;As an update,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I seem to have figured out my first issue:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;proc sql;
create table want as
select *, median(RateBid) as median_bid, median(RateAsk) as median_ask
from work.example
group by Time;
quit;


data want;
  set want;
  by Time;
  if last.Time;   
run;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;so calculate the median for all observations (which is just the observation if there are no duplicates) and then I select the last one of each observation. So for all time stamps only one observation (either the observation itself or the median) remains. Feel free to correct this, if I overlook something!&lt;/P&gt;</description>
      <pubDate>Wed, 25 Apr 2018 14:37:25 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Data-Management/delete-entries-for-which-the-spread-is-50-times-the-median/m-p/457313#M14147</guid>
      <dc:creator>NewSASuser2018</dc:creator>
      <dc:date>2018-04-25T14:37:25Z</dc:date>
    </item>
  </channel>
</rss>

