<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Merge datasets with fuzzy matches in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/Merge-datasets-with-fuzzy-matches/m-p/348998#M80888</link>
    <description>&lt;P&gt;Thank you, very much.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I will test it out.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;TS&lt;/P&gt;</description>
    <pubDate>Tue, 11 Apr 2017 07:08:02 GMT</pubDate>
    <dc:creator>TingSern</dc:creator>
    <dc:date>2017-04-11T07:08:02Z</dc:date>
    <item>
      <title>Merge datasets with fuzzy matches</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Merge-datasets-with-fuzzy-matches/m-p/348986#M80879</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Supposed I have two SAS datasets - call these JOBS and DATA.&amp;nbsp; They have two common variables TIME and NAME.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I sort JOBS and DATA by TIME and then NAME.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;So, I code -&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Data Combined;&lt;/P&gt;&lt;P&gt;&amp;nbsp; Merge JOBS DATA;&lt;/P&gt;&lt;P&gt;&amp;nbsp; by TIME NAME;&lt;/P&gt;&lt;P&gt;Run;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;This assumes that the TIME matches exactly for both JOBS and DATA.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;My problem is that TIME for both JOBS and DATA are not "EQUAL" - but can be off by up to 10 seconds, let's say.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;There is another variable in both datasets (NAME) - that can be used to match as well.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;For example - in JOBS -&lt;/P&gt;&lt;P&gt;TIME&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;nbsp;&amp;nbsp; NAME&amp;nbsp;&amp;nbsp;&amp;nbsp; PET&lt;/P&gt;&lt;P&gt;120005&amp;nbsp;&amp;nbsp; ABC&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; CAT&lt;/P&gt;&lt;P&gt;120007&amp;nbsp;&amp;nbsp; ABD&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; DOG&lt;/P&gt;&lt;P&gt;120403&amp;nbsp;&amp;nbsp; ABC&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; COW&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;In DATA -&lt;/P&gt;&lt;P&gt;TIME&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; NAME&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; ADDRESS&lt;/P&gt;&lt;P&gt;120003&amp;nbsp;&amp;nbsp;&amp;nbsp; ABC&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; London&lt;/P&gt;&lt;P&gt;120008&amp;nbsp;&amp;nbsp;&amp;nbsp; ABD&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Paris&lt;/P&gt;&lt;P&gt;120400&amp;nbsp;&amp;nbsp;&amp;nbsp; ABC&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Warsaw&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Combined output -&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;TIME&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; NAME&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; PET&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; ADDRESS&lt;/P&gt;&lt;P&gt;120005&amp;nbsp;&amp;nbsp;&amp;nbsp; ABC&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; CAT&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; London&lt;/P&gt;&lt;P&gt;120007&amp;nbsp;&amp;nbsp;&amp;nbsp; ABD&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; DOG&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Paris&lt;/P&gt;&lt;P&gt;120403&amp;nbsp;&amp;nbsp;&amp;nbsp; ABC&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; COW&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; Warsaw&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Is there a way to do this?&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Thank you.&lt;/P&gt;</description>
      <pubDate>Tue, 11 Apr 2017 06:23:26 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Merge-datasets-with-fuzzy-matches/m-p/348986#M80879</guid>
      <dc:creator>TingSern</dc:creator>
      <dc:date>2017-04-11T06:23:26Z</dc:date>
    </item>
    <item>
      <title>Re: Merge datasets with fuzzy matches</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Merge-datasets-with-fuzzy-matches/m-p/348997#M80887</link>
      <description>&lt;P&gt;Many ways to do this but here is one &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data JOBS;
input TIME NAME $ PET $;
datalines;
120005 ABC CAT
120007 ABD DOG
120403 ABC COW
;

data DATA;
input TIME NAME $ ADRESS $;
datalines;
120003 ABC London
120008 ABD Paris
120400 ABC Warsaw
;


proc sort data = JOBS;by TIME NAME;run;
proc sort data = DATA;by TIME NAME;run;

data want;
	format TIME NAME PET ADRESS;
	merge JOBS(rename=(TIME=JOBS_TIME NAME = JOBS_NAME)) 
		  DATA(rename=(TIME=DATA_TIME));

	if (JOBS_TIME - DATA_TIME) &amp;lt;= 10 then do;
		TIME = JOBS_TIME;
		if NAME = JOBS_NAME then output;
	end;
	else if (DATA_TIME - JOBS_TIME) &amp;lt;= 10 then do;
		TIME = DATA_TIME;
		if NAME = JOBS_NAME then output;
	end;
	
	drop DATA_TIME JOBS_TIME JOBS_NAME;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Tue, 11 Apr 2017 07:09:00 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Merge-datasets-with-fuzzy-matches/m-p/348997#M80887</guid>
      <dc:creator>PeterClemmensen</dc:creator>
      <dc:date>2017-04-11T07:09:00Z</dc:date>
    </item>
    <item>
      <title>Re: Merge datasets with fuzzy matches</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Merge-datasets-with-fuzzy-matches/m-p/348998#M80888</link>
      <description>&lt;P&gt;Thank you, very much.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I will test it out.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;TS&lt;/P&gt;</description>
      <pubDate>Tue, 11 Apr 2017 07:08:02 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Merge-datasets-with-fuzzy-matches/m-p/348998#M80888</guid>
      <dc:creator>TingSern</dc:creator>
      <dc:date>2017-04-11T07:08:02Z</dc:date>
    </item>
    <item>
      <title>Re: Merge datasets with fuzzy matches</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Merge-datasets-with-fuzzy-matches/m-p/348999#M80889</link>
      <description>&lt;P&gt;Just edited the code a bit &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt; Test that instead&lt;/P&gt;</description>
      <pubDate>Tue, 11 Apr 2017 07:09:32 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Merge-datasets-with-fuzzy-matches/m-p/348999#M80889</guid>
      <dc:creator>PeterClemmensen</dc:creator>
      <dc:date>2017-04-11T07:09:32Z</dc:date>
    </item>
    <item>
      <title>Re: Merge datasets with fuzzy matches</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Merge-datasets-with-fuzzy-matches/m-p/349033#M80908</link>
      <description>&lt;P&gt;The question here really is what is going to be your fixed logic. &amp;nbsp;For instance if you have several rows which are all within 10 seconds of multiple timepoints, what then, is it a many to many merge it that case, i.e. you clause would be:&lt;/P&gt;
&lt;P&gt;from HAVE1 A&lt;/P&gt;
&lt;P&gt;full join HAVE2 B&lt;/P&gt;
&lt;P&gt;on &amp;nbsp; &amp;nbsp;A.NAME=B.NAME&lt;/P&gt;
&lt;P&gt;and &amp;nbsp;A.TIME-10 &amp;lt;= B.TIME &amp;lt;= A.TIME+10&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;However if you want 1 row per item, tehn it becomes more difficult, would you take the first time occuring within that group in the window, or the last/mean etc.In your example, 120005 is within 10 seconds of both 120003 and 120008. &amp;nbsp;You seem to take the lowest, but do confirm. &amp;nbsp;If so, then maybe something like:&lt;/P&gt;
&lt;PRE&gt;proc sql;
  create table WANT as
  select A.*,
         B.ADDRESS
  from   HAVE1 A
  left join (select NAME,ADDRESS from (select * from HAVE2 where TIME-10 &amp;lt;= A.TIME &amp;lt;= TIME+10) group by NAME,ADDRESS having TIME=min(TIME)) B
  on     A.NAME=B.NAME;
quit;
           &lt;/PRE&gt;
&lt;P&gt;Also,&amp;nbsp;&lt;STRONG&gt;Post test data in the form of a datastep!&lt;/STRONG&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 11 Apr 2017 09:23:18 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Merge-datasets-with-fuzzy-matches/m-p/349033#M80908</guid>
      <dc:creator>RW9</dc:creator>
      <dc:date>2017-04-11T09:23:18Z</dc:date>
    </item>
    <item>
      <title>Re: Merge datasets with fuzzy matches</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Merge-datasets-with-fuzzy-matches/m-p/349083#M80934</link>
      <description>&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;
data JOBS;
input TIME NAME $ PET $;
datalines;
120005 ABC CAT
120007 ABD DOG
120403 ABC COW
;

data DATA;
input TIME NAME $ ADRESS $;
datalines;
120003 ABC London
120008 ABD Paris
120400 ABC Warsaw
;

proc sql;
select a.*,b.adress
 from jobs as a left jion data as b
  on a.name=b.name and b.time between a.time-10 and a.time+10;
quit;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Tue, 11 Apr 2017 13:00:20 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Merge-datasets-with-fuzzy-matches/m-p/349083#M80934</guid>
      <dc:creator>Ksharp</dc:creator>
      <dc:date>2017-04-11T13:00:20Z</dc:date>
    </item>
  </channel>
</rss>

