<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Using hash tables for pairwise comparing in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255254#M48742</link>
    <description>&lt;P&gt;I have answered my own question. I used SAS' geodist for the example solution.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data test;
 infile datalines delimiter = ',';
 input serial lat long date;
 format date date9.;
 datalines;
 1, 58.6732, 118.2569, 20225
 1, 58.9852, 118.2568, 20226
 1, 58.9865, 118.3156, 20227
 1, 75.259, -52.0023,  20228
 1, 75.2689, -52.125,  20229
 2, 10.123, 15.256,    22111
 2, 10.2365, 14.2698,  22112
 2, 11.3569, 15.3489,  22113
 3, 23.2569, 0.2659,   21256
 3, 22.6789, 0.2788,   21257
 3, 12.3569, 35.126,   21258
 3, 12.3789, 34.679,   21259
 ;
run;

data test;
	set test;
	i = _n_;
run;

proc sql _method;
	create table results (drop = sn date1 j) 
	as select
		a.*,
		b.serial as sn label = 'sn',
		b.lat as lat1 label = 'lat1',
		b.long as long1 label = 'long1',
		b.date as date1 label = 'date1',
		b.i as j,
		geodist(a.lat, a.long, b.lat, b.long) as gdist
	from test as a
	inner join test as b
	on a.serial = b.serial
	where a.i &amp;gt; b.i;
quit; 

proc sort results;
	by serial date;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
    <pubDate>Tue, 08 Mar 2016 15:08:52 GMT</pubDate>
    <dc:creator>dwsmith</dc:creator>
    <dc:date>2016-03-08T15:08:52Z</dc:date>
    <item>
      <title>Using hash tables for pairwise comparing</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255071#M48707</link>
      <description>&lt;P&gt;I have a data set that has a list of serial numbers. For each serial number, I want to run the same calculation without using a macro to loop through each serial number and create a table. There are 37k serial numbers.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;The following code works if I specify one serial number:&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data test;
 infile datalines delimiter = ',';
 input serial lat long;
 datalines;
 1, 58.6732, 118.2569
 1, 58.9852, 118.2568
 1, 58.9865, 118.3156
 1, 75.259, -52.0023
 1, 75.2689, -52.125
 2, 10.123, 15.256
 2, 10.2365, 14.2698
 2, 11.3569, 15.3489
 3, 23.2569, 0.2659
 3, 22.6789, 0.2788
 3, 12.3569, 35.126
 3, 12.3789, 34.679
 ;
run;

%let rad = constant('pi') / 180; /*degrees to radians*/
%let r_km = 6371; /*radius of the earth in km (3959 miles)*/

proc fcmp outlib = funcs.haversine;
 function haversine(lat1, long1, lat2, long2); /*function name*/
 /*function definition*/
 dist = round(&amp;amp;r_km * 2 * arsin(min(1, sqrt(sin((lat2 - lat1) * &amp;amp;rad / 2)**2 + 
 cos(lat1 * &amp;amp;rad) * cos(lat2 * &amp;amp;rad) * sin((long2 - long1) * &amp;amp;rad / 2)**2))), 0.000001);
 return(dist); /*what to return*/
 endsub;
run;
options cmplib = funcs; /*needed to access the function*/

data test;
 set test;
 cnt = _n_;

	/*pairwise compare distance*/
	data results (drop = _rc dt);
		/*set up hash table*/
		if _n_ = 1 then do;
			if 0 then 
				set test(rename = (lat = lat1 long = long2));
			declare hash h1(dataset:"test(rename = (lat = lat1 long = long2, multidata&amp;amp;colon; 'y');
			_rc = h1.definekey('cnt', 'serial');
			_rc = h1.definedata('lat1', 'long1');
			_rc = h1.definedone();
		end;
		if serial = /*first serial number*/ then accum + 1;/*but then the serial number needs to change, I think a macro wouldn't be ideal though*/
		do _i = (cnt + 1) to (cnt + accum);
			_rc = h1.find(key: _i, key: serial);
			hdist = haversine(lat, long, lat1, long1);
			output;
		end;
	run;&lt;/CODE&gt;&lt;/PRE&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;&amp;nbsp;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Can this be generalized to run on whole dataset comparing distance only for the specific serial numbers, recognizing a new number, and starting the process again. Otherwise, I need a macro to subset the major data set 37k times which seems inefficient.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;The gist is, I need to find the distance between gps location 1 with 2, 2 with 3, ... for serial number 1. Then when I get to serial number 2, I need to repeat this process and so on.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;This is a rough sketch of the what I am trying to achieve. I added comments where I explained the intent but haven't figured out the process to achieve the desired results.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/*Tried setting up a macro for the serial loop but this doesn't work
  Here is what I tried for that*/
%let rad = constant('pi') / 180; /*degrees to radians*/
%let r_km = 6371; /*radius of the earth in km (3959 miles)*/

proc fcmp outlib = funcs.haversine;
 function haversine(lat1, long1, lat2, long2); /*function name*/
 /*function definition*/
 dist = round(&amp;amp;r_km * 2 * arsin(min(1, sqrt(sin((lat2 - lat1) * &amp;amp;rad / 2)**2 + 
 cos(lat1 * &amp;amp;rad) * cos(lat2 * &amp;amp;rad) * sin((long2 - long1) * &amp;amp;rad / 2)**2))), 0.000001);
 return(dist); /*what to return*/
 endsub;
run;
options cmplib = funcs; /*needed to access the function*/

data test;
 infile datalines delimiter = ',';
 input serial lat long;
 label serial = 'serial'
 lat = 'lat'
 long = 'long';
 datalines;
 1, 58.6732, 118.2569
 1, 58.9852, 118.2568
 1, 58.9865, 118.3156
 1, 75.259, -52.0023
 1, 75.2689, -52.125
 2, 10.123, 15.256
 2, 10.2365, 14.2698
 2, 11.3569, 15.3489
 3, 23.2569, 0.2659
 3, 22.6789, 0.2788
 3, 12.3569, 35.126
 3, 12.3789, 34.679
 ;
run;

data test;
 set test;
 input cnt;
 label cnt = 'cnt';
 cnt = _n_;
run;

data serials (keep = serial);
 set test;
run;

proc sort nodupkey data = serials;
 by serial;
run;

data _null_;
 set serials nobs = n;
 call symput ('num', n);
run;

options mprint; /*full output for macro execution so you know something is happening*/

%macro serial_loop;
%local i sn;
%do i = 1 %to &amp;amp;num;
 data _null_;
 set serials (obs = &amp;amp;i firstobs = &amp;amp;i);
 call symputx ('sn', serial);
 run;
 if serial = "&amp;amp;sn" then
 accum + 1;
 /*find the distance between the ith and ith + 1 coordinates for each gps location*/
 do _i = (cnt + 1) to (cnt + accum);
 _rc = h1.find(key: _i, key: ser_no);
 if _rc = 0 then do;
 hdist = haversine(lat, long, lat1, long1);
 output;
 end;
 end;
%end;
%mend serial_loop;
data results (drop = _rc);
 /*set up hash table*/
 set test;
 if _n_ = 1 then do;
 if 0 then 
 set test(rename = (lat = lat1 long = long1));
 declare hash h1(dataset: "test(rename = (lat = lat1 
 long = long1))",
 multidata&amp;amp;colon; 'y');
 _rc = h1.definekey('cnt', 'serial');
 _rc = h1.definedata('lat', 'long');
 _rc = h1.definedone();
 end;
 %serial_loop;
run;
 
ERROR: Variable lat is not on file WORK.TEST.&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 08 Mar 2016 13:21:39 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255071#M48707</guid>
      <dc:creator>dwsmith</dc:creator>
      <dc:date>2016-03-08T13:21:39Z</dc:date>
    </item>
    <item>
      <title>Re: Using hash tables for pairwise comparing</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255083#M48711</link>
      <description>&lt;P&gt;Guessing I would say yes. &amp;nbsp;I think you will need to supply &lt;U&gt;data&lt;/U&gt; to go with your code get anyone interested. &amp;nbsp;Also show what the output might look like for a "new" number where process is started over.&lt;/P&gt;</description>
      <pubDate>Mon, 07 Mar 2016 20:28:13 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255083#M48711</guid>
      <dc:creator>data_null__</dc:creator>
      <dc:date>2016-03-07T20:28:13Z</dc:date>
    </item>
    <item>
      <title>Re: Using hash tables for pairwise comparing</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255100#M48717</link>
      <description>&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/72834"&gt;@dwsmith&lt;/a&gt; wrote:&lt;BR /&gt;
&lt;P&gt;I have a data set that has a list of serial numbers. For each serial number, I want to run the same calculation without using a macro to loop through each serial number and create a table. There are 37k serial numbers.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;HR /&gt;&lt;/BLOCKQUOTE&gt;
&lt;P&gt;Are you comparing this only to other items with the same serial number?&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I am not sure what this means at all:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;recognizing a new number, and starting the process again. Otherwise, I need a macro to subset the major data set 37k times which seems inefficient.&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;It will really help if you can provide an example data set with a small number of records, say 5 to 10 and the minimum number of variables to work with and what the output should look like. What little I think I understand might be done with a reflexive join in Proc Sql since it looks like you are only using one data set.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 07 Mar 2016 21:20:29 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255100#M48717</guid>
      <dc:creator>ballardw</dc:creator>
      <dc:date>2016-03-07T21:20:29Z</dc:date>
    </item>
    <item>
      <title>Re: Using hash tables for pairwise comparing</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255221#M48740</link>
      <description>&lt;P&gt;I added a sample data set&lt;/P&gt;</description>
      <pubDate>Tue, 08 Mar 2016 12:32:15 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255221#M48740</guid>
      <dc:creator>dwsmith</dc:creator>
      <dc:date>2016-03-08T12:32:15Z</dc:date>
    </item>
    <item>
      <title>Re: Using hash tables for pairwise comparing</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255222#M48741</link>
      <description>&lt;P&gt;I added a sample data set.&lt;/P&gt;</description>
      <pubDate>Tue, 08 Mar 2016 12:32:44 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255222#M48741</guid>
      <dc:creator>dwsmith</dc:creator>
      <dc:date>2016-03-08T12:32:44Z</dc:date>
    </item>
    <item>
      <title>Re: Using hash tables for pairwise comparing</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255254#M48742</link>
      <description>&lt;P&gt;I have answered my own question. I used SAS' geodist for the example solution.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;data test;
 infile datalines delimiter = ',';
 input serial lat long date;
 format date date9.;
 datalines;
 1, 58.6732, 118.2569, 20225
 1, 58.9852, 118.2568, 20226
 1, 58.9865, 118.3156, 20227
 1, 75.259, -52.0023,  20228
 1, 75.2689, -52.125,  20229
 2, 10.123, 15.256,    22111
 2, 10.2365, 14.2698,  22112
 2, 11.3569, 15.3489,  22113
 3, 23.2569, 0.2659,   21256
 3, 22.6789, 0.2788,   21257
 3, 12.3569, 35.126,   21258
 3, 12.3789, 34.679,   21259
 ;
run;

data test;
	set test;
	i = _n_;
run;

proc sql _method;
	create table results (drop = sn date1 j) 
	as select
		a.*,
		b.serial as sn label = 'sn',
		b.lat as lat1 label = 'lat1',
		b.long as long1 label = 'long1',
		b.date as date1 label = 'date1',
		b.i as j,
		geodist(a.lat, a.long, b.lat, b.long) as gdist
	from test as a
	inner join test as b
	on a.serial = b.serial
	where a.i &amp;gt; b.i;
quit; 

proc sort results;
	by serial date;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Tue, 08 Mar 2016 15:08:52 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255254#M48742</guid>
      <dc:creator>dwsmith</dc:creator>
      <dc:date>2016-03-08T15:08:52Z</dc:date>
    </item>
    <item>
      <title>Re: Using hash tables for pairwise comparing</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255275#M48744</link>
      <description>&lt;P&gt;And that solution uses a reflexive join ...&lt;/P&gt;</description>
      <pubDate>Tue, 08 Mar 2016 15:46:42 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255275#M48744</guid>
      <dc:creator>ballardw</dc:creator>
      <dc:date>2016-03-08T15:46:42Z</dc:date>
    </item>
    <item>
      <title>Re: Using hash tables for pairwise comparing</title>
      <link>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255277#M48746</link>
      <description>&lt;P&gt;I am not sure if this solution is good for a data set with 15million rows though.&lt;/P&gt;</description>
      <pubDate>Tue, 08 Mar 2016 15:47:41 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/Using-hash-tables-for-pairwise-comparing/m-p/255277#M48746</guid>
      <dc:creator>dwsmith</dc:creator>
      <dc:date>2016-03-08T15:47:41Z</dc:date>
    </item>
  </channel>
</rss>

