<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Matching in SAS Procedures</title>
    <link>https://communities.sas.com/t5/SAS-Procedures/Matching/m-p/194156#M48755</link>
    <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;Query Builder is the easiest way to match/join data.&lt;/P&gt;&lt;P&gt;Be sure that you have the corresponding variables/columns in both your tables.&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
    <pubDate>Wed, 04 Mar 2015 14:58:51 GMT</pubDate>
    <dc:creator>LinusH</dc:creator>
    <dc:date>2015-03-04T14:58:51Z</dc:date>
    <item>
      <title>Matching</title>
      <link>https://communities.sas.com/t5/SAS-Procedures/Matching/m-p/194155#M48754</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I have a dataset with 250 records and a dataset of 1.4 million. I created a key which is a concatenation of lastname, last4digitstaxid and zipcode. How can i find how many of the 250 records are there in 1.4 million and what they are. I have SAS EG.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Tom&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Wed, 04 Mar 2015 14:50:15 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Procedures/Matching/m-p/194155#M48754</guid>
      <dc:creator>gobejo</dc:creator>
      <dc:date>2015-03-04T14:50:15Z</dc:date>
    </item>
    <item>
      <title>Re: Matching</title>
      <link>https://communities.sas.com/t5/SAS-Procedures/Matching/m-p/194156#M48755</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;Query Builder is the easiest way to match/join data.&lt;/P&gt;&lt;P&gt;Be sure that you have the corresponding variables/columns in both your tables.&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Wed, 04 Mar 2015 14:58:51 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Procedures/Matching/m-p/194156#M48755</guid>
      <dc:creator>LinusH</dc:creator>
      <dc:date>2015-03-04T14:58:51Z</dc:date>
    </item>
    <item>
      <title>Re: Matching</title>
      <link>https://communities.sas.com/t5/SAS-Procedures/Matching/m-p/194157#M48756</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;I am looking to see if if the dataset of 1.4 million contains any of 250. So the out put range should be within 1-250. Now when i did an inner join, i get 498.&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Wed, 04 Mar 2015 16:20:49 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Procedures/Matching/m-p/194157#M48756</guid>
      <dc:creator>gobejo</dc:creator>
      <dc:date>2015-03-04T16:20:49Z</dc:date>
    </item>
    <item>
      <title>Re: Matching</title>
      <link>https://communities.sas.com/t5/SAS-Procedures/Matching/m-p/194158#M48757</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;Not necessarily. If you have duplicate keys in your 1.4 million records, ALL of them will end up in your output table.&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Wed, 04 Mar 2015 17:24:34 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Procedures/Matching/m-p/194158#M48757</guid>
      <dc:creator>TomKari</dc:creator>
      <dc:date>2015-03-04T17:24:34Z</dc:date>
    </item>
    <item>
      <title>Re: Matching</title>
      <link>https://communities.sas.com/t5/SAS-Procedures/Matching/m-p/194159#M48758</link>
      <description>&lt;HTML&gt;&lt;HEAD&gt;&lt;/HEAD&gt;&lt;BODY&gt;&lt;P&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;Getting a count &amp;gt; 250 means there are some duplicates (as far as the key fields are concerned) in the larger dataset.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;Suggested query:&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;proc sql;&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp; select count (distinct a.last_name||a.ssn4||a.zipcode) as distinct_cnts&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp; from&amp;nbsp;&amp;nbsp; table1 a&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp; inner join table2 b &lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp; on&amp;nbsp;&amp;nbsp;&amp;nbsp; (a.last_name=b.last_name)&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; and (a.ssn4=b.ssn4)&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; and (a.zipcode=b.zipcode);&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;quit;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;Exploring using the MERGE statement.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;Some sample datasets:&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;/*******************/&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;/*** dataset t_a ***/&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;/*******************/&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;data t_a(keep=j k);&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp; do i = 1 to 40;&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; j = ceil(5*ranuni(3));&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; k = ceil(5*ranuni(3));&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; output;&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp; end;&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;run;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;proc sort data=t_a nodupkey; by j k; run; /*** get unique sets of (j,k) ***/&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;/*******************/&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;/*** dataset t_a ***/&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;/*******************/&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;data t_b;&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp; do i = 1 to 4000;&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; j = ceil(50*ranuni(3));&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; k = ceil(50*ranuni(3));&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; output;&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp; end;&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;run;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;proc sort data=t_b; by j k; run;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;/******************************************/&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;/*** calculate the number of (j,k) sets ***/&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;/*** (cnts) present in dataset t_b.&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; ***/&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;/******************************************/&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;data t_e(keep=cnts);&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp; merge t_a(in=a) t_b(in=b) end=n_last;&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp; by j k;&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp; if (first.j or first.k) and (a and b) then cnts+1;&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;&amp;nbsp;&amp;nbsp; if n_last then output;&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;run;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN style="font-family: courier new,courier;"&gt;In the current case, dataset t_e has 1 variable (cnts), 1 row, with cnts = 16.&lt;/SPAN&gt;&lt;/P&gt;&lt;/BODY&gt;&lt;/HTML&gt;</description>
      <pubDate>Wed, 04 Mar 2015 18:11:10 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Procedures/Matching/m-p/194159#M48758</guid>
      <dc:creator>billfish</dc:creator>
      <dc:date>2015-03-04T18:11:10Z</dc:date>
    </item>
  </channel>
</rss>

