<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: How to clean duplicate records based on minor typos in SAS Programming</title>
    <link>https://communities.sas.com/t5/SAS-Programming/How-to-clean-duplicate-records-based-on-minor-typos/m-p/868194#M342936</link>
    <description>&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/*
It is fuzz matched problem.
*/
data have;
infile cards expandtabs;
input (Record	FirstName	LastName	Gender) ($);
cards;
009	Jason	Aman	M
009	Jazon	Aman	M
100	Marshall	Reid	F
101	Reed	Jones	M
102	Aundo	Kalen	F
102	Aundy	Kalen	F
102	Jack	Sarz	M
110	Jack	Sarz	M
110	Kack	Sarz	M
;

proc sql;
create table k as
select a.Record,a.FirstName,b.FirstName as _FirstName,spedis(a.FirstName,b.FirstName) as dis
 from have as a,have as b
  where a.Record=b.Record and a.FirstName ne b.FirstName 
 ;
quit;
data k2;
 set k;
 call sortc(FirstName,_FirstName);
run;
proc sort data=k2 nodupkey;by Record FirstName _FirstName;run;
data k3;
 set k2;
 if dis&amp;lt;50 then group=1;
  else group=dis;
 output;

 if dis&amp;lt;50 then group=1;
  else group=dis+1;
 FirstName=_FirstName;output;
 keep Record FirstName group;
run;
proc sort data=k3 out=k4 ;by Record FirstName group;run;
proc sort data=k4 nodupkey ;by Record FirstName;run;
proc sql;
create table temp as
select a.*,b.group
 from have as a left join k4 as b
  on a.Record=b.Record and a.FirstName=b.FirstName;
quit;
proc sort data=temp out=want nodupkey;
by Record group;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
    <pubDate>Wed, 05 Apr 2023 11:20:27 GMT</pubDate>
    <dc:creator>Ksharp</dc:creator>
    <dc:date>2023-04-05T11:20:27Z</dc:date>
    <item>
      <title>How to clean duplicate records based on minor typos</title>
      <link>https://communities.sas.com/t5/SAS-Programming/How-to-clean-duplicate-records-based-on-minor-typos/m-p/868025#M342838</link>
      <description>&lt;P&gt;HI,&lt;/P&gt;&lt;P&gt;I have a dataset that looks like the following. There are duplicate record IDs due to minor typos in the firstname and lastname variables.&amp;nbsp;&lt;/P&gt;&lt;TABLE border="1"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD&gt;Record&lt;/TD&gt;&lt;TD&gt;FirstName&lt;/TD&gt;&lt;TD&gt;LastName&lt;/TD&gt;&lt;TD&gt;Gender&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;009&lt;/TD&gt;&lt;TD&gt;Jason&lt;/TD&gt;&lt;TD&gt;Aman&lt;/TD&gt;&lt;TD&gt;M&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;009&lt;/TD&gt;&lt;TD&gt;Jazon&lt;/TD&gt;&lt;TD&gt;Aman&lt;/TD&gt;&lt;TD&gt;M&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;100&lt;/TD&gt;&lt;TD&gt;Marshall&lt;/TD&gt;&lt;TD&gt;Reid&lt;/TD&gt;&lt;TD&gt;F&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;101&lt;/TD&gt;&lt;TD&gt;Reed&lt;/TD&gt;&lt;TD&gt;Jones&lt;/TD&gt;&lt;TD&gt;M&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;102&lt;/TD&gt;&lt;TD&gt;Aundo&lt;/TD&gt;&lt;TD&gt;Kalen&lt;/TD&gt;&lt;TD&gt;F&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;102&lt;/TD&gt;&lt;TD&gt;Aundy&lt;/TD&gt;&lt;TD&gt;Kalen&lt;/TD&gt;&lt;TD&gt;F&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;110&lt;/TD&gt;&lt;TD&gt;Jack&lt;/TD&gt;&lt;TD&gt;Sarz&lt;/TD&gt;&lt;TD&gt;M&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;I would like to fix these duplicates, so there's only one line per record number. I've started fixing them manually, but there are at least 700 observations. This is what I want the dataset to look like:&lt;/P&gt;&lt;TABLE border="1"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD&gt;Record&lt;/TD&gt;&lt;TD&gt;FirstName&lt;/TD&gt;&lt;TD&gt;LastName&lt;/TD&gt;&lt;TD&gt;Gender&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;009&lt;/TD&gt;&lt;TD&gt;Jason&lt;/TD&gt;&lt;TD&gt;Aman&lt;/TD&gt;&lt;TD&gt;M&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;100&lt;/TD&gt;&lt;TD&gt;Marshall&lt;/TD&gt;&lt;TD&gt;Reid&lt;/TD&gt;&lt;TD&gt;F&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;101&lt;/TD&gt;&lt;TD&gt;Reed&lt;/TD&gt;&lt;TD&gt;Jones&lt;/TD&gt;&lt;TD&gt;M&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;102&lt;/TD&gt;&lt;TD&gt;Aundy&lt;/TD&gt;&lt;TD&gt;Kalen&lt;/TD&gt;&lt;TD&gt;F&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD&gt;110&lt;/TD&gt;&lt;TD&gt;Jack&lt;/TD&gt;&lt;TD&gt;Sarz&lt;/TD&gt;&lt;TD&gt;M&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Before continuing with the manual approach, I wanted to check and see if anyone had ideas of a are more programmatic way to fix these duplicate names?&lt;/P&gt;</description>
      <pubDate>Tue, 04 Apr 2023 16:47:31 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/How-to-clean-duplicate-records-based-on-minor-typos/m-p/868025#M342838</guid>
      <dc:creator>newtosas34</dc:creator>
      <dc:date>2023-04-04T16:47:31Z</dc:date>
    </item>
    <item>
      <title>Re: How to clean duplicate records based on minor typos</title>
      <link>https://communities.sas.com/t5/SAS-Programming/How-to-clean-duplicate-records-based-on-minor-typos/m-p/868043#M342853</link>
      <description>&lt;P&gt;When there are duplicates, which of the duplicate records should be kept?&lt;/P&gt;</description>
      <pubDate>Tue, 04 Apr 2023 17:44:22 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/How-to-clean-duplicate-records-based-on-minor-typos/m-p/868043#M342853</guid>
      <dc:creator>PaigeMiller</dc:creator>
      <dc:date>2023-04-04T17:44:22Z</dc:date>
    </item>
    <item>
      <title>Re: How to clean duplicate records based on minor typos</title>
      <link>https://communities.sas.com/t5/SAS-Programming/How-to-clean-duplicate-records-based-on-minor-typos/m-p/868054#M342856</link>
      <description>&lt;P&gt;This varies depending on ID. I guess the one that doesn't' have a typo....but is that even possible to specify in SAS?&lt;/P&gt;</description>
      <pubDate>Tue, 04 Apr 2023 18:57:56 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/How-to-clean-duplicate-records-based-on-minor-typos/m-p/868054#M342856</guid>
      <dc:creator>newtosas34</dc:creator>
      <dc:date>2023-04-04T18:57:56Z</dc:date>
    </item>
    <item>
      <title>Re: How to clean duplicate records based on minor typos</title>
      <link>https://communities.sas.com/t5/SAS-Programming/How-to-clean-duplicate-records-based-on-minor-typos/m-p/868056#M342858</link>
      <description>&lt;BLOCKQUOTE&gt;&lt;HR /&gt;&lt;a href="https://communities.sas.com/t5/user/viewprofilepage/user-id/441515"&gt;@newtosas34&lt;/a&gt;&amp;nbsp;wrote:&lt;BR /&gt;
&lt;P&gt;This varies depending on ID. I guess the one that doesn't' have a typo....but is that even possible to specify in SAS?&lt;/P&gt;
&lt;HR /&gt;&lt;/BLOCKQUOTE&gt;
&lt;P&gt;There isn't any function in SAS to determine which name is a typo and which isn't. It's probably something you have to do manually (but really, how does that even work, how do you know the person's name isn't really Jazon?)&lt;/P&gt;</description>
      <pubDate>Tue, 04 Apr 2023 19:00:28 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/How-to-clean-duplicate-records-based-on-minor-typos/m-p/868056#M342858</guid>
      <dc:creator>PaigeMiller</dc:creator>
      <dc:date>2023-04-04T19:00:28Z</dc:date>
    </item>
    <item>
      <title>Re: How to clean duplicate records based on minor typos</title>
      <link>https://communities.sas.com/t5/SAS-Programming/How-to-clean-duplicate-records-based-on-minor-typos/m-p/868078#M342864</link>
      <description>&lt;P&gt;If there is no business rule you can apply to identify a typo (and really there isn't) then you can't code for that. The best you can do is maybe choose arbitrarily one of the names and use that across all instances for that customer.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 04 Apr 2023 20:33:37 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/How-to-clean-duplicate-records-based-on-minor-typos/m-p/868078#M342864</guid>
      <dc:creator>SASKiwi</dc:creator>
      <dc:date>2023-04-04T20:33:37Z</dc:date>
    </item>
    <item>
      <title>Re: How to clean duplicate records based on minor typos</title>
      <link>https://communities.sas.com/t5/SAS-Programming/How-to-clean-duplicate-records-based-on-minor-typos/m-p/868088#M342874</link>
      <description>&lt;P&gt;Let's not forget, that some people really do have unusual spellings in their name. Basketball player Micheal (not Michael) Ray Richardson.&lt;/P&gt;</description>
      <pubDate>Tue, 04 Apr 2023 21:18:19 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/How-to-clean-duplicate-records-based-on-minor-typos/m-p/868088#M342874</guid>
      <dc:creator>PaigeMiller</dc:creator>
      <dc:date>2023-04-04T21:18:19Z</dc:date>
    </item>
    <item>
      <title>Re: How to clean duplicate records based on minor typos</title>
      <link>https://communities.sas.com/t5/SAS-Programming/How-to-clean-duplicate-records-based-on-minor-typos/m-p/868194#M342936</link>
      <description>&lt;PRE&gt;&lt;CODE class=" language-sas"&gt;/*
It is fuzz matched problem.
*/
data have;
infile cards expandtabs;
input (Record	FirstName	LastName	Gender) ($);
cards;
009	Jason	Aman	M
009	Jazon	Aman	M
100	Marshall	Reid	F
101	Reed	Jones	M
102	Aundo	Kalen	F
102	Aundy	Kalen	F
102	Jack	Sarz	M
110	Jack	Sarz	M
110	Kack	Sarz	M
;

proc sql;
create table k as
select a.Record,a.FirstName,b.FirstName as _FirstName,spedis(a.FirstName,b.FirstName) as dis
 from have as a,have as b
  where a.Record=b.Record and a.FirstName ne b.FirstName 
 ;
quit;
data k2;
 set k;
 call sortc(FirstName,_FirstName);
run;
proc sort data=k2 nodupkey;by Record FirstName _FirstName;run;
data k3;
 set k2;
 if dis&amp;lt;50 then group=1;
  else group=dis;
 output;

 if dis&amp;lt;50 then group=1;
  else group=dis+1;
 FirstName=_FirstName;output;
 keep Record FirstName group;
run;
proc sort data=k3 out=k4 ;by Record FirstName group;run;
proc sort data=k4 nodupkey ;by Record FirstName;run;
proc sql;
create table temp as
select a.*,b.group
 from have as a left join k4 as b
  on a.Record=b.Record and a.FirstName=b.FirstName;
quit;
proc sort data=temp out=want nodupkey;
by Record group;
run;&lt;/CODE&gt;&lt;/PRE&gt;</description>
      <pubDate>Wed, 05 Apr 2023 11:20:27 GMT</pubDate>
      <guid>https://communities.sas.com/t5/SAS-Programming/How-to-clean-duplicate-records-based-on-minor-typos/m-p/868194#M342936</guid>
      <dc:creator>Ksharp</dc:creator>
      <dc:date>2023-04-05T11:20:27Z</dc:date>
    </item>
  </channel>
</rss>

