BookmarkSubscribeRSS Feed
France
Quartz | Level 8

dear all,

 

why some of the observation cannot be matched?

I am trying to clean the company name and merge two excel based on the cleaned company name.

 

I process the excel Step5.DATASTREAM_GB2  by following codes,

data Step3.Datastream_gb_export_2nd;
set Step5.Datastream_gb2;
COMPANY_NAME = prxChange("s/,?\s*(incorporated|incorporation|INCORP|COMPANY|inco|inc|corporation|corp|limited|ltd|plc|ag|s\.a\.|sa)\.?\s*$//io",1,COMPANY_NAME);
COMPANY_NAME_1 = prxChange("s/,?\s*(incorporated|incorporation|INCORP|COMPANY|inco|inc|corporation|corp|limited|ltd|plc|ag|s\.a\.|sa)\.?\s*$//io",1,COMPANY_NAME_1);
COMPANY_NAME___SHORT = prxChange("s/,?\s*(incorporated|incorporation|INCORP|COMPANY|inco|inc|corporation|corp|limited|ltd|plc|ag|s\.a\.|sa)\.?\s*$//io",1,COMPANY_NAME___SHORT);
PREVIOUS_NAME = prxChange("s/,?\s*(incorporated|incorporation|INCORP|COMPANY|inco|inc|corporation|corp|limited|ltd|plc|ag|s\.a\.|sa)\.?\s*$//io",1,PREVIOUS_NAME);
run;

data Step3.Datastream_gb_export_2nd;
  set Step3.Datastream_gb_export_2nd;
  if GEOGRAPHIC_DESCR_='NETHERLANDS' then do;
    COMPANY_NAME   = prxchange('s/(NV|N\.V\.)$//',1,trim(COMPANY_NAME  ));
    COMPANY_NAME_1 = prxchange('s/(NV|N\.V\.)$//',1,trim(COMPANY_NAME_1));
	COMPANY_NAME___SHORT = prxchange('s/(NV|N\.V\.)$//',1,trim(COMPANY_NAME___SHORT));
	PREVIOUS_NAME = prxchange('s/(NV|N\.V\.)$//',1,trim(PREVIOUS_NAME));
  end;
  if GEOGRAPHIC_DESCR_='SWEDEN' then do;
    COMPANY_NAME   = prxchange('s/(AB)$//',1,trim(COMPANY_NAME  ));
    COMPANY_NAME_1 = prxchange('s/(AB)$//',1,trim(COMPANY_NAME_1));
	COMPANY_NAME___SHORT = prxchange('s/(AB)$//',1,trim(COMPANY_NAME___SHORT));
	PREVIOUS_NAME = prxchange('s/(AB)$//',1,trim(PREVIOUS_NAME));
  end;
   if GEOGRAPHIC_DESCR_='FINLAND' then do;
    COMPANY_NAME   = prxchange('s/(AB|OYJ)$//',1,trim(COMPANY_NAME  ));
    COMPANY_NAME_1 = prxchange('s/(AB|OYJ)$//',1,trim(COMPANY_NAME_1));
	COMPANY_NAME___SHORT = prxchange('s/(AB|OYJ)$//',1,trim(COMPANY_NAME___SHORT));
	PREVIOUS_NAME = prxchange('s/(AB|OYJ)$//',1,trim(PREVIOUS_NAME));
  end;
  if GEOGRAPHIC_DESCR_='ITALY' then do;
    COMPANY_NAME   = prxchange('s/(SPA|S\.P\.A\.)$//',1,trim(COMPANY_NAME  ));
    COMPANY_NAME_1 = prxchange('s/(SPA|S\.P\.A\.)$//',1,trim(COMPANY_NAME_1));
	COMPANY_NAME___SHORT = prxchange('s/(SPA|S\.P\.A\.)$//',1,trim(COMPANY_NAME___SHORT));
	PREVIOUS_NAME = prxchange('s/(SPA|S\.P\.A\.)$//',1,trim(PREVIOUS_NAME));
  end;
    if GEOGRAPHIC_DESCR_='NORWAY' then do;
    COMPANY_NAME   = prxchange('s/(ASA)$//',1,trim(COMPANY_NAME  ));
    COMPANY_NAME_1 = prxchange('s/(ASA)$//',1,trim(COMPANY_NAME_1));
	COMPANY_NAME___SHORT = prxchange('s/(ASA)$//',1,trim(COMPANY_NAME___SHORT));
	PREVIOUS_NAME = prxchange('s/(ASA)$//',1,trim(PREVIOUS_NAME));
  end;

run;

then match the processed Step5.DATASTREAM_GB2 (i.e.,Step3.Datastream_gb_export_2nd) with Step5.PATSTAT_GB2 by following codes,

data Step3.Datastream_gb_export_2nd;
set Step3.Datastream_gb_export_2nd;
run;

data Step3.Datastream_gb2;
  set Step2.Datastream_gb_export_2nd;
  rename name=psn_name;
run;

proc sort data= Step3.Datastream_gb2;
  by psn_name;
run;

proc sort data= Step5.PATSTAT_GB2;
  by psn_name;
run;

data STEP6.Datastream_Patstat1(keep=psn_name Type) step4.Patstat_gb_simple2 step3.Datastream_gb3;
   merge Step5.PATSTAT_GB2 (in=ina)
         Step3.Datastream_gb2 (in=inb)
   ;
   by psn_name;
   if ina and inb then output STEP6.Datastream_Patstat1;
   else if ina then output step4.Patstat_gb_simple2;
   else if inb then output step3.Datastream_gb3;
run;

data step3.Datastream_gb3;
  set step3.Datastream_gb3;
  drop psn_name;
  rename COMPANY_NAME=psn_name;
run;

proc sort data=step3.Datastream_gb3;
  by psn_name;
run;

proc sort data=step4.Patstat_gb_simple2;
  by psn_name;
run;

data STEP6.Datastream_Patstat2(keep=psn_name Type) step4.Patstat_gb_simple3 step3.Datastream_gb4;
   merge step4.Patstat_gb_simple2 (in=ina)
         step3.Datastream_gb3 (in=inb)
   ;
   by psn_name;
   if ina and inb then output STEP6.Datastream_Patstat2;
   else if ina then output step4.Patstat_gb_simple3;
   else if inb then output step3.Datastream_gb4;
run;

data step3.Datastream_gb4;
  set step3.Datastream_gb4;
  drop psn_name;
  rename COMPANY_NAME_1=psn_name;
run;

proc sort data=step3.Datastream_gb4;
  by psn_name;
run;

proc sort data=step4.Patstat_gb_simple3;
  by psn_name;
run;

data STEP6.Datastream_Patstat3(keep=psn_name Type) step4.Patstat_gb_simple4 step3.Datastream_gb5;
   merge step4.Patstat_gb_simple3 (in=ina)
         step3.Datastream_gb4 (in=inb)
   ;
   by psn_name;
   if ina and inb then output STEP6.Datastream_Patstat3;
   else if ina then output step4.Patstat_gb_simple4;
   else if inb then output step3.Datastream_gb5;
run;
data step3.Datastream_gb5;
  set step3.Datastream_gb5;
  drop psn_name;
  rename COMPANY_NAME___SHORT=psn_name;
run;

proc sort data=step3.Datastream_gb5;
  by psn_name;
run;

proc sort data=step4.Patstat_gb_simple4;
  by psn_name;
run;

data STEP6.Datastream_Patstat4(keep=psn_name Type) step4.Patstat_gb_simple5 step3.Datastream_gb6;
   merge step4.Patstat_gb_simple4 (in=ina)
         step3.Datastream_gb5 (in=inb)
   ;
   by psn_name;
   if ina and inb then output STEP6.Datastream_Patstat4;
   else if ina then output step4.Patstat_gb_simple5;
   else if inb then output step3.Datastream_gb6;
run;
data step3.Datastream_gb6;
  set step3.Datastream_gb6;
  drop psn_name;
  rename PREVIOUS_NAME=psn_name;
run;

proc sort data=step3.Datastream_gb6;
  by psn_name;
run;

proc sort data=step4.Patstat_gb_simple5;
  by psn_name;
run;

data STEP6.Datastream_Patstat5(keep=psn_name Type) step4.Patstat_gb_simple6 step3.Datastream_gb7;
   merge step4.Patstat_gb_simple5 (in=ina)
         step3.Datastream_gb6 (in=inb)
   ;
   by psn_name;
   if ina and inb then output STEP6.Datastream_Patstat5;
   else if ina then output step4.Patstat_gb_simple6;
   else if inb then output step3.Datastream_gb7;
run;

data step7.Datastream_patstat_match2;
set Step6.Datastream_patstat: ;
run;

however, some of the observations cannot be matched together even though they have been processed and are same. 

for example, the 'ALSTOM' in the processed table 'Step3.Datastream_gb_export_2nd'(i.e., the table has been added in the attachment as an excel'Datastream_gb_export_2nd.xlsx'), however, it cannot be matched with the same character 'ALSTOM' in the table'Step5.PATSTAT_GB2' (excel'DATASTREAM_GB2.xlsx')

 

could you please give me some suggestion about this? why I cannot merge them together?

thanks in advance.

3 REPLIES 3
mkeintz
PROC Star

If they don't match then they are not both "ALSTOM" - perhaps neither of them are.

 

Perhaps you have whitespace characters that are not true blanks.

 

I would read the data from each source, find the unmatched name, and then write out the values to the login using format $x.  (where x is the longest possible length of the name variable), and also a second time using $HEXy., where y=2x.  See if the $HEXy. values are the same from both sources.

 

data _null_;

  set unmatched_source1;

  put 'Source1: ' name=$40. / name=$hex80.;

  set unmatched_source2;

  put 'Source2: ' name=$40. / name=$hex80.;

run;

--------------------------
The hash OUTPUT method will overwrite a SAS data set, but not append. That can be costly. Consider voting for Add a HASH object method which would append a hash object to an existing SAS data set

Would enabling PROC SORT to simultaneously output multiple datasets be useful? Then vote for
Allow PROC SORT to output multiple datasets

--------------------------
France
Quartz | Level 8
dear mkeintz,

thanks for your advice.

could you please explain it more clearly. I am a beginner at SAS and not very understand. What is the meaning of 'unmatched_source', 'Source' ? could you please explain it by using the sample?

thanks in advance.
mkeintz
PROC Star

When you do the initial match you should generate 3 result data sets:

  1. The set of matched records.
  2. The unmatched records from source1   ("unmatched_source1")
  3. The unmatched records from source2  ("unmatched_source2");

 

It's the contents of data sets 2 and 3 then I recommend you examine for whitespace that is not a true blank.  For instance, you apparently have an "ALSTOM" record in both original data sets.  But they don't match, so one should end up in unmatched_source1, and the other in unmatched_source2.

 

I think in your last data step you use the names step4.Patstat_gb_simple6 and step3.Datastream_gb7.

 

 

 

 

--------------------------
The hash OUTPUT method will overwrite a SAS data set, but not append. That can be costly. Consider voting for Add a HASH object method which would append a hash object to an existing SAS data set

Would enabling PROC SORT to simultaneously output multiple datasets be useful? Then vote for
Allow PROC SORT to output multiple datasets

--------------------------

hackathon24-white-horiz.png

2025 SAS Hackathon: There is still time!

Good news: We've extended SAS Hackathon registration until Sept. 12, so you still have time to be part of our biggest event yet – our five-year anniversary!

Register Now

How to Concatenate Values

Learn how use the CAT functions in SAS to join values from multiple variables into a single value.

Find more tutorials on the SAS Users YouTube channel.

SAS Training: Just a Click Away

 Ready to level-up your skills? Choose your own adventure.

Browse our catalog!

Discussion stats
  • 3 replies
  • 932 views
  • 3 likes
  • 2 in conversation