BookmarkSubscribeRSS Feed
France
Quartz | Level 8

dear all,

 

why some of the observation cannot be matched?

I am trying to clean the company name and merge two excel based on the cleaned company name.

 

I process the excel Step5.DATASTREAM_GB2  by following codes,

data Step3.Datastream_gb_export_2nd;
set Step5.Datastream_gb2;
COMPANY_NAME = prxChange("s/,?\s*(incorporated|incorporation|INCORP|COMPANY|inco|inc|corporation|corp|limited|ltd|plc|ag|s\.a\.|sa)\.?\s*$//io",1,COMPANY_NAME);
COMPANY_NAME_1 = prxChange("s/,?\s*(incorporated|incorporation|INCORP|COMPANY|inco|inc|corporation|corp|limited|ltd|plc|ag|s\.a\.|sa)\.?\s*$//io",1,COMPANY_NAME_1);
COMPANY_NAME___SHORT = prxChange("s/,?\s*(incorporated|incorporation|INCORP|COMPANY|inco|inc|corporation|corp|limited|ltd|plc|ag|s\.a\.|sa)\.?\s*$//io",1,COMPANY_NAME___SHORT);
PREVIOUS_NAME = prxChange("s/,?\s*(incorporated|incorporation|INCORP|COMPANY|inco|inc|corporation|corp|limited|ltd|plc|ag|s\.a\.|sa)\.?\s*$//io",1,PREVIOUS_NAME);
run;

data Step3.Datastream_gb_export_2nd;
  set Step3.Datastream_gb_export_2nd;
  if GEOGRAPHIC_DESCR_='NETHERLANDS' then do;
    COMPANY_NAME   = prxchange('s/(NV|N\.V\.)$//',1,trim(COMPANY_NAME  ));
    COMPANY_NAME_1 = prxchange('s/(NV|N\.V\.)$//',1,trim(COMPANY_NAME_1));
	COMPANY_NAME___SHORT = prxchange('s/(NV|N\.V\.)$//',1,trim(COMPANY_NAME___SHORT));
	PREVIOUS_NAME = prxchange('s/(NV|N\.V\.)$//',1,trim(PREVIOUS_NAME));
  end;
  if GEOGRAPHIC_DESCR_='SWEDEN' then do;
    COMPANY_NAME   = prxchange('s/(AB)$//',1,trim(COMPANY_NAME  ));
    COMPANY_NAME_1 = prxchange('s/(AB)$//',1,trim(COMPANY_NAME_1));
	COMPANY_NAME___SHORT = prxchange('s/(AB)$//',1,trim(COMPANY_NAME___SHORT));
	PREVIOUS_NAME = prxchange('s/(AB)$//',1,trim(PREVIOUS_NAME));
  end;
   if GEOGRAPHIC_DESCR_='FINLAND' then do;
    COMPANY_NAME   = prxchange('s/(AB|OYJ)$//',1,trim(COMPANY_NAME  ));
    COMPANY_NAME_1 = prxchange('s/(AB|OYJ)$//',1,trim(COMPANY_NAME_1));
	COMPANY_NAME___SHORT = prxchange('s/(AB|OYJ)$//',1,trim(COMPANY_NAME___SHORT));
	PREVIOUS_NAME = prxchange('s/(AB|OYJ)$//',1,trim(PREVIOUS_NAME));
  end;
  if GEOGRAPHIC_DESCR_='ITALY' then do;
    COMPANY_NAME   = prxchange('s/(SPA|S\.P\.A\.)$//',1,trim(COMPANY_NAME  ));
    COMPANY_NAME_1 = prxchange('s/(SPA|S\.P\.A\.)$//',1,trim(COMPANY_NAME_1));
	COMPANY_NAME___SHORT = prxchange('s/(SPA|S\.P\.A\.)$//',1,trim(COMPANY_NAME___SHORT));
	PREVIOUS_NAME = prxchange('s/(SPA|S\.P\.A\.)$//',1,trim(PREVIOUS_NAME));
  end;
    if GEOGRAPHIC_DESCR_='NORWAY' then do;
    COMPANY_NAME   = prxchange('s/(ASA)$//',1,trim(COMPANY_NAME  ));
    COMPANY_NAME_1 = prxchange('s/(ASA)$//',1,trim(COMPANY_NAME_1));
	COMPANY_NAME___SHORT = prxchange('s/(ASA)$//',1,trim(COMPANY_NAME___SHORT));
	PREVIOUS_NAME = prxchange('s/(ASA)$//',1,trim(PREVIOUS_NAME));
  end;

run;

then match the processed Step5.DATASTREAM_GB2 (i.e.,Step3.Datastream_gb_export_2nd) with Step5.PATSTAT_GB2 by following codes,

data Step3.Datastream_gb_export_2nd;
set Step3.Datastream_gb_export_2nd;
run;

data Step3.Datastream_gb2;
  set Step2.Datastream_gb_export_2nd;
  rename name=psn_name;
run;

proc sort data= Step3.Datastream_gb2;
  by psn_name;
run;

proc sort data= Step5.PATSTAT_GB2;
  by psn_name;
run;

data STEP6.Datastream_Patstat1(keep=psn_name Type) step4.Patstat_gb_simple2 step3.Datastream_gb3;
   merge Step5.PATSTAT_GB2 (in=ina)
         Step3.Datastream_gb2 (in=inb)
   ;
   by psn_name;
   if ina and inb then output STEP6.Datastream_Patstat1;
   else if ina then output step4.Patstat_gb_simple2;
   else if inb then output step3.Datastream_gb3;
run;

data step3.Datastream_gb3;
  set step3.Datastream_gb3;
  drop psn_name;
  rename COMPANY_NAME=psn_name;
run;

proc sort data=step3.Datastream_gb3;
  by psn_name;
run;

proc sort data=step4.Patstat_gb_simple2;
  by psn_name;
run;

data STEP6.Datastream_Patstat2(keep=psn_name Type) step4.Patstat_gb_simple3 step3.Datastream_gb4;
   merge step4.Patstat_gb_simple2 (in=ina)
         step3.Datastream_gb3 (in=inb)
   ;
   by psn_name;
   if ina and inb then output STEP6.Datastream_Patstat2;
   else if ina then output step4.Patstat_gb_simple3;
   else if inb then output step3.Datastream_gb4;
run;

data step3.Datastream_gb4;
  set step3.Datastream_gb4;
  drop psn_name;
  rename COMPANY_NAME_1=psn_name;
run;

proc sort data=step3.Datastream_gb4;
  by psn_name;
run;

proc sort data=step4.Patstat_gb_simple3;
  by psn_name;
run;

data STEP6.Datastream_Patstat3(keep=psn_name Type) step4.Patstat_gb_simple4 step3.Datastream_gb5;
   merge step4.Patstat_gb_simple3 (in=ina)
         step3.Datastream_gb4 (in=inb)
   ;
   by psn_name;
   if ina and inb then output STEP6.Datastream_Patstat3;
   else if ina then output step4.Patstat_gb_simple4;
   else if inb then output step3.Datastream_gb5;
run;
data step3.Datastream_gb5;
  set step3.Datastream_gb5;
  drop psn_name;
  rename COMPANY_NAME___SHORT=psn_name;
run;

proc sort data=step3.Datastream_gb5;
  by psn_name;
run;

proc sort data=step4.Patstat_gb_simple4;
  by psn_name;
run;

data STEP6.Datastream_Patstat4(keep=psn_name Type) step4.Patstat_gb_simple5 step3.Datastream_gb6;
   merge step4.Patstat_gb_simple4 (in=ina)
         step3.Datastream_gb5 (in=inb)
   ;
   by psn_name;
   if ina and inb then output STEP6.Datastream_Patstat4;
   else if ina then output step4.Patstat_gb_simple5;
   else if inb then output step3.Datastream_gb6;
run;
data step3.Datastream_gb6;
  set step3.Datastream_gb6;
  drop psn_name;
  rename PREVIOUS_NAME=psn_name;
run;

proc sort data=step3.Datastream_gb6;
  by psn_name;
run;

proc sort data=step4.Patstat_gb_simple5;
  by psn_name;
run;

data STEP6.Datastream_Patstat5(keep=psn_name Type) step4.Patstat_gb_simple6 step3.Datastream_gb7;
   merge step4.Patstat_gb_simple5 (in=ina)
         step3.Datastream_gb6 (in=inb)
   ;
   by psn_name;
   if ina and inb then output STEP6.Datastream_Patstat5;
   else if ina then output step4.Patstat_gb_simple6;
   else if inb then output step3.Datastream_gb7;
run;

data step7.Datastream_patstat_match2;
set Step6.Datastream_patstat: ;
run;

however, some of the observations cannot be matched together even though they have been processed and are same. 

for example, the 'ALSTOM' in the processed table 'Step3.Datastream_gb_export_2nd'(i.e., the table has been added in the attachment as an excel'Datastream_gb_export_2nd.xlsx'), however, it cannot be matched with the same character 'ALSTOM' in the table'Step5.PATSTAT_GB2' (excel'DATASTREAM_GB2.xlsx')

 

could you please give me some suggestion about this? why I cannot merge them together?

thanks in advance.

3 REPLIES 3
mkeintz
PROC Star

If they don't match then they are not both "ALSTOM" - perhaps neither of them are.

 

Perhaps you have whitespace characters that are not true blanks.

 

I would read the data from each source, find the unmatched name, and then write out the values to the login using format $x.  (where x is the longest possible length of the name variable), and also a second time using $HEXy., where y=2x.  See if the $HEXy. values are the same from both sources.

 

data _null_;

  set unmatched_source1;

  put 'Source1: ' name=$40. / name=$hex80.;

  set unmatched_source2;

  put 'Source2: ' name=$40. / name=$hex80.;

run;

--------------------------
The hash OUTPUT method will overwrite a SAS data set, but not append. That can be costly. Consider voting for Add a HASH object method which would append a hash object to an existing SAS data set

Would enabling PROC SORT to simultaneously output multiple datasets be useful? Then vote for
Allow PROC SORT to output multiple datasets

--------------------------
France
Quartz | Level 8
dear mkeintz,

thanks for your advice.

could you please explain it more clearly. I am a beginner at SAS and not very understand. What is the meaning of 'unmatched_source', 'Source' ? could you please explain it by using the sample?

thanks in advance.
mkeintz
PROC Star

When you do the initial match you should generate 3 result data sets:

  1. The set of matched records.
  2. The unmatched records from source1   ("unmatched_source1")
  3. The unmatched records from source2  ("unmatched_source2");

 

It's the contents of data sets 2 and 3 then I recommend you examine for whitespace that is not a true blank.  For instance, you apparently have an "ALSTOM" record in both original data sets.  But they don't match, so one should end up in unmatched_source1, and the other in unmatched_source2.

 

I think in your last data step you use the names step4.Patstat_gb_simple6 and step3.Datastream_gb7.

 

 

 

 

--------------------------
The hash OUTPUT method will overwrite a SAS data set, but not append. That can be costly. Consider voting for Add a HASH object method which would append a hash object to an existing SAS data set

Would enabling PROC SORT to simultaneously output multiple datasets be useful? Then vote for
Allow PROC SORT to output multiple datasets

--------------------------

sas-innovate-2024.png

Available on demand!

Missed SAS Innovate Las Vegas? Watch all the action for free! View the keynotes, general sessions and 22 breakouts on demand.

 

Register now!

How to Concatenate Values

Learn how use the CAT functions in SAS to join values from multiple variables into a single value.

Find more tutorials on the SAS Users YouTube channel.

Click image to register for webinarClick image to register for webinar

Classroom Training Available!

Select SAS Training centers are offering in-person courses. View upcoming courses for:

View all other training opportunities.

Discussion stats
  • 3 replies
  • 536 views
  • 3 likes
  • 2 in conversation