Solved: Re: Compare two char variables

LineMoon · Posted 11-13-2019 04:43 PM

Hello Experts,

I have two variables in two tables.

The first variable, Var, in the first table , it is a big table.

The second variable, var_ref, in the second table (reference table) , it is a small table.

Output Rules:

If var =var_ref then CommonValue='equal'

if there is a common values then CommonValue = "common values betwwen var and var_ref"

if not CommonValue=''

> Big Table

Var

First one

Test number one

> Ref table

Var_Ref

one

First one

Test

number

> Output

Var	Var_Ref	Common Values
First one	one	one
First one	First one	equal
Test number one	one	one
Test number one	number	number
Test number one	Test	Test

hashman · Posted 11-15-2019 01:28 AM

@jarg:That's good. The more, the merrier 😉 ;

View solution in original post

Tom · Posted 11-13-2019 05:21 PM

None of those values are EQUAL. What is the test that you actually want to do? Are you trying to test if VAR_REF is contained in VAR? Does case matter? Does it need to match a full word? Does VAR_REF ever contain more than one word?

LineMoon · Posted 11-13-2019 05:36 PM

Thank you.

I want to test as described in the output

If the value of "VAR" is contained in VAR_Ref?

Does it contain at least one word in VAR_Ref ?

or if does not contain any word in in VAR_Ref ?

jarg · Posted 11-13-2019 08:07 PM

i'm certain there are at least 5 better ways of doing this but this may do what you're after:

options mlogic symbolgen;
data ref_table;
    infile datalines dlm=",";
    format var $50.;
    input var;
    datalines;
Var_Ref,
one,
First one,
Test,
number,
    ;
run;

data big_table;
    infile datalines dlm=",";
    format var $50.;
    input var;
    datalines;
Var,
First one,
Test number one,
;
run;

proc sql;
    create table distinct_varlist as
        select distinct(strip(var)) as ref_var
        from ref_table;
quit;

proc sql; 
    create table Big_table_distinct as 
        select distinct var 
        from big_table; 
quit;

proc sql noprint; select count(ref_var) into: distinct_varcount from distinct_varlist;

%macro iterative_check;

    %do i = 1 %to &distinct_varcount.;

        data _null_;
            p = &i.;
            set distinct_varlist point=p;
            call symputx('varlength',length(ref_var),'L');
            call symputx('ref_var_m',ref_var,'L');
            stop;
        run;

        data setall_&i.(drop=CV:);
            set Big_table_distinct;
                BigTable_Var = strip(Var);
                RefTable_Var = strip("&ref_var_m.");

             if BigTable_Var = RefTable_Var         then CV1 = "Equal";
             if indexw(BigTable_Var,RefTable_Var)   then CV2 = strip(substr(BigTable_Var,indexw(BigTable_Var,RefTable_Var),index(BigTable_Var,' ')));
             if indexw(RefTable_Var,BigTable_Var)   then CV3 = strip(substr(RefTable_Var,indexw(RefTable_Var,BigTable_Var),index(RefTable_Var,' ')+1));
             format Common_Values $50.;
                    Common_Values = coalescec(CV1,CV2,CV3);
        run;

    %end;
%mend iterative_check;
%iterative_check;

data combine;
    set setall:;
run;

proc sort nodupkey data=combine; by BigTable_Var RefTable_Var Common_Values;

data output;
    set combine;
        where Common_Values ne '';
run;

LineMoon · Posted 11-14-2019 03:46 PM

Thank you very much for your answer, That's very kind from you

hashman · Posted 11-13-2019 11:15 PM

@LineMoon:

A modicum of hash object usage should do it:

data big ;                                                                                                                              
  input @1 var $16. ;                                                                                                                   
  cards ;                                                                                                                               
first one                                                                                                                               
test number one                                                                                                                         
;                                                                                                                                       
run ;                                                                                                                                   
                                                                                                                                        
data ref ;                                                                                                                              
  input @1 var_ref $16. ;                                                                                                               
  cards ;                                                                                                                               
one                                                                                                                                     
first one                                                                                                                               
test                                                                                                                                    
number                                                                                                                                  
;                                                                                                                                       
run ;                                                                                                                                   
                                                                                                                                        
data want ;                                                                                                                             
  if _n_ = 1 then do ;                                                                                                                  
    if 0 then set big ref ;                                                                                                             
    dcl hash h (dataset:"ref") ;                                                                                                        
    h.definekey ("var_ref") ;                                                                                                           
    h.definedone () ;                                                                                                                   
  end ;                                                                                                                                 
  set big ;                                                                                                                             
  common_values = var ;                                                                                                                 
  if h.find (key:var) = 0 then do ;                                                                                                     
    common_values = "equal" ;                                                                                                           
    output ;                                                                                                                            
  end ;                                                                                                                                 
  do _n_ = 1 to countw (var) ;                                                                                                          
    common_values = scan (var, _n_) ;                                                                                                   
    if h.find (key:common_values) = 0 then output ;                                                                                    
  end ;                                                                                                                                 
run ;

Kind regards

Paul D.

LineMoon · Posted 11-14-2019 05:28 PM

Thank very much. that's very kind from you.

jarg · Posted 11-14-2019 06:48 PM

this is an elegant solution, it's probably about time i wrap my head around hash objects. kudos!

hashman · Posted 11-14-2019 08:34 PM

@jarg:

The hash object can make many things more elegant, simpler, and faster. That's why @DonH and I have written a whole book about it:

https://www.sas.com/store/books/categories/examples/data-management-solutions-using-sas-hash-table-o...

Though in this case, the concept of the program is quite simple:

store VAR_REF from REF in a lookup table
for each record from BIG:
search VAR in the table, and if found, output as "equal"
then search each word in VAR in the table, and if found, output it and the corresponding VAR_REF

The concept would remain the same if a different type of lookup table were used. However, the hash table makes the process of organizing the lookup table and searching it a breeze - and searches in O(1) time to boot. I agree that it behooves any SAS programmer to make the hash object, to a lesser or larger degree, an instrument in one's toolbox. After all, it's been an integral part of Base SAS for 16 years and counting!

Kind regards

Paul D.

jarg · Posted 11-15-2019 12:14 AM

i see i'm speaking to a resident expert then! shouldn't have expected anything less from the Hashman in hindsight.

thanks for the explanation, i was sure there'd be a more compact solution but wasn't expecting it all to be done in a singe step. i've been meaning to look into hashes as i'm working with some larger tables these days than i'm used to and need some superior merging methods.

i'd say you've just sold another copy of your book

hashman · Posted 11-15-2019 01:28 AM

@jarg:That's good. The more, the merrier 😉 ;

LineMoon · Posted 11-16-2019 07:02 AM

Please, is -it possible to show, the order of the first char there is a difference betwwen var and var_ref

Var=test one

Var_ref=test two

order_first_char_with_dif=5

hashman · Posted 11-16-2019 03:19 PM

@LineMoon: This is what the COMPARE function is for:

data want ;                                                                                                                             
  if _n_ = 1 then do ;                                                                                                                  
    if 0 then set big ref ;                                                                                                             
    dcl hash h (dataset:"ref") ;                                                                                                        
    h.definekey ("var_ref") ;                                                                                                           
    h.definedone () ;                                                                                                                   
  end ;                                                                                                                                 
  set big ;                                                                                                                             
  common_values = var ;                                                                                                                 
  do _n_ = 1 to countw (var) ;                                                                                                          
    common_values = scan (var, _n_) ;                                                                                                   
    if h.find (key:common_values) then continue ;                                                                                       
    order_first_char_with_dif = abs (compare (var_ref, var)) ;                                                                          
    output ;                                                                                                                            
  end ;                                                                                                                                 
  if h.find (key:var) = 0 then do ;                                                                                                     
    common_values = "equal" ;                                                                                                           
    order_first_char_with_dif = 0 ;                                                                                                     
    output ;                                                                                                                            
  end ;                                                                                                                                 
run ;

If you want the records in the output listed in the order exactly as you originally suggested, just swap the execution order around:

data want ;                                                                                                                             
  if _n_ = 1 then do ;                                                                                                                  
    if 0 then set big ref ;                                                                                                             
    dcl hash h (dataset:"ref") ;                                                                                                        
    h.definekey ("var_ref") ;                                                                                                           
    h.definedone () ;                                                                                                                   
  end ;                                                                                                                                 
  set big ;                                                                                                                             
  common_values = var ;                                                                                                                 
  common_values = "equal" ;                                                                                                             
  order_first_char_with_dif = 0 ;                                                                                                       
  if h.find (key:var) = 0 then output ;                                                                                                 
  do _n_ = countw (var) to 1 by -1 ;                                                                                                    
    common_values = scan (var, _n_) ;                                                                                                   
    if h.find (key:common_values) ne 0 then continue ;                                                                                       
    order_first_char_with_dif = abs (compare (var_ref, var)) ;                                                                          
    output ;                                                                                                                            
  end ;                                                                                                                                 
run ;

Kind regards

Paul D.

Registration is open

SAS Training: Just a Click Away