Bookmark | Subscribe | RSS Feed

%SASDIFF, a SAS macro for differential file comparison

Started ‎12-20-2022 by
Modified ‎12-20-2022 by
Views 905

When I developed SAS software on the UNIX operating system, I would often use the diff file comparison function to maintain code libraries. I always wondered how diff worked, so I looked up its help page and saw that it used a sliding window to compare lines of code with various filters applied. So I wrote my own version of diff, called %SASDIFF, to do many of the same things that diff does.

 

I used the POINT= option of the DATA step SET statement to read lines of one file by number, which lets each line be matched against nearby lines of the second file.

 

I share my code with the SAS community and hope that you will find it useful.

%macro SASDIFF( FILEIN1              /* name of original file to compare for differences                           */
              , FILEIN2              /* name of new      file to compare for differences                           */
              , FLOW=N               /* [optional] wrap long lines of text to show complete text                   */
              , IGNORE_WHITE_SPACE=Y /* [optional] ignore white space when comparing lines                         */
              , IGNORE_BLANK_LINES=Y /* [optional] ignore changes that just insert or delete blank lines           */
              , IGNORE_CASE=Y        /* [optional] ignore changes in case. consider UPPER and lower to be the same */
              , IGNORE_MATCHES=N     /* [optional] ignore matching lines                                           */
              , LINESIZE=            /* [optional] linesize for display of text (default=system linesize)          */
              , WINDOW=10            /* search +/- &WINDOW lines above/below current line for match                */
              ) ;

   /* PURPOSE: perform differential file comparison using a subset of UNIX sdiff commands
    *
    * OUTLINE: 1. read each file into a work dataset (dsn1, dsn2), one observation per line,
    *             keeping the raw text (text1/text2) and a normalized comparison key (line1/line2)
    *          2. for every line of one file, look for an equal, not yet used key in the other
    *             file within +/- &WINDOW line positions (datasets diff1, diff2)
    *          3. interleave diff1 and diff2 into a side-by-side dataset (diff12)
    *          4. label each row and print it with PROC REPORT
    *
    * NOTE:    &FILEIN1 and &FILEIN2 must be text strings containing path/filename
    *
    * NOTE:    the Y/N flags are true when the value contains the letter Y in either case
    *
    * NOTE:    &WINDOW represents the maximum number of lines to look before or after the current line
    *          in the original file being matched in the new file, or vice-versa.
    *          &WINDOW ought to be adjusted to the value displayed in the log.
    *          it may be necessary to run %SASDIFF multiple times until &WINDOW remains constant between runs.
    *          &WINDOW is used as a loop bound and is expected to be a non-negative integer.
    *
    * NOTE:    the labelling convention for indicating source of text from &FILEIN1 or &FILEIN2 is:
    *          '<' indicates that line of text comes from &FILEIN1 and not &FILEIN2
    *          '>'                                        &FILEIN2 and not &FILEIN1
    *          '|'                                        &FILEIN1 and     &FILEIN2 but is changed
    *          ' '                                        &FILEIN1 and     &FILEIN2 and is identical
    *
    * NOTE:    work datasets diff, diff1, diff2, diff12, dsn1, dsn2 and filerefs file1, file2
    *          are used by the macro. the datasets are deleted on exit.
    *
    * EXAMPLE OF USE:
    *    %let FILE1 = C:\Documents and Settings\rosbet\My Documents\My SAS Files\9.1\Macros\sasdiff_v0.sas ;
    *    %let FILE2 = C:\Documents and Settings\rosbet\My Documents\My SAS Files\9.1\Macros\sasdiff_v1.sas ;
    *
    *    %SASDIFF( &FILE1, &FILE2, ignore_blank_lines=n, ignore_matches=y )
    */

   /* keep every working macro variable local so that same-named variables
    * in the calling environment are never overwritten
    */

   %local FLG_FLOW FLG_IGNORE_MATCHES FLG_IGNORE_BLANK_LINES FLG_IGNORE_CASE FLG_IGNORE_WHITE_SPACE
          FNAME1 FNAME2 N_OBS1 N_OBS2 MAXLEN1 MAXLEN2 MAXLEN MAXOBS COLWIDTH WIDTH ;

   /* process macro parameter flags: non-zero (true) when the upcased value contains Y.
    * the upcased value is the FIRST argument of %index and Y is the second.
    */

   %let FLG_FLOW               = %index(%upcase(&FLOW),Y) ;
   %let FLG_IGNORE_MATCHES     = %index(%upcase(&IGNORE_MATCHES),Y) ;
   %let FLG_IGNORE_BLANK_LINES = %index(%upcase(&IGNORE_BLANK_LINES),Y) ;
   %let FLG_IGNORE_CASE        = %index(%upcase(&IGNORE_CASE),Y) ;
   %let FLG_IGNORE_WHITE_SPACE = %index(%upcase(&IGNORE_WHITE_SPACE),Y) ;

   /*############################################################################*/
   /* begin executable statements                                                */
   /*############################################################################*/

   /* clear pre-existing file references, assign new ones.
    * fileref() returns a positive value when the fileref is not assigned,
    * so the clear is skipped in that case to avoid a spurious warning.
    */

   %if %sysfunc( fileref( FILE1 )) <= 0 %then %do ; filename file1 clear ; %end ;
   %if %sysfunc( fileref( FILE2 )) <= 0 %then %do ; filename file2 clear ; %end ;

   filename file1 "&FILEIN1" ;
   filename file2 "&FILEIN2" ;

   /* extract the bare file names (text after the last / or \) for the report column headers */

   data _null_ ;
      length fname1 fname2 $ 80 ;

      fname1 = reverse( scan( reverse( "&FILEIN1" ), 1, '/\' )) ;
      fname2 = reverse( scan( reverse( "&FILEIN2" ), 1, '/\' )) ;

      call symput( 'FNAME1', fname1 ) ;
      call symput( 'FNAME2', fname2 ) ;
   run ;

   /* verify validity of physical filename. err-out if non-zero return code */

   %if %sysfunc( fileref( FILE1 )) %then %goto ERROR_PATH1 ;
   %if %sysfunc( fileref( FILE2 )) %then %goto ERROR_PATH2 ;

   /*==================================================================*/
   /* read lines of text into SAS dataset format                       */
   /* apply parameter settings to text                                 */
   /*==================================================================*/

   data dsn1 ;
      length line1 text1 $ 32767 ;
      retain maxreclen -1 ;

      infile FILE1 eof=lastobs length=reclen lrecl=32767 truncover ;

      input text1 $varying32767. reclen ;

      /* subsetting IF: blank lines are dropped before they are numbered */
      %if &FLG_IGNORE_BLANK_LINES %then %do ; if lengthn( compress( text1,, 's' )) ; %end ;

      linenum1 + 1 ;

      /* line1 is the comparison key, text1 keeps the text as read */
      %if &FLG_IGNORE_CASE %then %do ; line1 = upcase( text1 ) ; %end ;
      %else                      %do ; line1 = text1 ;           %end ;

      %if &FLG_IGNORE_WHITE_SPACE %then %do ; line1 = compress( line1,, 's' ) ; %end ;

      maxreclen = max( maxreclen, reclen ) ;

      return ;

LASTOBS:
      /* reached through eof= : publish the line count and the longest record length */
      call symput( 'N_OBS1' , put( linenum1 , best12. )) ;

      call symput( 'MAXLEN1', put( maxreclen, 5. )) ;

      drop maxreclen ;
   run ;

   data dsn2 ;
      length line2 text2 $ 32767 ;
      retain maxreclen -1 ;

      infile FILE2 eof=lastobs length=reclen lrecl=32767 truncover ;

      input text2 $varying32767. reclen ;

      /* subsetting IF: blank lines are dropped before they are numbered */
      %if &FLG_IGNORE_BLANK_LINES %then %do ; if lengthn( compress( text2,, 's' )) ; %end ;

      linenum2 + 1 ;

      /* line2 is the comparison key, text2 keeps the text as read */
      %if &FLG_IGNORE_CASE %then %do ; line2 = upcase( text2 ) ; %end ;
      %else                      %do ; line2 = text2 ;           %end ;

      %if &FLG_IGNORE_WHITE_SPACE %then %do ; line2 = compress( line2,, 's' ) ; %end ;

      maxreclen = max( maxreclen, reclen ) ;

      return ;

LASTOBS:
      /* reached through eof= : publish the line count and the longest record length */
      call symput( 'N_OBS2' , put( linenum2 , best12. )) ;

      call symput( 'MAXLEN2', put( maxreclen, 5. )) ;

      drop maxreclen ;
   run ;

   /*==================================================================*/
   /* perform differential file comparison                             */
   /*                                                                  */
   /* note: PROC REPORT linesize must be between 64 and 256            */
   /*==================================================================*/

   /* floor at 1: MAXLEN is later used as a character variable length, which cannot be 0 or negative */
   %let MAXLEN = %sysfunc( max( 1, &MAXLEN1, &MAXLEN2 )) ;
   %let MAXOBS = %eval( &N_OBS1 + &N_OBS2 ) ; /* if both dsn1 and dsn2 are completely disjoint */

   data diff1( keep= linenum1 line_dsn1 line_dsn2 line1 text1 ) diff2( keep= linenum2 line_dsn1 line_dsn2 line2 text2 ) ;
      /* match1[ i ] / match2[ j ] flag lines of dsn1 / dsn2 already claimed by a match */
      array match1[ &MAXOBS ] _temporary_ ( &MAXOBS * 0 ) ;
      array match2[ &MAXOBS ] _temporary_ ( &MAXOBS * 0 ) ;

      /* find subset of dsn2 in dsn1 */

      do i = 1 to &N_OBS1 ;
         line_dsn1 = i ;

         set dsn1 point=i ; /* read line1, text1 from dsn1 into Program Data Vector */

         line_dsn2 = . ; /* assume no match */

         /* a match requires abs( i - j ) <= &WINDOW, so only that range of dsn2 is read */

         do j = max( 1, i - &WINDOW ) to min( &N_OBS2, i + &WINDOW ) ;
            if ^match2[ j ] /* check only dsn2 lines that have not been previously matched */
            then do ;
               set dsn2 point=j ; /* add line2, text2 from dsn2 to PDV */

               /* test for equality. if match, save respective line positns, leave loop */

               if line1 = line2 then do ; line_dsn2 = j ; match2[ j ] = 1 ; leave ; end ;
            end ;
         end ;

         output diff1 ;
      end ;

      /* find subset of dsn1 in dsn2 */

      do j = 1 to &N_OBS2 ;
         line_dsn2 = j ;

         set dsn2 point=j ;

         line_dsn1 = . ; /* assume no match */

         do i = max( 1, j - &WINDOW ) to min( &N_OBS1, j + &WINDOW ) ;
            if ^match1[ i ] /* check only dsn1 lines that have not been previously matched */
            then do ;
               set dsn1 point=i ;

               if line1 = line2 then do ; line_dsn1 = i ; match1[ i ] = 1 ; leave ; end ;
            end ;
         end ;

         output diff2 ;
      end ;

      stop ; /* to prevent infinite loop since data step iterates */
   run ;

   /* if blank line, set pointer to missing to force inclusion in output dataset */

   data diff1 ; set diff1 ; if ^lengthn( text1 ) then line_dsn2 = . ; run ;
   data diff2 ; set diff2 ; if ^lengthn( text2 ) then line_dsn1 = . ; run ;

   /* interleave diff1 and diff2 into one side-by-side dataset.
    * rec_ptr1 / rec_ptr2 are the current read positions in diff1 / diff2.
    * state encodes which side lacks a match pointer:
    *    tens digit = 1 when the diff1 record has no partner in file 2
    *    ones digit = 1 when the diff2 record has no partner in file 1
    */

   data diff12 ;
      retain rec_ptr1 rec_ptr2 1 ;

      do until( lastobs1 & lastobs2 ) ;
         /* read a record from diff1, diff2
          * determine match or non-matching subset
          * output
          */

         if ^lastobs1 then set diff1( rename=( line_dsn2=diff1_line_dsn2 )) point = rec_ptr1 ;

         if ^lastobs2 then set diff2( rename=( line_dsn1=diff2_line_dsn1 )) point = rec_ptr2 ;

         state = 10 * missing( diff1_line_dsn2 ) + missing( diff2_line_dsn1 ) ;

         /* a pointer only advances while its dataset is not exhausted ( + ^lastobs ) */

         select( state ) ;
            when( 00 ) do ; rec_ptr1 + ^lastobs1 ; rec_ptr2 + ^lastobs2      ; output ; end ; /*  text1 &  text2 */
            when( 01 ) do ; linenum1 = . ; text1 = '' ; rec_ptr2 + ^lastobs2 ; output ; end ; /* ^text1 &  text2 */
            when( 10 ) do ; linenum2 = . ; text2 = '' ; rec_ptr1 + ^lastobs1 ; output ; end ; /*  text1 & ^text2 */
            when( 11 ) do ;                                                                   /* ^text1 & ^text2 */
               rec_ptr1 + ^lastobs1 ; rec_ptr2 + ^lastobs2 ;

               /* lag() returns the value from the previous execution of this branch.
                * NOTE(review): presumably this blanks a side that is repeated because
                * its dataset is already exhausted - confirm
                */
               text1 = ifc( text1 = lag( text1 ), '', text1 ) ;
               text2 = ifc( text2 = lag( text2 ), '', text2 ) ;

               output ;
            end ;
            otherwise putlog '>>> ERROR: ought not to occur' ;
         end ;

         lastobs1 = rec_ptr1 > &N_OBS1 ; lastobs2 = rec_ptr2 > &N_OBS2 ;

         /* compute max size of comparison window */

         if ^missing( diff1_line_dsn2 ) & ^missing( diff2_line_dsn1 )
         then max_diff = max(  abs( diff1_line_dsn2 - diff2_line_dsn1 ), max_diff ) ;
      end ;

      putlog 'NOTE: Maximum difference for comparison window = ' max_diff ;

      stop ;
   run ;

   /*==================================================================*/
   /* report results of comparing original to new file                 */
   /*==================================================================*/

   /* width of the line number columns = number of digits in &MAXOBS.
    * ( ceil( log10( n )) is 0 for n = 1 and one digit short for exact powers of 10 )
    */

   %let COLWIDTH = %length( &MAXOBS ) ;

   data diff ;
      length lineno1 lineno2 $ &COLWIDTH source $ 1 text1 text2 $ &MAXLEN ;

      do i = 1 to n_obs ;
         set diff12( keep= linenum: line: text: ) nobs=n_obs point = i ;

         /* classify the row by which side carries text and whether the raw text agrees */

         if  lengthn( text1 ) & ^lengthn( text2 )                      then source = '<' ; else
         if ^lengthn( text1 ) &  lengthn( text2 )                      then source = '>' ; else
         if  lengthn( text1 ) &  lengthn( text2 ) & ( text1  = text2 ) then source = ' ' ; else
         if  lengthn( text1 ) &  lengthn( text2 ) & ( text1 ^= text2 ) then source = '|' ;

         /* a blank source marks a row without a difference. skip it when requested */
         %if &FLG_IGNORE_MATCHES %then %do ; if missing( source ) then continue ; %end ;

         lineno1 = ifc( source = '>', ' ', put( linenum1, &COLWIDTH.. )) ;
         lineno2 = ifc( source = '<', ' ', put( linenum2, &COLWIDTH.. )) ;

         output ;
      end ;

      stop ;

      drop linenum: ;
   run ;

   /* from here on &FLOW holds the PROC REPORT keyword ( flow ) or nothing */

   %if &FLG_FLOW %then %let FLOW = flow ; %else %let FLOW = ;

   %if ^%length( &LINESIZE ) %then %let LINESIZE = %sysfunc( getoption( LS )) ;

   /* "- 5" because spacing=1 btwn 4 columns and source char requires 1 column */

   %let WIDTH = %sysfunc( min( &MAXLEN, %sysevalf( .5 *( &LINESIZE - 2 * &COLWIDTH - 5 ), integer ))) ;

   title '%SASDIFF Differential File Comparison' ;
   proc report data=diff headline headskip ls=&LINESIZE nowindows spacing=1 split='`' ;
      column lineno1 text1 source text2 lineno2 ;

      define lineno1  / display width=&COLWIDTH    " "              ;
      define text1    / display &FLOW width=&WIDTH "%trim(&FNAME1)" ;
      define source   / display       width=1      " "              ;
      define text2    / display &FLOW width=&WIDTH "%trim(&FNAME2)" ;
      define lineno2  / display width=&COLWIDTH    " "              ;
   run ;
   title ;

   %goto L9999 ;

%ERROR_PATH1:
   %put %sysfunc( sysmsg()) ;
   %goto L9999 ;

%ERROR_PATH2:
   %put %sysfunc( sysmsg()) ;
   %goto L9999 ;

%L9999: /* exit the macro */

   /* nowarn: on the error paths none of the work datasets exist yet */

   proc datasets library=work nolist nowarn ; delete diff diff1 diff2 diff12 dsn1 dsn2 ; quit ;

%mend SASDIFF ;
Version history
Last update:
‎12-20-2022 04:15 PM
Updated by:
Contributors

sas-innovate-2024.png

Available on demand!

Missed SAS Innovate Las Vegas? Watch all the action for free! View the keynotes, general sessions and 22 breakouts on demand.

 

Register now!

Free course: Data Literacy Essentials

Data Literacy is for all, even absolute beginners. Jump on board with this free e-learning course and boost your career prospects.

Get Started

Article Tags