SAS Communities Library

We’re smarter together. Learn from this collection of community knowledge and add your expertise.
BookmarkSubscribeRSS Feed

%SASDIFF, a SAS macro for differential file comparison

Started ‎12-20-2022 by
Modified ‎12-20-2022 by
Views 1,816

When I developed SAS software on the UNIX operating system, I would often use the diff file comparison function to maintain code libraries. I always wondered how diff worked, so I looked up its help page and saw that it used a sliding window to compare lines of code with various filters applied. So I wrote my own version of diff, called %SASDIFF, to do many of the same things that diff does.

 

I used the DATA step point= option on the set statement to match lines of one file with lines of a second file.

 

I share my code with the SAS community and hope that you will find it useful.

%macro SASDIFF( FILEIN1              /* name of original file to compare for differences                           */
              , FILEIN2              /* name of new      file to compare for differences                           */
              , FLOW=N               /* [optional] wrap long lines of text to show complete text                   */
              , IGNORE_WHITE_SPACE=Y /* [optional] ignore white space when comparing lines                         */
              , IGNORE_BLANK_LINES=Y /* [optional] ignore changes that just insert or delete blank lines           */
              , IGNORE_CASE=Y        /* [optional] ignore changes in case. consider UPPER and lower to be the same */
              , IGNORE_MATCHES=N     /* [optional] ignore matching lines                                           */
              , LINESIZE=            /* [optional] linesize for display of text (default=system linesize)          */
              , WINDOW=10            /* search +/- &WINDOW lines above/below current line for match                */
              ) ;

   /* PURPOSE: perform differential file comparison using a subset of UNIX sdiff commands
    *
    * NOTE:    &FILEIN1 and &FILEIN2 must be text strings containing path/filename
    *
    * NOTE:    &WINDOW represents the maximum number of lines to look before or after the current line
    *          in the original file being matched in the new file, or vice-versa.
    *          &WINDOW ought to be adjusted to the value displayed in the log.
    *          it may be necessary to run %SASDIFF multiple times until &WINDOW remains constant between runs.
    *
    * NOTE:    the labelling convention for indicating source of text from &FILEIN1 or &FILEIN2 is:
    *          '<' indicates that line of text comes from &FILEIN1 and not &FILEIN2
    *          '>'                                        &FILEIN2 and not &FILEIN1
    *          '|'                                        &FILEIN1 and     &FILEIN2 but is changed
    *          ' '                                        &FILEIN1 and     &FILEIN2 and is identical 
    *
    * EXAMPLE OF USE:
    *    %let FILE1 = C:\Documents and Settings\rosbet\My Documents\My SAS Files\9.1\Macros\sasdiff_v0.sas ;
    *    %let FILE2 = C:\Documents and Settings\rosbet\My Documents\My SAS Files\9.1\Macros\sasdiff_v1.sas ;
    *
    *    %SASDIFF( &FILE1, &FILE2, ignore_blank_lines=n, ignore_matches=y )
    */

   /* process macro parameter flags */

   %let FLG_FLOW               = %index( %upcase( &FLOW              , Y )) ;
   %let FLG_IGNORE_MATCHES     = %index( %upcase( &IGNORE_MATCHES    , Y )) ;
   %let FLG_IGNORE_BLANK_LINES = %index( %upcase( &IGNORE_BLANK_LINES, Y )) ;
   %let FLG_IGNORE_CASE        = %index( %upcase( &IGNORE_CASE       , Y )) ;
   %let FLG_IGNORE_WHITE_SPACE = %index( %upcase( &IGNORE_WHITE_SPACE, Y )) ;

   /*############################################################################*/
   /* begin executable statements
   /*############################################################################*/

   /* clear pre-existing file references, assign new ones */

   filename file1 clear ;
   filename file2 clear ;

   filename file1 "&FILEIN1" ;
   filename file2 "&FILEIN2" ;

   data _null_ ;
      length fname1 fname2 $ 80 ;

      fname1 = reverse( scan( reverse( "&FILEIN1" ), 1, '/\' )) ;
      fname2 = reverse( scan( reverse( "&FILEIN2" ), 1, '/\' )) ;

      call symput( 'FNAME1', fname1 ) ;
      call symput( 'FNAME2', fname2 ) ;
   run ;

   /* verify validity of physical filename. err-out if non-zero return code */

   %if %sysfunc( fileref( FILE1 )) %then %goto ERROR_PATH1 ;
   %if %sysfunc( fileref( FILE2 )) %then %goto ERROR_PATH2 ;

   /*==================================================================*/
   /* read lines of text into SAS dataset format
   /* apply parameter settings to text
   /*==================================================================*/

   data dsn1 ;
      length line1 text1 $ 32767 ;
      retain maxreclen -1 ;

      infile FILE1 eof=lastobs length=reclen lrecl=32767 truncover ;

      input text1 $varying32767. reclen ;

      %if &FLG_IGNORE_BLANK_LINES %then %str( if lengthn( compress( text1,, 's' )) ; ) ;

      linenum1 + 1 ;

      line1 = %if &FLG_IGNORE_CASE %then %str( upcase( text1 ) ; ) ; %else %str( text1 ; ) ;

      %if &FLG_IGNORE_WHITE_SPACE %then %str( line1 = compress( line1,, 's' ) ; ) ;

      maxreclen = max( maxreclen, reclen ) ;

      return ;

LASTOBS: 
      call symput( 'N_OBS1' , put( linenum1 , best12. )) ;

      call symput( 'MAXLEN1', put( maxreclen, 5. )) ;

      drop maxreclen ;
   run ;

   data dsn2 ;
      length line2 text2 $ 32767 ;
      retain maxreclen -1 ;

      infile FILE2 eof=lastobs length=reclen lrecl=32767 truncover ;

      input text2 $varying32767. reclen ;

      %if &FLG_IGNORE_BLANK_LINES %then %str( if lengthn( compress( text2,, 's' )) ; ) ;

      linenum2 + 1 ;

      line2 = %if &FLG_IGNORE_CASE %then %str( upcase( text2 ) ; ) ; %else %str( text2 ; ) ;

      %if &FLG_IGNORE_WHITE_SPACE %then %str( line2 = compress( line2,, 's' ) ; ) ;

      maxreclen = max( maxreclen, reclen ) ;

      return ;

LASTOBS:
      call symput( 'N_OBS2' , put( linenum2 , best12. )) ;

      call symput( 'MAXLEN2', put( maxreclen, 5. )) ;

      drop maxreclen ;
   run ;

   /*==================================================================*/
   /* perform differential file comparison
   /*
   /* note: PROC REPORT linesize must be between 64 and 256
   /*==================================================================*/

   %let MAXLEN = %sysfunc( max( &MAXLEN1, &MAXLEN2 )) ;
   %let MAXOBS = %eval( &N_OBS1 + &N_OBS2 ) ; /* if both dsn1 and dsn2 are completely disjoint */

   data diff1( keep= linenum1 line_dsn1 line_dsn2 line1 text1 ) diff2( keep= linenum2 line_dsn1 line_dsn2 line2 text2 ) ;
      array match1[ &MAXOBS ] _temporary_ ( &MAXOBS * 0 ) ;
      array match2[ &MAXOBS ] _temporary_ ( &MAXOBS * 0 ) ;

      /* find subset of dsn2 in dsn1 */

      do i = 1 to &N_OBS1 ;
         line_dsn1 = i ;

         set dsn1 point=i ; /* read line1, text1 from dsn1 into Program Data Vector */

         /* if no match on previous search, set line_dsn2 = missing */

         match = 0 ;

         do j = 1 to &N_OBS2 ;
            line_dsn2 = . ; /* assume no match */

            if ^match2[ j ] /* check only dsn2 lines that have not been previously matched */
            then do ;
               set dsn2 point=j ; /* add line2, text2 from dsn2 to PDV */

               /* test for equality. if match, save respective line positns, leave loop */

               match = ( line1 = line2 ) & ( abs( i - j ) <= &WINDOW ) ;

               if match then do ; line_dsn2 = j ; match2[ j ] = 1 ; leave ; end ;
            end ;
         end ;

         output diff1 ;
      end ;

      /* find subset of dsn1 in dsn2 */

      do j = 1 to &N_OBS2 ;
         line_dsn2 = j ;

         set dsn2 point=j ;

         match = 0 ;

         do i = 1 to &N_OBS1 ;
            line_dsn1 = . ;

            if ^match1[ i ]
            then do ;
               set dsn1 point=i ;

               match = ( line1 = line2 ) & ( abs( j - i ) <= &WINDOW ) ;

               if match then do ; line_dsn1 = i ; match1[ i ] = 1 ; leave ; end ;
            end ;
         end ;

         output diff2 ;
      end ;

      stop ; /* to prevent infinite loop since data step iterates */
   run ;

   /* if blank line, set pointer to missing to force inclusion in output dataset */

   data diff1 ; set diff1 ; if ^lengthn( text1 ) then line_dsn2 = . ; run ;
   data diff2 ; set diff2 ; if ^lengthn( text2 ) then line_dsn1 = . ; run ;

   data diff12 ;
      retain rec_ptr1 rec_ptr2 1 ;

      do until( lastobs1 & lastobs2 ) ;
         /* read a record from diff1, diff2
          * determine match or non-matching subset
          * output
          */

         if ^lastobs1 then set diff1( rename=( line_dsn2=diff1_line_dsn2 )) point = rec_ptr1 ;

         if ^lastobs2 then set diff2( rename=( line_dsn1=diff2_line_dsn1 )) point = rec_ptr2 ;

		 state = 10 * missing( diff1_line_dsn2 ) + missing( diff2_line_dsn1 ) ;

		 select( state ) ;
		 when( 00 ) do ; rec_ptr1 + ^lastobs1 ; rec_ptr2 + ^lastobs2      ; output ; end ; /*  text1 &  text2 */
		 when( 01 ) do ; linenum1 = . ; text1 = '' ; rec_ptr2 + ^lastobs2 ; output ; end ; /* ^text1 &  text2 */
		 when( 10 ) do ; linenum2 = . ; text2 = '' ; rec_ptr1 + ^lastobs1 ; output ; end ; /*  text1 & ^text2 */
		 when( 11 ) do ;
						rec_ptr1 + ^lastobs1 ; rec_ptr2 + ^lastobs2 ;

						text1 = ifc( text1 = lag( text1 ), '', text1 ) ;
		 				text2 = ifc( text2 = lag( text2 ), '', text2 ) ;

						output ;
					end ; /* ^text1 & ^text2 */
		 otherwise putlog '>>> ERROR: ought not to occur' ;
		 end ;

		 lastobs1 = rec_ptr1 > &N_OBS1 ; lastobs2 = rec_ptr2 > &N_OBS2 ;

         /* compute max size of comparison window */

         if ^missing( diff1_line_dsn2 ) & ^missing( diff2_line_dsn1 )
         then max_diff = max(  abs( diff1_line_dsn2 - diff2_line_dsn1 ), max_diff ) ;
      end ;

      putlog 'NOTE: Maximum difference for comparison window = ' max_diff ;

      stop ;
   run ;

   /*==================================================================*/
   /* report results of comparing original to new file
   /*==================================================================*/

   %let COLWIDTH = %sysfunc( ceil( %sysfunc( log10( &MAXOBS )))) ;

   data diff ;
      length lineno1 lineno2 $ &COLWIDTH source $ 1 text1 text2 $ &MAXLEN ;

      do i = 1 to n_obs ;
         set diff12( keep= linenum: line: text: ) nobs=n_obs point = i ;


         if  lengthn( text1 ) & ^lengthn( text2 )                      then source = '<' ; else
         if ^lengthn( text1 ) &  lengthn( text2 )                      then source = '>' ; else
         if  lengthn( text1 ) &  lengthn( text2 ) & ( text1  = text2 ) then source = ' ' ; else
         if  lengthn( text1 ) &  lengthn( text2 ) & ( text1 ^= text2 ) then source = '|' ;

         %if &FLG_IGNORE_MATCHES %then %str( if missing( source ) then continue ; ) ;

         lineno1 = ifc( source = '>', ' ', put( linenum1, &COLWIDTH.. )) ;
         lineno2 = ifc( source = '<', ' ', put( linenum2, &COLWIDTH.. )) ;

         output ;
      end ;

      stop ;

      drop linenum: ;
   run ;

   %if &FLG_FLOW %then %let FLOW = flow ; %else %let FLOW = ;

   %if ^%length( &LINESIZE ) %then %let LINESIZE = %sysfunc( getoption( LS )) ;

   /* "- 5" because spacing=1 btwn 4 columns and source char requires 1 column */

   %let WIDTH = %sysfunc( min( &MAXLEN, %sysevalf( .5 *( &LINESIZE - 2 * &COLWIDTH - 5 ), integer ))) ;

   title '%SASDIFF Differential File Comparison' ;
   proc report data=diff headline headskip ls=&LINESIZE nowindows spacing=1 split='`' ;
      column lineno1 text1 source text2 lineno2 ;

      define lineno1  / display width=&COLWIDTH    " "              ;
      define text1    / display &FLOW width=&WIDTH "%trim(&FNAME1)" ;
      define source   / display       width=1      " "              ;
      define text2    / display &FLOW width=&WIDTH "%trim(&FNAME2)" ;
      define lineno2  / display width=&COLWIDTH    " "              ;
   run ;
   title ;

   %goto L9999 ;

%ERROR_PATH1:
   %put %sysfunc( sysmsg()) ;
   %goto L9999 ;

%ERROR_PATH2:
   %put %sysfunc( sysmsg()) ;
   %goto L9999 ;

%L9999: /* exit the macro */

   proc datasets library=work nolist ; delete diff diff1 diff2 diff12 dsn1 dsn2 ; quit ;

%mend SASDIFF ;
Version history
Last update:
‎12-20-2022 04:15 PM
Updated by:
Contributors

sas-innovate-white.png

Our biggest data and AI event of the year.

Don’t miss the livestream kicking off May 7. It’s free. It’s easy. And it’s the best seat in the house.

Join us virtually with our complimentary SAS Innovate Digital Pass. Watch live or on-demand in multiple languages, with translations available to help you get the most out of every session.

 

Register now!

SAS AI and Machine Learning Courses

The rapid growth of AI technologies is driving an AI skills gap and demand for AI talent. Ready to grow your AI literacy? SAS offers free ways to get started for beginners, business leaders, and analytics professionals of all skill levels. Your future self will thank you.

Get started

Article Tags