Bookmark | Subscribe | RSS Feed

%SASGREP, a SAS macro to perform the same functions as the UNIX GREP utility

Started ‎12-19-2022 by
Modified ‎12-19-2022 by
Views 1,028

I once worked on a project that required extensive use of regular expression pattern matching, so I wrote the %SASGREP macro to facilitate my work. I share it with the SAS community for others to use. This code is written for applications using the Windows operating system, but because SAS is easily ported to other hosts, it can be modified to work on other computers, including those running UNIX.

 

If you peruse the code, you will see how %SASGREP uses a SAS unnamed pipe (the FILENAME statement with the PIPE device type) to list the contents of a directory and, optionally, its subdirectories; applies Perl regular expression pattern matching to each line of the files found; and selectively reports the results of the search.

 

%macro SASGREP( PATH     /* Windows path to directory the files of which to search              */
              , PRX      /* Perl regular expression used to match strings in files              */
              , CASE=    /* [optional] case-sensitivity flag                                    */
              , LS=120   /* [optional] line size for printing results of match                  */
              , OUT=     /* [optional] name of SAS dataset containing results of match          */
              , PRINT=   /* [optional] switches to print date, time, line # of matched lines    */
              , REPORT=Y /* [optional] switch to create results of search                       */
              , SUBDIR=Y /* [optional] switch to search subdirectories of &PATH                 */
              ) ;

   /* PURPOSE: SAS version of the UNIX Global Regular Expression Print (GREP) utility
    *
    * NOTE:    %SASGREP is written for SAS running under Windows; it shells out to the
    *          Windows 'dir' command through a FILENAME ... PIPE fileref
    *
    * NOTE:    &PATH must contain a valid Windows path, e.g., 'c:' or 'c:\documents and settings'
    *          it may end in a wildcard file specification, e.g., c:/my docs/[star].sas
    *
    * NOTE:    &PRX must be a Perl regular expression (PRX). in the nominal case where a simple string
    *          search is to be performed, &PRX may consist simply of the delimited character string sought,
    *          e.g., /Text to be found/, and no other PRX metacharacters.
    *          &PRX is placed inside a double-quoted SAS string literal, so it must not itself
    *          contain an unescaped double quote.
    *
    *          the syntax of a PRX may be found in the Functions and CALL Routines section of the
    *          Dictionary of Language Elements, which is contained in the SAS Language Dictionary
    *
    * NOTE:    &CASE controls the sensitivity to case (upper-case different from lower-case)
    *             default: upper/lower case is important, e.g., "Case" ^= "CASE" ^= "case"
    *             I ::= case-insensitive search will be performed
    *                   (the PRX 'i' modifier is appended to &PRX)
    *
    * NOTE:    &OUT names the dataset that receives one observation per matching line
    *          if &OUT is blank the dataset is named 'sasgrep'
    *
    * NOTE:    &PRINT controls which descriptive columns are added to the report
    *             D ::= print date file last written
    *             N ::= print line # of line matched relative to start of file
    *             P ::= print directory path containing file in which string was matched
    *             S ::= print size of file in bytes
    *             T ::= print time file last written
    *
    * NOTE:    &REPORT = Y prints the matches with PROC REPORT; any other value suppresses the report
    *
    * NOTE:    if &SUBDIR = Y then all subdirectories of &PATH will be searched
    *          otherwise, only the path named in &PATH will be searched
    *
    * SIDE EFFECTS:
    *    creates dataset 'filenames' (one observation per file found) and dataset &DSNOUT
    *    assigns, then clears, fileref DIRLIST
    *    switches the NOTES system option off while running and restores its prior value
    *
    * ALGORITHM:
    *    use Windows pipe with file reference to execute 'dir' command to obtain directory contents
    *    parse pipe output as if it were a file to extract file names, other info
    *    output complete path name to dataset 'filenames'
    *    apply Perl regular expression pattern matching to files in 'filenames'
    *    write successful matches to dataset &DSNOUT ( = &OUT if requested )
    *    report successful matches ( if requested )
    *
    * EXAMPLES OF USE:
    *    --- simple string search for "SAS" ---
    *    %SASGREP( c:/documents and settings/user_name/My SAS Files, /SAS/ )
    *
    *    --- simple string search for "sasgrep" in text files ---
    *    %SASGREP( c:/my docs/[star].txt, /sasgrep/, out=sasgrep_out, report=n, print=dnst )
    *
    *    --- simple case-insensitive string search for "sasgrep" in SAS programs ---
    *    %SASGREP( c:/my docs/[star].sas, /sasgrep/, case=i, out=sasgrep_out, report=n, print=dnst )
    *
    *    --- simple case-insensitive string search for "Perl regular expression" ---
    *    %SASGREP( c:/path, /Perl regular expression/, case=i, out=search_results, report=n, print= )
    *
    *    --- string search for telephone number in format '(nnn)nnn-nnnn' or '(nnn) nnn-nnnn' ---
    *    %let PRX = /\([2-9]\d\d\) ?[2-9]\d\d-\d\d\d\d/ ;
    *    %SASGREP( c:/path, &PRX, case=i, out=search_results, report=n, print= )
    *
    *    ( [star] stands for the asterisk wildcard character )
    */

   /* keep every working macro variable local so the caller's symbol tables are never overwritten */

   %local DELIM DSNOUT LINE_SIZE NOTES_OPT PRXPARSE_ERROR SEARCHSAS
          P_DATE P_NUM P_PATH P_SIZE P_TIME ;

   /* verify syntax of &PRX. if error, exit the macro before anything else is touched */

   data _null_ ;
      prx = prxparse( "&PRX" ) ;

      call symput( 'PRXPARSE_ERROR', put( _error_, 1. )) ;
   run ;

   %if &PRXPARSE_ERROR %then %goto L9999 ;

   /*----------------------------------------------------------------------------*/
   /* normalize parameters                                                       */
   /*----------------------------------------------------------------------------*/

   %let DELIM = ' ' ; /* delimiter used to pick the first word off a 'dir' output record */

   %let CASE = %eval( %upcase( "&CASE" ) = "I" ) ; /* 1 = case-insensitive, 0 = case-sensitive */

   %if %length( &OUT ) > 0 %then %let DSNOUT = &OUT ; %else %let DSNOUT = sasgrep ;

   /* translate the letters in &PRINT to one 0/1 flag per optional report column */

   %if %length( &PRINT ) > 0
   %then %do ;
      %let PRINT = %upcase( &PRINT ) ;

      %let P_DATE = %eval( %index( &PRINT, D ) > 0 ) ;
      %let P_NUM  = %eval( %index( &PRINT, N ) > 0 ) ;
      %let P_PATH = %eval( %index( &PRINT, P ) > 0 ) ;
      %let P_SIZE = %eval( %index( &PRINT, S ) > 0 ) ;
      %let P_TIME = %eval( %index( &PRINT, T ) > 0 ) ;
   %end ;
   %else
      %do ; %let P_DATE = 0 ; %let P_NUM = 0 ; %let P_PATH = 0 ; %let P_SIZE = 0 ; %let P_TIME = 0 ; %end ;

   %if %length( &REPORT ) > 0 %then %let REPORT = %upcase( &REPORT ) ; %else %let REPORT = N ;

   /* '/s' makes 'dir' descend into subdirectories */

   %if %upcase( &SUBDIR ) = Y %then %let SUBDIR = /s ; %else %let SUBDIR = ;

   /* a '.' in &PATH is taken to mean a file specification was given;
    * the flag switches on the SAS-dataset filter applied to the directory listing below
    */

   %let SEARCHSAS = %eval( %index( &PATH, . ) > 0 ) ;

   /*============================================================================*/
   /* external storage references                                                */
   /*============================================================================*/

   /* run Windows "dir" DOS command as pipe to get contents of data directory
    *    /-c  : no thousands separators in the size column
    *    /q   : include the file owner column
    *    /t:w : list the date/time each file was last written (as documented for PRINT=D, PRINT=T)
    */

   filename DIRLIST pipe "dir /-c /q &SUBDIR /t:w ""&PATH""" ;

   /*############################################################################*/
   /* begin executable code                                                      */
   /*############################################################################*/

   /* use Windows pipe to find all files in &PATH (and its subdirectories when requested)
    * parse out extraneous data, including unreadable directory paths
    *
    * directory list structure:
    *    "Directory of" record precedes listing of contents of directory:
    *
    *    Directory of <drive>:\<dir>[\<dir>...]
    *    mm/dd/ccyy hh:mm [AM|PM] ['' | size ] owner filename.type
    *
    *    example (shown with two-digit years; the parsing regex below requires four-digit years):
    *
    *       Volume in drive C is WXP
    *       Volume Serial Number is 18C2-3BAA
    *
    *       Directory of C:\Documents and Settings\robett\My Documents\My SAS Files\V8\Test
    *
    *       05/21/03  10:58 AM              CARYNT\robett          .
    *       05/21/03  10:58 AM              CARYNT\robett          ..
    *       12/24/03  10:22 AM              CARYNT\robett          Codebook
    *       04/23/01  02:42 PM               387 CARYNT\robett          printCharMat.sas
    *       10/09/03  11:35 AM             20582 CARYNT\robett          test.log
    *       10/28/03  08:02 AM             58682 CARYNT\robett          test.lst
    *       10/09/03  11:35 AM              1575 CARYNT\robett          test.sas
    */

   data filenames( keep= date dir_path filename size time ) ;
      format date mmddyy8. time timeampm8. ;
      length dir_path filename $256 temp $16 ;
      retain dir_path prx ;

      if _n_ = 1
      then do ;
         /* establish regex to parse input record for date, time, size, owner, filename
          *    regex                     matches
          *    (\d{2}\/\d{2}\/\d{4})\s+  'mm/dd/ccyy' and >= 1 white space
          *    (\d\d:\d\d (?:AM|PM))\s+  'hh:mm AM' or 'hh:mm PM' and >= 1 white space
          *    (\d+)\s+                  nnnnnnn and >= 1 white space   (file size)
          *    (\S+)\s+                  >= 1 non-white-space chars and >= 1 white space (owner)
          *    (\S.*)                    a non-white-space char followed by >= 0 chars (file name)
          *
          * records with no numeric size (e.g., directory entries) do not match and are skipped
          *
          * NOTE(review): the owner is matched as a single blank-free token; an owner containing
          *               a blank would push part of the owner into the file name - confirm whether
          *               such owners can occur in your environment
          */

         prx = prxparse('/(\d{2}\/\d{2}\/\d{4})\s+(\d\d:\d\d (?:AM|PM))\s+(\d+)\s+(\S+)\s+(\S.*)/') ;
      end ;

      infile dirlist ; /* use pipe to get filenames */
      input ;          /* read into _infile_ buffer. faster than reading into variable */

      /* parse directory     record for directory path
       * parse non-directory record for filename, associated information
       */

      if prxmatch( prx, _infile_ )
      then do ;
         filename = prxposn( prx, 5, _infile_ ) ;
         if filename in ( '.' '..' ) then delete ;

         date = input( prxposn( prx, 1, _infile_ ), mmddyy10. ) ;
         time = input( prxposn( prx, 2, _infile_ ), time8.    ) ;
         size = input( prxposn( prx, 3, _infile_ ), best.     ) ;

         /* bug in DOS DIR cmmd: if specify 'dir path\[star].sas', get SAS datasets as well as SAS program files
          * correct error by omitting observations whose extension starts with 'SAS' but is longer than 3 chars
          *
          * the extension is whatever follows the LAST '.' in the name, so that a name such as
          * 'a.sas.txt' is kept and a name such as 'my.data.sas7bdat' is dropped
          */

         if &SEARCHSAS
         then do ;
            ndx = find( filename, '.', -length( filename )) ; /* negative start = search right to left */
            if ndx > 0
            then do ;
               temp = upcase( substr( filename, ndx + 1 )) ;

               if temp =: 'SAS' & length( trim( temp )) > 3 then delete ;
            end ;
         end ;
         output ;
      end ;
      else
      if upcase( scan( _infile_, 1, &DELIM )) = 'DIRECTORY'
      then dir_path = left( substr( _infile_, length( "Directory of" ) + 2 )) ; /* retained for the file records that follow */
   run ;

   filename DIRLIST clear ; /* the pipe has been consumed; release the fileref */

   /* use path+file name to read external files, perform pattern recognition */

   /* turn off printing of NOTES: since each filename read is otherwise printed to the log file
    * remember the current setting so that it can be restored on exit
    */

   %let NOTES_OPT = %sysfunc( getoption( NOTES )) ;

   options nonotes ;

   %if &CASE %then %let PRX = &PRX.i ; /* create regex for case-insensitive search */

   data &DSNOUT ;
      /* file2read must hold dir_path + '\' + filename; it is a FILEVAR= variable and is not written to &DSNOUT */
      length file2read $513 line $32767 ;
      set filenames ;

      retain prx ;

      file2read = catx( '\', dir_path, filename ) ;

      /* 'dummy' is a placeholder; FILEVAR= switches the input file for each observation of 'filenames'
       * LRECL=32767 lets a record fill all of 'line' instead of being cut at the session default
       */

      infile dummy filevar=file2read end=lastobs length=reclen lrecl=32767 ;

      lineno = 0 ; /* initialize line counter relative to file being read */

      if _n_ = 1
      then prx = prxparse( "&PRX" ) ; /* initialize regular expression environment */

      /* read from file2read, match regex to chars in line read from file. if match, output */

      do while( not lastobs ) ;
         input line $varying32767. reclen ;
         lineno + 1 ;

         /* match only the reclen characters actually read, not the blank padding of 'line',
          * so that end-of-line anchors in &PRX behave as expected
          */

         if prxmatch( prx, substrn( line, 1, reclen )) > 0 then output ;
      end ;

      drop prx ;
   run ;

   /*============================================================================*/
   /* create report of matches, if requested                                     */
   /* set PROC REPORT line column size adaptively according to items to be printed */
   /*============================================================================*/

   /* subtract field width + 1 space character per optional descriptive item
    * max of &LS chars/line - 1 - length( filename ) - 1 (print control char)
    */

   %let LINE_SIZE = %eval( &LS - 17 - 1 - 1 - 17*&P_PATH - 7*&P_DATE - 6*&P_TIME - 8*&P_SIZE - 5 *&P_NUM ) ;

   %if &REPORT = Y
   %then %do ;
      title1 '%SASGREP Listing' ;
      title2 "Perl Regular Expression Search Using Regexp=&PRX" ;
      proc report data=&DSNOUT headskip nocenter nowindows spacing=1 split='~' ;
         column
            %if &P_PATH %then dir_path ;
            filename
            %if &P_DATE %then date ;
            %if &P_TIME %then time ;
            %if &P_SIZE %then size ;
            %if &P_NUM  %then lineno ;
            line
            ;

         %if &P_PATH %then %str( define dir_path / order width=16 flow 'Directory' ; ) ;

         define filename / display width=16 flow 'File' ;

         %if &P_DATE %then %str( define date   / display format=mmddyy6. 'Date'   ; ) ;
         %if &P_TIME %then %str( define time   / display format=time5.   'Time'   ; ) ;
         %if &P_SIZE %then %str( define size   / display format=comma7.  'Size'   ; ) ;
         %if &P_NUM  %then %str( define lineno / display format=4.       'Line~#' ; ) ;

         define line / display width=&LINE_SIZE flow 'Line' ;
      run ;
      title ;
   %end ;

   options &NOTES_OPT ; /* restore the NOTES/NONOTES setting that was in effect on entry */

%L9999:
%mend SASGREP ;
Comments

This looks really cool. Thanks for sharing it. Do you mind if I use it with proper attribution?

Regards .... Rich

Thank you for your interest in my work, Rich. Yes, you can distribute it freely and thanks for asking for permission. Ross

Thank you for your interest in my work, Rich. Yes, you can distribute it
freely and thanks for asking for permission. Ross
Version history
Last update:
‎12-19-2022 02:49 PM
Updated by:
Contributors

SAS Innovate 2025: Call for Content

Are you ready for the spotlight? We're accepting content ideas for SAS Innovate 2025 to be held May 6-9 in Orlando, FL. The call is open until September 25. Read more here about why you should contribute and what is in it for you!

Submit your idea!

Free course: Data Literacy Essentials

Data Literacy is for all, even absolute beginners. Jump on board with this free e-learning  and boost your career prospects.

Get Started

Article Tags