BookmarkSubscribeRSS Feed
Ronein
Meteorite | Level 14

Hello

Let's say that I want to check for each char var what is its max length and then reduce the length of each var to its maximum length.

I have started write the code but I need to do more 2 steps and I need the advice/help here

/**DETERMINE THE MAX LENGTH FOR EACH CHAR VAR**/

/***Apply the rule that if max len is higher then current len than reduce the var len****/

 

%macro RRR(data=);
%local 
 data_upcase
  library_name
 member_name 
character_vars 
length_character_vars
 num_charvars
 i;

%let    data_upcase = %upcase(&data); 
%let   member_name=%scan(&data_upcase, -1, %str(.));  
%let   library_name=%scan(&data_upcase, 1, %str(.)); 
 %if    &library_name = &member_name 
   %then    %let    library_name=WORK;

/**COPY CHARACTER VARIABLE NAMES AND LENGTHS TO MACRO VARIABLES; TERMINATE IF NO CHARACTER VARIABLES**/
proc sql noprint; 
select name, length
 into :character_vars separated by ' ', 
:length_character_vars separated by ' '
 from dictionary.columns
 where libname="&library_name"
             and memname="&member_name" 
             and type = "char";
 %let num_charvars=&sqlobs; 
quit;

/**If the data set has no character variables, notify the user and terminate**/
%if &num_charvars=0   %then   %do; 
%put   NOTE:    No char variables in data set &data, macro ends;
 %return; 
%end;

/**DETERMINE THE MAX LENGTH FOR EACH CHAR VAR**/

/***Apply the rule that if max len is higher then current len than reduce the var len****/







 

8 REPLIES 8
Ksharp
Super User

Here is an example code. Only just need change the first two line (path of sas dataset) .

 

/*The path of original sas dataset*/
libname in v9  "D:\XiaKeShan\数据管理\批量修改数据集的名称_LABEL_变量的名称_LABEL_长度2\原始数据"    access=readonly;

/*The path of output*/
libname out v9  "D:\XiaKeShan\数据管理\批量修改数据集的名称_LABEL_变量的名称_LABEL_长度2\处理后的数据" ;




















/*清空导出库的 数据集*/
proc datasets library=out kill nolist nodetails;
quit;
/*清空work库的 数据集*/
proc datasets library=work kill nolist nodetails;
quit;
/*将 数据集名 统一为 小写*/
proc copy in=in out=out memtype=data;
run;

/********************更改变量存储长度 **********************************/
%macro change_var_len;
/*计算所有数据集中字符变量的长度*/
data _null_;
 set sashelp.vcolumn(keep=libname memname name type where=(libname='OUT' and upcase(type)='CHAR')) end=last;
 by memname;
 if _n_=1 then call execute('proc sql;');
 if first.memname then call execute(catt('create table _',memname,' as select '));
 call execute(cat('max(length(',strip(name),')) as ',name));
 if not last.memname then call execute(',');
  else call execute(catt('from out.',memname,';'));
 if last then call execute('quit;');
run;
/*将上面产生的数据集转置 wide -> long*/
data _null_;
 set sashelp.vtable(keep=libname memname where=(libname='WORK' and memname =: '_'));
 call execute(catt('proc transpose data=',memname,' out=_',memname,';run;'));
run;
/*合并转置的数据集*/
data change_var_len;
 length _name_ $ 40;
 set __: indsname=indsname;
 dsn=indsname;
run;
/*计算变量在所有数据集中的 最长长度*/
proc sql;
create table change_var_len_1 as
 select *,substr(dsn,8) as new_dsn length=40,max(col1) as max_len
  from change_var_len
   where col1 is not missing
    group by _NAME_
     order by dsn;
quit;
/*按变量的最长长度 修改数据集中所有变量的长度*/
data _null_;
 set change_var_len_1 end=last;
 by dsn;
 if _n_=1 then call execute('proc sql;');
 if first.dsn then call execute(catt('alter table out.',new_dsn,' modify '));
 call execute(catt(_name_,' char(',max_len,')'));
 if not last.dsn then call execute(',');
  else call execute(catt(';'));
 if last then call execute('quit;');
run;
%mend;
%change_var_len
Kurt_Bremser
Super User

Next step: get actual maximum lengths.

%if &num_charvars. = 0
%then %do; 
  %put   NOTE:    No char variables in data set &data, macro ends;
  %return; 
%end;
%else %do;
  proc sql;
  create table max_len as select
  %do i = 1 %to &num_charvars.;
    %if &i. > 1 %then ,;
    "%scan(&character_vars.,&i.)"
    , max(length(%scan(&character_vars.,&i.)))
  %end;
  from &library_name..&member_name.;
  quit;
%end;

Now you have a dataset containing variable names and their max used length, from which you can create a DATA step (using CALL EXECUTE or writing to an include file) that sets the lengths.

 

Code is untested, posted from my tablet.

Tom
Super User Tom
Super User

A simple way to find the maximum length of variable is an SQL query that takes the MAX() of the LENGTH() of the variable.

 

There is no need to compare the defined length to the actual maximum length.  Only use it might have is in reporting or perhaps it cancelling the change if all of the variables are already short.

 

You probably need to keep ALL of the variables, not just the character ones.  Otherwise whatever method you use to shorten the names might change the order of the variables in the dataset.

 

%macro RRR(data,out);
%local memname libname num_vars i  ;
* Pull the libname and memname from data ;
%let memname=%scan(%upcase(&data),-1,.);
%let libname=%scan(%upcase(WORK.&data),-2,.);
* Check if OUT is specified ;
%if 0=%length(&out) %then %do;
  %put WARNING: &sysmacroname: No output dataset specified.
The input dataset will be replaced.;
  %let out=&data;
%end;

* Create macro variables with info on all variables;
proc sql noprint;
select varnum, type, length, nliteral(name)
 into :type1,:type1-,:len1-,:var1-
 from dictionary.columns
 where libname="&libname"
   and memname="&memname"
 order by varnum
;
%let num_vars=&sqlobs;
quit;

%if &num_vars %then %do;
* Create dataset with max length of each variable ;
proc sql ;
create table max_len as select
  %do i=1 %to &num_vars;
    %if &i>1 %then ,;
    %if &&type&i=num %then "&&len&i";
    %else %if &&len&i=1 %then '$1';
    %else cats('$',max(length(&&var&i)));
    as &&var&i length=6
  %end;
from &libname..&memname
;
quit;

proc transpose data=max_len out=max_len;
  var _all_;
run;

data _null_;
  if _n_=1 then call execute("data &out; length ");
  set max_len;
  call execute(catx(' ',nliteral(_name_),col1));
  if _n_=&num_vars then call execute(";set &data;run;");
run;

%end;
%else %do;
  %put ERROR: &sysmacroname: Did not find any variables in &=data..;
%end;
%mend rrr;

Let's try it:

data class1 ;
  length name $20 sex $8 age 3;
  set sashelp.class;
run;
%rrr(class1,class2);
proc compare data=class1 compare=class2;
run;

Results

Variables Summary

Number of Variables in Common: 5.
Number of Variables with Differing Attributes: 2.


Listing of Common Variables with Differing Attributes

Variable  Dataset      Type  Length

name      WORK.CLASS1  Char      20
          WORK.CLASS2  Char       7
sex       WORK.CLASS1  Char       8
          WORK.CLASS2  Char       1


Observation Summary

Observation      Base  Compare

First Obs           1        1
Last  Obs          19       19

Number of Observations in Common: 19.
Total Number of Observations Read from WORK.CLASS1: 19.
Total Number of Observations Read from WORK.CLASS2: 19.

Number of Observations with Some Compared Variables Unequal: 0.
Number of Observations with All Compared Variables Equal: 19.

NOTE: No unequal values were found. All values compared are exactly equal.
s_lassen
Meteorite | Level 14

It can be done in two steps, like this:

filename tempsas temp;

data _null_;
  set sashelp.cars end=done;
  array _chars_ _character_;
  array _lengths_ (1:1000) _temporary_;
  do _N_=1 to dim(_chars_);
    _lengths_(_N_)=max(_lengths_(_N_),length(_chars_(_N_)));
    end;
  if done;
  file tempsas;
  do _N_=1 to dim(_chars_);
    _name_=vname(_chars_(_N_));
    put 'length ' _name_ '$' _lengths_(_N_) ';';
    end;
run;

data cars;
  %include tempsas/source2;
  set sashelp.cars;
run;

So, basically, first you generate the LENGTH statements in a temporary file, then you execute them with %INCLUDE. The example shown will generate a warning in the log:

WARNING: Multiple lengths were specified for the variable Type by input data set(s). This can cause truncation of data.

- which actually shows that the TYPE variable is longer than necessary in the original data set.

 

I am not sure that there is that much to be gained by shortening the lengths of character variables, though. If you want to save disk space, it is much more efficient to use the 

options compress=char;

statement before creating the output data.

Patrick
Opal | Level 21

@s_lassen wrote:

...

I am not sure that there is that much to be gained by shortening the lengths of character variables, though. If you want to save disk space, it is much more efficient to use the 

options compress=char;

@s_lassen Not so sure that your statement holds true. Besides of the compress overhead variables get also fully expanded in memory (pdv, hash table). Compare the runtimes of below sample code and you'll see.

options fullstimer;

/* write */
data test_1(compress=char);
  array vars{100} $1 (100*'A');
  do i=1 to 5000;
    output;
  end;
run;

data test_2(compress=char);
  array vars{100} $32767 (100*'A');
  do i=1 to 5000;
    output;
  end;
run;

/* read */
data _null_;
  set test_1;
run;

data _null_;
  set test_2;
run;

 SAS log from run on my laptop

Spoiler
31         /* write */
32         data test_1(compress=char);
33           array vars{100} $1 (100*'A');
34           do i=1 to 5000;
35             output;
36           end;
37         run;

NOTE: The data set WORK.TEST_1 has 5000 observations and 101 variables.
NOTE: Compressing data set WORK.TEST_1 decreased size by 66.67 percent. 
      Compressed is 3 pages; un-compressed would require 9 pages.
NOTE: DATA statement used (Total process time):
      real time           0.01 seconds
      user cpu time       0.00 seconds
      system cpu time     0.01 seconds
      memory              768.56k
      OS Memory           20080.00k
      Timestamp           11/20/2023 07:13:26 PM
      Step Count                        52  Switch Count  2
      

38         
39         data test_2(compress=char);
40           array vars{100} $32767 (100*'A');
41           do i=1 to 5000;
42             output;
43           end;
44         run;

NOTE: The data set WORK.TEST_2 has 5000 observations and 101 variables.
NOTE: Compressing data set WORK.TEST_2 decreased size by 99.76 percent. 
      Compressed is 12 pages; un-compressed would require 5000 pages.
NOTE: DATA statement used (Total process time):
      real time           7.01 seconds
      user cpu time       6.64 seconds
      system cpu time     0.04 seconds
      memory              22972.43k
      OS Memory           43792.00k
      Timestamp           11/20/2023 07:13:33 PM
      Step Count                        53  Switch Count  2
      

45         
46         /* read */
47         data _null_;
48           set test_1;
49         run;

NOTE: There were 5000 observations read from the data set WORK.TEST_1.
NOTE: DATA statement used (Total process time):
      real time           0.00 seconds
      user cpu time       0.00 seconds
      system cpu time     0.00 seconds
      memory              1001.62k
      OS Memory           21360.00k
      Timestamp           11/20/2023 07:13:33 PM
      Step Count                        54  Switch Count  2
      

50         
51         data _null_;
52           set test_2;
53         run;

NOTE: There were 5000 observations read from the data set WORK.TEST_2.
NOTE: DATA statement used (Total process time):
      real time           1.90 seconds
      user cpu time       1.87 seconds
      system cpu time     0.01 seconds
      memory              26612.15k
      OS Memory           46360.00k
      Timestamp           11/20/2023 07:13:35 PM
      Step Count                        55  Switch Count  4

 

Kurt_Bremser
Super User

Uncompressing a dataset will need resources, but I've found that the performance penalty is usually negligible (the CPU time needed for uncompressing is outweighed by the much smaller amount of data to be physically read from disk).

But where unnecessary long variables really have an impact is during sorting and SQL operations, because the utility files are always uncompressed.

Patrick
Opal | Level 21

Even though most of the time not an issue, what I don't like about all the proposed solutions including the %squeeze macro is that they all don't maintain table properties like indexes and constraints. 

 

The main idea of below code is to use Proc SQL/Modify that allows to change variable lengths in place. If a copy of the source table is required with the changed variable lengths then the code uses Proc Datasets to also include table properties like indexes and constraints.

 

I've also added the option to collect the before/after changes in a log table to cover the comment in your code "If the data set has no character variables, notify the user and terminate".

You then could use this change log table to send out an email with the changes - or just use it for documentation. 

To collect the change info in a table (and not terminate the process) allows you to run the macro over multiple tables and then take a single action (like single status change email) at the end.

 

Macro definition

Spoiler
%macro shrink_vars(
  source_tbl                  /* source_tbl: table name - required                              */
  ,target_tbl                 /* target_tbl: table name, empty if same as source_tbl - optional */
  ,change_log_name            /* name of table for collection of change log info - optional     */
  ,change_log_replace         /* Y: replace table &change_log_name, any other value: append     */
  );

  %local inlib intbl;
  %let inlib=%upcase(%scan(work.&source_tbl,-2,.));
  %let intbl=%upcase(%scan(&source_tbl,-1,.));

  %local outlib outtbl;
  %if %nrbquote(&target_tbl) = %nrbquote() %then
    %do;
      %let outlib=&inlib;
      %let outtbl=&intbl;
    %end;
  %else
    %do;
      %let outlib=%upcase(%scan(work.&target_tbl,-2,.));
      %let outtbl=%upcase(%scan(&target_tbl,-1,.));
    %end;
  
  %local sv_dlcreatedir;
  %let sv_dlcreatedir=%sysfunc(getoption(dlcreatedir));
  options dlcreatedir;
  libname _tmpwrk "%sysfunc(pathname(work))\_tmpwrk";

  /* identify vars in scope for change */
  %local varlist def_len_list n_vars_1;
  proc sql noprint;
    create table _tmpwrk.tbl_meta_pre as
    select
      c.name
      ,c.length
      ,c.varnum
    from 
      ( select distinct libname 
        from dictionary.libnames 
        where libname="&inlib" and engine='V9'
      ) l

      inner join dictionary.tables t
        on
              l.libname=t.libname
          and t.memname="&intbl" 
          and t.memtype='DATA'

      inner join dictionary.columns c
        on
              t.libname=c.libname
          and t.memname=c.memname
          and c.type='char'
          and c.length>1
    ;

    select 
      name
      ,length
      ,count(*)
       into
        :varlist separated by ' '
        ,:def_len_list separated by ','
        ,:n_vars_1 trimmed
    from _tmpwrk.tbl_meta_pre
    order by 
      varnum
    ;
  quit;

  /* process only if there are any vars in scope for change */
  %local n_vars_2;
  %let n_vars_2=0;

  %if &n_vars_1>0 %then
    %do;
      /* determine max string lengths for vars in scope */
      proc sql;
        create table _tmpwrk.max_lengths as
        select
          %do i=1 %to &n_vars_1;
            %let var=%scan(&varlist,&i,%str( ));
            %if &i>1 %then %str(,);
            "%scan(&varlist,&i,%str( ))" as var_name_&i length=32
            ,max(length(&var)) as var_len_&i
          %end;
        from &inlib..&intbl
        ;
      quit;

      /* create syntax for SQL Alter table modify statement */
      data _null_;
        set _tmpwrk.max_lengths;
        array vn {&n_vars_1} var_name_:;
        array vl {&n_vars_1} var_len_:;
        array dl {&n_vars_1} 8 _temporary_ (&def_len_list);
        length __expr $32767;
        do __i=1 to &n_vars_1;
          /* only generate change code for variables where the string length is shorter than */
          /* the already defined variable length                                             */
          if vl[__i] ne dl[__i] then
            do;
              __expr=catx(',',__expr,catx(' ',vn[__i],cats('char(',vl[__i],')')));
              __n_vars_2+1;
            end;
        end;
        call symputx('expr',__expr,'l');
        call symputx('n_vars_2',__n_vars_2,'l');
      run;
    %end; 

  /* if no target table provided or target same as source: changes applied in-place on source table */
  %if %nrbquote(&target_tbl) = %nrbquote() or (&inlib=&outlib and &intbl=&outtbl) %then
    %do;
      /* process only if there are any vars in scope remaining for change */
      %if &n_vars_2>0 %then
        %do;
          proc sql;
            alter table &inlib..&intbl
              modify &expr;
          quit;
        %end;
    %end;

  /* else if target table (&target_tbl populated) provided: changes applied on target table             */
  /* - requires to create the target table so this code needs to execute for any scenario               */
  /* - creates first a zero row table in _tmpwrk. This approach also allows to create the target table  */
  /*   under the same libref as the source table using Proc Datasets should that be requested           */
  %else
  %if %nrbquote(&target_tbl) ne %nrbquote() %then
    %do;            
      /* 1. copy table structure (zero rows) to target */
      %local sv_obs;
      %let sv_obs=%sysfunc(getoption(obs,keyword));
      options obs=0;
      proc datasets lib=_tmpwrk nolist;
        /* copy of source table with zero rows from &inlib to _tmpwrk */
        copy
          in=&inlib
          out=_tmpwrk
          constraint=yes
          index=yes
          memtype=data
          ;
        select &intbl;
        run;
        /* rename source table to target table name */
        change &intbl=&outtbl;
        run;
        /* move target table from _tmpwrk to &outlib */
        copy
          in=_tmpwrk
          out=&outlib
          constraint=yes
          index=yes
          memtype=data
          move
          ;
        select &outtbl;
        run;
      quit;
      options &sv_obs;

      /* 2. alter target table structure */
      /* process only if there are any vars in scope remaining for change */
      %if &n_vars_2>0 %then
        %do;
          proc sql;
            alter table &outlib..&outtbl
              modify &expr;
          quit;
        %end;

      /* 3 append source data to target table structure */

      /* ensure options varlenchk is not set to error */
      %local sv_varlenchk;
      %let sv_varlenchk=%sysfunc(getoption(varlenchk,keyword));
      options varlenchk=nowarn;

      /* append source data to zero row target table structure */

      /* option 1: performs better but throws warning */
      /**
      %if &n_vars_2 ne 0 %then
        %do;
          data _null_;
            put 'Note: This code returns warning: "Multiple lengths were specified..."';
            put 'Note: This is expected and o.k.';
          run;
        %end;
      proc append base=&outlib..&outtbl data=&inlib..&intbl force;
      run;quit;
      **/

      /* option 2: more IO but throws no warning */ 
      data _tmpwrk.source_tbl / view=_tmpwrk.source_tbl;
        if 0 then set &outlib..&outtbl;
        set &inlib..&intbl;
      run;
      proc append base=&outlib..&outtbl data=_tmpwrk.source_tbl force;
      run;quit;

      /* reset option varlenchk to previous value */
      options &sv_varlenchk;


    %end; /* processing for source ne target */


  /* create and populate change log table if requested (=parameter &change_log_name not empty) */
  %if %nrbquote(&change_log_name) ne %nrbquote() %then
    %do;
      data _tmpwrk.change_log_base;
        attrib 
          source_tbl_lib    length=$8
          source_tbl_table  length=$32
          target_tbl_lib    length=$8
          target_tbl_table  length=$32
          ;
        source_tbl_lib  ="&inlib";
        source_tbl_table="&intbl";
        target_tbl_lib  ="&outlib";
        target_tbl_table="&outtbl";
      run;

      proc sql;
        create table _tmpwrk.change_log_det as
          select 
            t1.*
            ,t2.*
            ,case
              when missing(l.engine) then 'Macro will only change SAS V9 tables'
              when missing(t2.name)  then 'No variables required change'
              else ' '
             end as comment
          from 
            _tmpwrk.change_log_base t1
            left join
            ( select distinct engine
              from dictionary.libnames
              where 
                libname="&outlib"
                and engine='V9'
            ) l
          on 1=1

          left join
          (
            select
               pre.name label='name'
              ,pre.length as length_before label='length_before'
              ,post.length as length_after label='length_after'
              ,pre.varnum label='varnum'
            from 
              _tmpwrk.tbl_meta_pre pre
              inner join
              (
                select
                  name
                  ,libname
                  ,memname
                  ,length
                  ,varnum
                from dictionary.columns
                where 
                  libname="&outlib" 
                  and memname="&outtbl" 
                  and type='char'
              ) post
            on 
              pre.name=post.name
              and pre.length ne post.length
          ) t2
          on 1=1
          order by t2.varnum
          ;
      quit;
      %if %upcase(&change_log_replace)=Y or %upcase(&change_log_replace)=YES %then
        %do;
          proc datasets lib=%scan(work.&change_log_name,-2,.) nolist nowarn;
            delete %scan(&change_log_name,-1,.);
            run;
          quit;
        %end;
      proc append base=&change_log_name data=_tmpwrk.change_log_det force nowarn;
      run;quit;
    %end;

  /* housekeeping */
  options &sv_dlcreatedir;
  proc datasets lib=_tmpwrk kill nolist nowarn;
  run;quit;
  libname _tmpwrk clear;

%mend;

 

Test macro calls

Spoiler
/* Two changes: name & new_var */
data have_1;
  set sashelp.class;
  length new_var $10;
  new_var=' AA';
run;

/* one change: name */
data have_2;
  set sashelp.class;
  new_var=' AA';
run;

/* no change */
data have_3(index=(sex name) label='Have 3');
  length Name $7;
  set sashelp.class(rename=(name=_name));
  name=_name;
  drop _name;
run;

data have_4;
  set have_3;
run;

data _null_;
  file print;
  infile datalines truncover dsd dlm=',';
  input source_tbl:$41. target_tbl:$41. change_log_name:$41. change_log_replace:$1.;
  length cmd $300;

  /* using dosubl() */
  cmd=cats('%shrink_vars(',source_tbl,',',target_tbl,',',change_log_name,',',change_log_replace,');');
  put "Macro call: " cmd;
  rc=dosubl( cmd );

  /* using call excute(): requires macro call within %nrstr(). Reason here: */
  /* https://communities.sas.com/t5/SAS-Programming/SAS-macro-variable-not-getting-resolved/td-p/618067#:~:text=There%20is%20a%20timing%20issue,()%20to%20invoke%20a%20macro.&text=call%20execute(%22%25mymacro(),from%20evaluating%20the%20macro%20expressions. */
/*  cmd=cats('%shrink_vars(',source_tbl,',',target_tbl,',',change_log_name,',',change_log_replace,');');*/
/*  cmd=cats('%nrstr(',cmd,')');*/
/*  put "Macro call: " cmd;*/
/*  call execute(cmd);*/

  datalines;
have_1,,change_log,y
work.have_2,have_2_new,work.change_log
work.have_3,have_3_new,work.change_log
work.have_4,,work.change_log
;

proc print data=work.change_log;
run;
proc contents data=have_3_new;
run;quit;

 

Results of above calls

Spoiler
Patrick_0-1700449545419.png

 

 

 

 

 

sas-innovate-2024.png

Available on demand!

Missed SAS Innovate Las Vegas? Watch all the action for free! View the keynotes, general sessions and 22 breakouts on demand.

 

Register now!

How to Concatenate Values

Learn how use the CAT functions in SAS to join values from multiple variables into a single value.

Find more tutorials on the SAS Users YouTube channel.

Click image to register for webinarClick image to register for webinar

Classroom Training Available!

Select SAS Training centers are offering in-person courses. View upcoming courses for:

View all other training opportunities.

Discussion stats
  • 8 replies
  • 979 views
  • 1 like
  • 7 in conversation