Background:
Recently, I got a task to read the text data to SAS automatically. And the text data is transferred via FTP from other application. The users select the data they want on the application, then click the "send" button, and the data will be send to the FTP. So, I write a SAS program to scan the FTP and find the text data, then download it to the local machine. Because the data is selected by different user for different purpose, the text data format is not same. So, I can only use the IMPORT procedure to read the data to SAS data set.
Problem:
1. Some ID variables are composed of more than 15 numbers, for example: 986509856125436590. The IMPORT procedure recognizes it as a numeric variable cause its value is all digits. And yet the numeric variable can not store a such big number, it will be distorted.
2. Some code values are fixed length, and composed of digits. for example: 001, 018, 023. The IMPORT procedure also recognizes it as a numeric variable, which is not we expected.
3. The IMPORT procedure doesn't support double characters delimiter, it will generate extra variables.
Solution in a general way:
1. For problem 1 and 2, I can ask the application guy to add quotation marks to the character values.
2. For problem 3, just use another data step to drop the extra variables.
My Solution:
As we know, when the IMPORT procedure reads the delimited file, actually the data is read by data step. The job of IMPORT procedure is to generate the data step code. So, Why not generate it by myself?
So, I write a macro to do this. Here is the result:
the text data:
Result of IMPORT procedure:
Result of import macro:
proc import datafile='D:\test.txt' out=test_import dbms=dlm replace;
delimiter='||';
getnames=yes;
run;
%custom_import_data(%str(D:\test.txt),%str(||),test_my_import,get_names=yes);
And Here is the macro code, it does solve my problem for now, I'm not sure it can import any kind of delimited file correctly. Maybe there is a bug I didn't notice, and I'm sure there is a way to improve the performance. So, welcome to update it.
%macro custom_import_data(file_path, /*path of delimited file.*/
file_dlm, /*delimiter, can be more than two characters.*/
out_dsn, /*name of output SAS data set.*/
file_encoding=wlatin1, /*encoding of delimitered file.*/
get_names=no, /*whether generates variable names from the first row. */
last_col_dlm=no, /*whether the line is end with the delimiter.*/
guessing_rows=1000); /*the number of rows to scan.*/
%local i;
filename read "&file_path" encoding="&file_encoding";
data _null_;
infile read obs=1;
input;
%if %lowcase(&last_col_dlm)=no %then var_num=count(_infile_,"&file_dlm")+1;
%else var_num=count(_infile_,"&file_dlm");
;
call symput('var_num',compress(var_num));
%if %lowcase(&get_names)=yes %then %do;
do i=1 to var_num;
call symput(cats('var',i),strip(scan(_infile_,i,"&file_dlm")));
end;
%end;
run;
%put The number of variables: &var_num;
data work._null_;
infile read _infile_=line
%if %lowcase(&get_names)=no %then firstobs=1;
%else firstobs=2;
end=last obs=&guessing_rows;
array var_type {&var_num} $ 1;
array var_length {&var_num};
array var_informat {&var_num} $ 20;
array var_format {&var_num} $ 20;
length value $ 1000;
retain var_: p_date1 p_date2 p_date3 p_time p_datetime1 p_datetime2 p_num1 p_num2;
if _n_=1 then do;
p_date1=prxparse('/^\d{4}(\\|-|\/)\d{1,2}(\\|-|\/)\d{1,2}$/');
p_date2=prxparse('/^\d{1,2}(\\|-|\/)\d{1,2}(\\|-|\/)\d{4}$/');
p_date3=prxparse('/^\d{1,2}\D{3}\d{4}$/');
p_time=prxparse('/^\d{1,2}:\d{1,2}:\d{1,2}$/');
p_datetime1=prxparse('/^\d{4}(\\|-|\/)\d{1,2}(\\|-|\/)\d{1,2}(:|\s)*\d{1,2}:\d{1,2}:\d{1,2}$/');
p_datetime2=prxparse('/^\d{1,2}\S{3}\d{4}(:|\s)*\d{1,2}:\d{1,2}:\d{1,2}$/');
p_num1=prxparse('/^-?[1-9]\d{0,10}(\.\d+)?$/');
p_num2=prxparse('/^-?0(\.\d+)?$/');
end;
input;
do i=1 to &var_num;
value=scan(line,i,"&file_dlm");
if var_type{i}='$' then do;
len=length(strip(value));
if var_type{i}='n' then do;
var_type{i}='$';
var_length{i}=len;
var_informat{i}=cats('$',len,'.');
var_format{i}=cats('$',len,'.');
end;
else do;
var_type{i}='$';
if len>var_length{i} then do;
var_length{i}=len;
var_informat{i}=cats('$',len,'.');
var_format{i}=cats('$',len,'.');
end;
end;
end;
else do;
if prxmatch(p_date1,strip(value))>0 then do;
var_type{i}='n';
var_length{i}=8;
var_informat{i}='yymmdd10.';
var_format{i}='yymmdd10.';
end;
else if prxmatch(p_date2,strip(value))>0 then do;
var_type{i}='n';
var_length{i}=8;
var_informat{i}='mmddyy10.';
var_format{i}='mmddyy10.';
end;
else if prxmatch(p_date3,strip(value))>0 then do;
var_type{i}='n';
var_length{i}=8;
var_informat{i}='date9.';
var_format{i}='date9.';
end;
else if prxmatch(p_time,strip(value))>0 then do;
var_type{i}='n';
var_length{i}=8;
var_informat{i}='time9.';
var_format{i}='time9.';
end;
else if prxmatch(p_datetime1,strip(value))>0 or prxmatch(p_datetime2,strip(value))>0 then do;
var_type{i}='n';
var_length{i}=8;
var_informat{i}='anydtdtm40.';
var_format{i}='datetime19.';
end;
else if prxmatch(p_num1,strip(value))>0 or prxmatch(p_num2,strip(value))>0 or strip(value)='' then do;
var_type{i}='n';
var_length{i}=8;
var_informat{i}='best12.';
var_format{i}='best12.';
end;
else do;
len=length(strip(value));
if var_type{i}='n' then do;
var_type{i}='$';
var_length{i}=len;
var_informat{i}=cats('$',len,'.');
var_format{i}=cats('$',len,'.');
end;
else do;
var_type{i}='$';
if len>var_length{i} then do;
var_length{i}=len;
var_informat{i}=cats('$',len,'.');
var_format{i}=cats('$',len,'.');
end;
end;
end;
end;
end;
if last or _n_=&guessing_rows then do;
output;
do i=1 to &var_num;
call symput(cats('var_type',i),strip(var_type{i}));
call symput(cats('var_length',i),strip(var_length{i}));
call symput(cats('var_informat',i),strip(var_informat{i}));
call symput(cats('var_format',i),strip(var_format{i}));
end;
end;
keep var_:;
run;
data &out_dsn;
infile read dlmstr="&file_dlm" dsd
%if %lowcase(&get_names)=no %then firstobs=1;
%else firstobs=2;
;
length
%do i=1 %to &var_num;
%if &&var_type&i=n %then %let var_type&i=;
%if %lowcase(&get_names)=yes %then "&&var&i"n &&var_type&i &&var_length&i;
%else var&i &&var_type&i &&var_length&i;
%end;
;
format
%do i=1 %to &var_num;
%if %lowcase(&get_names)=yes %then "&&var&i"n &&var_format&i;
%else var&i &&var_format&i;
%end;
;
input
%do i=1 %to &var_num;
%if %lowcase(&get_names)=yes %then "&&var&i"n : &&var_informat&i;
%else var&i : &&var_informat&i;
%end;
;
run;
%mend custom_import_data;
... View more