My requirement is to create a generic process to ingest file with dynamic schema. In other words, I want this process to read schema type data (name, datatype, length, is_mandatory, etc.,), and read the corresponding file, validate and accept/reject the records. I had this written completely using macro, with list of macro generating code first. But I had hard time having someone (coming from other language background) understand the macro code. So, in order to improve the readability of the code, thought of rewriting that with less of a macro and more of data step and fcmp. But, whatever I have done so far, I am still not happy 🙂 See below for my approach (removed a bunch of details) ** Load Schema information for a file;
data schema;
infile datalines delimiter='~' dsd;
input vname :$40. vtype :$10. vlen :8. mandatory :8.;
datalines;
name~string~20~1
age~number~3~1
regdate~date~20~1
;
run;
** Create few macro variables to hold the list of fields and related statements **;
data schema;
set schema end=last;
format cols i_stmt $255.;
retain cols i_stmt;
cols = catx(' ',trim(cols),cats("'",vname,"'"));
i_stmt = catx(' ',i_stmt,vname,cats(':$',vlen,'.'));
if last then do;
call symputx('cols',cols,'G');
call symputx('istmt',i_stmt,'G');
end;
drop cols i_stmt;
run;
%put &cols;
%put &istmt;
** Read the data file as all character data type **;
data indata;
infile datalines delimiter='~' dsd;
input &istmt;
datalines;
John~34~20080101
Jack~27~20161001
Jill~4X1~20000101
~35~20100101
Rob~58~19970A01
Bob~26~20180101
run;
**Few Functions **;
proc fcmp outlib=work.temp.funcs;
** Get field attributes of a given field **;
subroutine getattr(vname $, rc, vtype $, vlen, mandatory );
outargs rc, vtype, vlen, mandatory;
declare hash attr(dataset:'work.schema');
rc=attr.definekey('vname');
rc=attr.definedata('vtype', 'vlen', 'mandatory');
rc=attr.definedone();
rc=attr.find();
endsub;
** Check for missing;
function isblank(name $);
return(missing(name));
endsub;
** Check for valid number;
function isnumber(name $);
return(if prxmatch('/^[+-]?((\d+(\.\d*)?)|(\.\d+))$/',trim(name)) eq 0 then 0 else 1);
endsub;
** Check for leap year;
function isleap(yr);
return ((mod(yr,4) = 0) and ((mod(yr,100) ne 0) or (mod(yr,400) = 0)));
endsub;
** Check for valid date;
function isdate(name $);
if isnumber(name) = 0 then return(0);
dt = input(name,best.);
if dt > 99999999 or dt < 15000000 then return(0);
y = int(dt/10000); m = mod(int(dt/100),100); d = mod(dt,100);
put y m d;
if y > 9999 or y < 1500 then return(0);
if m < 1 or m > 12 then return(0);
if d < 1 or d > 31 then return(0);
if m in (4,6,9,11) then return(d <= 30);
if m = 2 then if isleap(y) then return(d <= 29); else return(d <= 28);
return(1);
endsub;
run;
quit;
options cmplib=work.temp;
** Evaluate the input data and validate data;
data evaluate (drop=col: rc vtype vlen mandatory cvalue);
set indata;
array col $20. col1-col3 (&cols);
length vtype $10. failreason $32.;
do over col;
cvalue = vvaluex(col);
** the getattr function will give me the vtype vlen and mandatory for the given col;
call getattr(col,rc,vtype,vlen,mandatory);
if rc = 0 then do;
** some sample validation **;
if mandatory then do;
if isblank(cvalue) then do;
failreason = 'Missing '||col;
putlog cvalue ' is blank...';
continue;
end;
end;
if vtype = 'number' then do;
putlog 'cvalue = ' cvalue;
if not isnumber(cvalue) then do;
failreason = 'Invalid '||col;
continue;
end;
end;
if vtype = 'date' then do;
if not isdate(cvalue) then do;
failreason = 'Invalid '||col;
continue;
end;
end;
end;
end;
run; Thanks for all your assistance.
... View more