I have tried the challenge. Here is the test code. data test; set test.earthquakes_dirty; format DATE_TIME datetime19.; ID=input(scan(id_regioncode,1,"-"),best12.); REGION_CODE=input(scan(id_regioncode,2,"-"),best12.); FLAG_TSUNAMI=upcase(FLAG_TSUNAMI); /*if missing(day) then day=1;*/ /* Previously i have used imputation so that if atleast if year is non missing */ /*if missing(month) then month=1;*/ /* then we will get the dates */ /*if n(month,day,year,hour,minute,seconds)=6 then*/ DATE_TIME=dhms(mdy(month,day,year),hour,minute,seconds); /*else DATE_TIME=dhms(mdy(month,day,year),0,0,0);*/ format EQ_PRIMARY 4.1; EQ_PRIMARY=coalesce(EQ_MAG_MW,EQ_MAG_MS,EQ_MAG_MB,EQ_MAG_ML,EQ_MAG_MFA,EQ_MAG_UNK); drop id_regioncode month day year hour minute seconds EQ_MAG_MW EQ_MAG_MS EQ_MAG_MB EQ_MAG_ML EQ_MAG_MFA EQ_MAG_UNK; run; proc sort data=test; by ID; run; proc sort data=test out=test2 nodupkey dupout=dup_rec(keep=ID); by ID; run; data inv_recs earthquakes_valid dup_rec1; merge test(in=a) dup_rec(in=b); by id; if a and b then output dup_rec1; else do; if REGION_CODE in (1, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90,100, 110, 120, 130, 140, 150, 160, 170) and FLAG_TSUNAMI in (" ", "TSU") and not missing(DATE_TIME) and (0 <= EQ_PRIMARY <= 9.9) and (0 <= FOCAL_DEPTH <= 700) then do; output earthquakes_valid; end; else do; output inv_recs; end; end; run; data invalid; set inv_recs dup_rec1(in=a); if a then var0="DUPLICATE_ID"; if REGION_CODE not in (1, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90,100, 110, 120, 130, 140, 150, 160, 170) then var1="REGION_CODE"; if FLAG_TSUNAMI not in (" ", "TSU") then var2="FLAG_TSUNAMI"; if missing(DATE_TIME) then var3="DATE_TIME"; if EQ_PRIMARY = . or EQ_PRIMARY > 9.9 then var4="EQ_PRIMARY"; if FOCAL_DEPTH < 0 or FOCAL_DEPTH > 700 then var5="FOCAL_DEPTH"; INVALID_DESCRIPTION=catx(', ',var0,var1,var2,var3,var4,var5); drop var:; run;
... View more