Hi All, I am working on modifying an existing SAS code to extract data from a text file and have output in an excel file. The code is working fine with the old input text file but however I am losing quite many observations when I give the new input text file.I have prior programming experience in Java-J2EE and PL-SQL and leveraging on my previous experience, I have modified the code slightly but as I am totally new with SAS, I am having problem comprehending the logic of the program. I would be really grateful if someone in this community could explain in few lines the logic of the code to me and likewise I can modify the program. I paste the code below for reference. Thanks in advance for your kind help. Regards, Archita libname Mylib "C:\Users\PhD\SAS\Workspace\Step1"; data WORK.test; %let _EFIERR_ = 0; infile 'C:\Users\PhD\SAS\Workspace\a.txt' delimiter='09'x MISSOVER DSD lrecl=32767 firstobs=1; informat VAR1 $15. ; informat VAR2 $200. ; informat VAR3 $14. ; informat VAR4 $13. ; informat VAR5 $16. ; informat VAR6 $22. ; format VAR1 $15. ; format VAR2 $200. ; format VAR3 $14. ; format VAR4 $13. ; format VAR5 $16. ; format VAR6 $22. ; input Var1 $ Var2 $ VAR3 $ VAR4 $ VAR5 $ VAR6 $ ; if _ERROR_ then call symput('_EFIERR_',1); keep var1 var2; run; proc import datafile="C:\Users\PhD\SAS\Workspace\country_list.xls" out = Mylib.country_list dbms = xls replace; run; options mprint merror symbolgen; %macro ETL_Phase1; data test1; set test; if var1 ~= '' then do; temp = var1; end; else do; var1 = temp; end; retain temp; drop temp; run; data test2; set test1; if var2 = '' then delete; run; data test3; set test2; if (upcase(compress(var1)) = "UPDATEDATE" or upcase(compress(var1)) = "GENERICNAME" or /*modified 15Mar2013*/ upcase(compress(var1)) = "LATESTPHASE" or upcase(compress(var1)) = "ACTIVEPROGRAM" or upcase(compress(var1)) = "COMPANY" or upcase(compress(var1)) = "CLASSDESCRIPTION" or /*modified 14Mar2013*/ upcase(compress(var1)) = "INDICATION" or upcase(compress(var1)) = "PATENTDATA" or /*upcase(compress(var1)) = "ATCPROFILES" or */ upcase(compress(var1)) = "ACTION" or upcase(compress(var1)) = "HISTORY"); run; data test4; set test3; retain count 0; if upcase(compress(var1)) = "UPDATEDATE" then count = count + 1; retain count; run; proc sql; create table template1 as select unique var1 from test4; create table template2 as select unique count from test4; create table template as select * from template1, template2; quit; proc sort data = template; by var1 count; proc sort data = test4; by var1 count; data finaldata; merge template test4; by var1 count; if strip(upcase(var1)) = "HISTORY" then temp = 1; else temp = 0; run; proc sort data = finaldata; by count temp; run; proc transpose data = Finaldata out = a_out1; var var1 var2; by count; run; /* Capture the number of variables in the dataset into a macro variable - nvars */ %let dsid = %sysfunc(open(work.a_out1, i)); %let nvars = %sysfunc(attrn(&dsid, nvars)); %if &dsid > 0 %then %let rc=%sysfunc(close(&dsid)); data Mylib.a_out1; array cols (%eval(&nvars-2)) $200. col1-col%eval(&nvars-2); array histories (%eval(&nvars-2)) $200. history1-history%eval(&nvars-2); set a_out1(where=(strip(upcase(_name_))="VAR2")); /*atc_Profiles = col1;*/ action = col1;/*modified 14 March 2013*/ active_program = col2;/*modified 14 March 2013*/ company= col3;/*modified 14 March 2013*/ product_name = col4;/*modified 14 March 2013*/ indication = col5;/*modified 14 March 2013*/ latest_phase = col6;/*modified 14 March 2013*/ patent_data = col7;/*modified 14 March 2013*/ update_date= col8;/*modified 14 March 2013*/ classdescription=col9;/*modified 14 March 2013*/ do i = 1 to %eval(&nvars-2); if i>=11 then histories(i-10) = cols(i); end; drop col1-col%eval(&nvars-2) _name_ i; run; /* Obtain the country list into a macro variable delimited by ":" */ proc sql noprint; select country into : country_list separated by ':' from Mylib.Country_list; quit; /* Extract data */ data Mylib.A; set Mylib.a_out1(obs=350); retain agree_check agree_term_check agree_no_check; retain precli_check; /* C002 */ retain phase1_check phase1a_check phase1b_check phase2_check phase3_check phase1_2_check phase2_3_check phase1_3_check; retain marketed_check registered_check marketed_usa_check registered_usa_check; retain not_reco_check not_reco_usa_check orphan_drug_check fasttrack_check; retain country_list "&country_list"; format agree_1-agree_10 date9.; format agree_term_1-agree_term_10 date9.; format phase1_1-phase1_5 date9.; format phase2_1-phase2_5 date9.; format phase3_1-phase3_5 date9.; format phase1a_1-phase1a_5 date9.; format phase1b_1-phase1b_5 date9.; format phase1_2_1-phase1_2_5 date9.; format phase2_3_1-phase2_3_5 date9.; format phase1_3_1-phase1_3_5 date9.; format marketed date9.; format registered date9.; format marketed_usa date9.; format registered_usa date9.; format orphan_drug date9.; format fasttrack date9.; format not_reco_1-not_reco_5 date9.; format not_reco_usa date9.; format country1-country50 $30.; array histories (%eval(&nvars-2)) $200. history1-history%eval(&nvars-2);/*modified nvars-12 14 March 2013*/ array countries (%eval(&nvars-2)) $200. country1-country%eval(&nvars-2);/*modified nvars-12 14 March 2013*/ array agree_ (10) agree_1-agree_10; array agree_names (10) $200. agree_name1-agree_name10; array agree_termins (10) $200. agree_termin1-agree_termin10; array agree_term_ (10) agree_term_1-agree_term_10; array phase1_ (5) phase1_1-phase1_5; array phase1a_ (5) phase1a_1-phase1a_5; array phase1b_ (5) phase1b_1-phase1b_5; array phase2_ (5) phase2_1-phase2_5; array phase3_ (5) phase3_1-phase3_5; array phase1_2_ (5) phase1_2_1-phase1_2_5; array phase2_3_ (5) phase2_3_1-phase2_3_5; array phase1_3_ (5) phase1_3_1-phase1_3_5; array not_reco (5) not_reco_1-not_reco_5; array country (50) $30. country1-country50; if _n_ = 1 then do; agree_check = prxparse("/(agreement)|(collaboration)|(collaborative)|(co\-promotion)|(option)|(licensed to)/i"); agree_term_check = prxparse("/(No longer in development with originator)|(terminated)/i"); /* C001 */ agree_no_check = prxparse("/(extended)|(amended)|(revision of)/i"); phase1_check = prxparse("/phase I[^abI\/]/i"); phase1a_check = prxparse("/phase Ia/i"); phase1b_check = prxparse("/phase Ib/i"); phase2_check = prxparse("/phase II[^\/I]/i"); phase3_check = prxparse("/phase III[^\/]/i"); phase1_2_check = prxparse("/phase I\/II/i"); phase2_3_check = prxparse("/phase II\/III/i"); phase1_3_check = prxparse("/phase I\/III/i"); marketed_check = prxparse("/marketed/i"); registered_check = prxparse("/registered/i"); marketed_usa_check = prxparse("/marketed\,\s?usa/i"); registered_usa_check = prxparse("/registered\,\s?usa/i"); not_reco_check = prxparse("/not recommended/i"); not_reco_usa_check = prxparse("/not recommended\,\s?usa/i"); orphan_drug_check = prxparse("/orphan\s?drug/i"); fasttrack_check = prxparse("/fast\s?track/i"); precli_check = prxparse("/preclinical/i"); end; agree_cnt = 1; agree_term_cnt = 1; phase1_cnt = 1; phase1a_cnt = 1; phase1b_cnt = 1; phase2_cnt = 1; phase3_cnt = 1; phase1_2_cnt = 1; phase2_3_cnt = 1; phase1_3_cnt = 1; not_reco_cnt = 1; /* marketed_cnt=1; registered_cnt=1; marketed_usa_cnt=1; registered_usa_cnt=1;*/ do i = 1 to dim(histories); /* Agreement */ if (prxmatch(agree_check, histories(i)) and not prxmatch(agree_term_check, histories(i)) and not prxmatch(agree_no_check, histories(i))) then do; if agree_cnt <= 10 then do; len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then agree_(agree_cnt) = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then agree_(agree_cnt) = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); agree_names(agree_cnt) = histories(i); end; agree_cnt = agree_cnt + 1; end; /* Agreement Terminated */ if prxmatch(agree_term_check, histories(i)) then do; if agree_term_cnt <= 10 then do; len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then agree_term_(agree_term_cnt) = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then agree_term_(agree_term_cnt) = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); agree_termins(agree_term_cnt) = histories(i); end; agree_term_cnt = agree_term_cnt + 1; end; /* Phase I */ if prxmatch(phase1_check, histories(i)) and not prxmatch(phase2_check, histories(i)) then do; if phase1_cnt <= 5 then do; len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then phase1_(phase1_cnt) = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then phase1_(phase1_cnt) = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); end; phase1_cnt = phase1_cnt + 1; end; /* Phase Ia */ if prxmatch(phase1a_check, histories(i)) then do; if phase1a_cnt <= 5 then do; len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then phase1a_(phase1a_cnt) = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then phase1a_(phase1a_cnt) = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); end; phase1a_cnt = phase1a_cnt + 1; end; /* Phase Ib */ if prxmatch(phase1b_check, histories(i)) then do; if phase1b_cnt <= 5 then do; len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then phase1b_(phase1b_cnt) = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then phase1b_(phase1b_cnt) = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); end; phase1b_cnt = phase1b_cnt + 1; end; /* Phase II */ if prxmatch(phase2_check, histories(i)) and not prxmatch(phase3_check, histories(i)) then do; if phase2_cnt <= 5 then do; len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then phase2_(phase2_cnt) = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then phase2_(phase2_cnt) = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); end; phase2_cnt = phase2_cnt + 1; end; /* Phase III */ if prxmatch(phase3_check, histories(i)) then do; if phase3_cnt <= 5 then do; len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then phase3_(phase3_cnt) = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then phase3_(phase3_cnt) = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); end; phase3_cnt = phase3_cnt + 1; end; /* Phase I/II */ if prxmatch(phase1_2_check, histories(i)) then do; if phase1_2_cnt <= 5 then do; len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then phase1_2_(phase1_2_cnt) = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then phase1_2_(phase1_2_cnt) = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); end; phase1_2_cnt = phase1_2_cnt + 1; end; /* Phase II/III */ if prxmatch(phase2_3_check, histories(i)) then do; if phase2_3_cnt <= 5 then do; len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then phase2_3_(phase2_3_cnt) = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then phase2_3_(phase2_3_cnt) = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); end; phase2_3_cnt = phase2_3_cnt + 1; end; /* Phase I/III */ if prxmatch(phase1_3_check, histories(i)) then do; if phase1_3_cnt <= 5 then do; len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then phase1_3_(phase1_3_cnt) = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then phase1_3_(phase1_3_cnt) = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); end; phase1_3_cnt = phase1_3_cnt + 1; end; /* Marketed */ if prxmatch(marketed_check, histories(i)) and not prxmatch(marketed_usa_check, histories(i)) then do; /* if marketed_cnt <= 1 then do;*/ len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then marketed = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then marketed = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); /* end; */ /* marketed_cnt = marketed_cnt + 1; */ end; /* Marketed USA */ if prxmatch(marketed_usa_check, histories(i)) then do; /* if marketed_usa_cnt <= 1 then do;*/ len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then marketed_usa = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then marketed_usa = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); /* end; */ /* marketed_usa_cnt = marketed_usa_cnt + 1;*/ end; /* registered */ if prxmatch(registered_check, histories(i)) and not prxmatch(registered_usa_check, histories(i)) then do; /* if registered_cnt <= 1 then do; */ len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then registered = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then registered = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); /* end; */ /* registered_cnt = registered_cnt + 1;*/ end; /* registered USA */ if prxmatch(registered_usa_check, histories(i)) then do; /* if registered_usa_cnt <= 1 then do; */ len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then registered_usa = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then registered_usa = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); /* end;*/ /* registered_usa_cnt = registered_usa_cnt + 1;*/ end; /* Not Recommended */ if prxmatch(not_reco_check, histories(i)) and not prxmatch(not_reco_usa_check, histories(i)) then do; if not_reco_cnt <= 5 then do; len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then not_reco(not_reco_cnt) = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then not_reco(not_reco_cnt) = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); end; not_reco_cnt = not_reco_cnt + 1; end; /* Not Recommended USA */ if prxmatch(not_reco_usa_check, histories(i)) then do; /* if not_reco_cnt <= 1 then do; */ len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then not_reco_usa = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then not_reco_usa = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); /* end;*/ /* not_reco_cnt = not_reco_cnt + 1; */ end; /* Orphan Drug */ if prxmatch(orphan_drug_check, histories(i)) then do; /* if not_reco_cnt <= 1 then do; */ len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then orphan_drug = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then orphan_drug = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); /* end;*/ /* not_reco_cnt = not_reco_cnt + 1; */ end; /* Fast Track */ if prxmatch(fasttrack_check, histories(i)) then do; /* if not_reco_cnt <= 1 then do; */ len = length(compress(scan(histories(i), 1, ':'))); if len = 7 then fasttrack = input(('01' || compress(scan(histories(i), 1, ':'))), date9.); else if len = 4 then fasttrack = input(('1306' || compress(scan(histories(i), 1, ':'))), ddmmyy8.); /* end; */ /* not_reco_cnt = not_reco_cnt + 1; */ end; /* To identify unique countries in a given project */ /* Country search would happen only if the history part of the information contains: preclinical, phase I, phase II and phase III phases (C003) */ if (prxmatch(precli_check, histories(i)) or prxmatch(phase1_check, histories(i)) or prxmatch(phase1a_check, histories(i)) or prxmatch(phase1b_check, histories(i)) or prxmatch(phase2_check, histories(i)) or prxmatch(phase3_check, histories(i)) or prxmatch(phase1_2_check, histories(i)) or prxmatch(phase2_3_check, histories(i)) or prxmatch(phase1_3_check, histories(i))) then do; j = 1; do while(scan(country_list, j, ":") ~= ""); /* loop for every country in the list */ curr_country = scan(country_list, j, ":"); /* Step to avoid any other substring matches with abbreviations like USA, UK or EU */ if strip(upcase(curr_country)) not in ("USA", "UK", "EU") then curr_history = upcase(strip(histories(i))); else curr_history = strip(histories(i)); /* A country is found in the history */ if index(strip(curr_history), upcase(strip(curr_country))) ~= 0 then do; k = 1; found = 0; /* Check the list of already found countries to see if this country has already occured. If so, then ignore, otherwise increase the count of countries found. Also add the current country to the list of found countries. */ do while(country(k) ~= "" and found ~= 1); if upcase(strip(country(k))) = upcase(strip(curr_country)) then found = 1; k = k + 1; end; if found = 0 then do; country(k) = upcase(strip(curr_country)); total_countries = k; end; end; j = j + 1; end; /* loop for every country in the list ends */ end; end; /* Filter on Phases for Country Search (C003 ends) */ drop i len; drop agree_check agree_term_check agree_no_check; drop precli_check; /* C002 ends */ drop phase1_check phase1a_check phase1b_check phase2_check phase3_check phase1_2_check phase2_3_check phase1_3_check; drop marketed_check registered_check marketed_usa_check registered_usa_check; drop not_reco_check not_reco_usa_check orphan_drug_check fasttrack_check; drop agree_cnt agree_term_cnt phase1_cnt phase1a_cnt phase1b_cnt phase2_cnt phase3_cnt phase1_2_cnt phase2_3_cnt phase1_3_cnt not_reco_cnt; drop history1-history%eval(&nvars-10); /*modified nvars-12 15 March 2013*/ drop j k curr_country country_list curr_history found; run; %mend ETL_Phase1; %ETL_Phase1;
... View more