/* I would head in a direction like this ... */
/* This is a simplified example ... */

/* Create audited entities set: one row per entity (DUNS_ID) per audit year */
data audited_entities;
    input DUNS_ID $9. AUDITYEAR;
    datalines;
000323667 2001
000323667 2002
000323667 2003
067211318 2005
067211318 2006
067211318 2007
;
run;

/* Sort for counting and later merge.                                  */
/* DATA= is named explicitly so the step does not depend on whichever  */
/* data set happened to be created last (_LAST_).                      */
proc sort data=audited_entities;
    by DUNS_ID;
run;

/* Create a summary set by counting audits: one output row per DUNS_ID */
data audited_entities_summary;
    set audited_entities;
    by DUNS_ID;

    /* count will eventually be your new interval/ordinal target */
    if first.DUNS_ID then count = 0;
    count + 1;          /* sum statement: count is retained across rows automatically */

    /* audit flag will eventually be your new binary target */
    audit_flag = 1;

    /* Emit only the last row of each BY group, which carries the full count */
    if last.DUNS_ID then output;

    drop AUDITYEAR;
run;

/* Create a set for unaudited entities.                                   */
/* These entities have not been audited, so indicate that in the data by  */
/* setting both targets to zero as each record is read.                   */
data unaudited_entities;
    input DUNS_ID $9.;
    count      = 0;
    audit_flag = 0;
    datalines;
000353667
009567493
000874567
;
run;

/* Sort for later merge */
proc sort data=unaudited_entities;
    by DUNS_ID;
run;

/* Let's add in some other characteristics - just for example */
data entity_characteristics;
    input DUNS_ID $9. descriptive_var1 descriptive_var2 descriptive_var3;
    datalines;
000323667 1 2 3
067211318 4 5 6
000353667 7 8 9
009567493 10 11 12
000874567 13 14 15
;
run;

/* Sort for later merge */
proc sort data=entity_characteristics;
    by DUNS_ID;
run;

/* Create set suitable for predictive modeling by merging the            */
/* audited entities data set, the unaudited entities data set and their  */
/* characteristics.                                                      */
/*                                                                       */
/* count and audit_flag exist in both entity sets. In a match-merge the  */
/* data set listed later supplies the value of a shared variable, so the */
/* audited summary is listed AFTER the unaudited set: if an entity ever  */
/* appears in both (one row per DUNS_ID in each), its audit count and    */
/* flag are kept instead of being overwritten with zeros.                */
/*                                                                       */
/* NOTE: an entity present only in entity_characteristics ends up with   */
/* missing targets; filter or impute before modeling if that can occur.  */
data model_set (rename=(count=ordinal_target audit_flag=binary_target));
    merge unaudited_entities
          audited_entities_summary
          entity_characteristics;
    by DUNS_ID;
run;

/* Now ... I would try either logistic regression or decision tree on the binary target */
/* An alternative approach would be a Poisson regression on the ordinal target */
/* All these models are available in SAS/STAT or Enterprise Miner */
... View more