Solved: Re: Devide the dataset into fixed observations

kumarK · Posted 03-25-2017 12:03 AM

Hi Guys,

can you please suggest how to optimize this code

data have;
infile cards;
input id count time $8. sequence$;
cards;
101 30 weekly 1a
102 20 weekly 1a
103 10 weekly 1a
104 37 weekly 1a
105 27 weekly 1a
106 17 weekly 1a
107 17 weekly 1a
108 39 Monthly 2a
109 33 Monthly 2a
110 41 Monthly 2a
111 23 Monthly 2a
112 25 Monthly 2a
113 36 Monthly 2a
114 25 Monthly 2a
115 28 Monthly 2a
116 25 Monthly 2a
117 35 Monthly 2a
118 26 Monthly 2b
119 35 Monthly 2b
120 41 Monthly 2b
121 37 Monthly 2b
122 24 Monthly 2b
123 49 Monthly 2b
124 28 Monthly 2b
run;

data week month;
set have;
if time='weekly' then output week;
if time='Monthly' then output month;
run;

proc sql noprint;
select count(id) into:obs from week;
quit;

data _null_;
set week;
if &obs > 5 then call symput('w_obs',ceil(&obs/5));
if &obs <=5 then call symput('w_obs',1);
run;

%put &obs;
%put &w_obs;

data d1;
set week(obs=&w_obs);
run;

proc sql outobs=&w_obs;
create table d2 as
select * from week
where id not in (select distinct id from d1);
quit;

proc sql outobs=&w_obs;
create table d3 as
select * from week
where id not in (select distinct id from d1) and id not in (select distinct id from d2);
quit;

like this iwant to create d5. so the week dataset will devide into d1 d2 d3 d4 d5 with undique id's

and month dataset also will devide the dataset into equal or fixed obs;

Thanks

Patrick · Posted 03-26-2017 07:18 PM

I've been referring to the simplified code version I've posted as there the change is really simple. See below:


data want_week;
  _full_buckets=mod(_nobs,5);
  if _full_buckets=0 then _full_buckets=5;
  _rows_in_full_buckets=ceil(_nobs/5);
  _currow=1;

  do _bucket =1 to 5;
    do _point=_currow to (_currow + ifn(_bucket >_full_buckets,_rows_in_full_buckets-2,_rows_in_full_buckets-1,0));
      set week nobs=_nobs point=_point;
      indicator='d'||put(_bucket,f1.);
      output;
    end;
    _currow= _point;
  end;
  
  drop _:;
  stop;
run;


data want_month;
  _full_buckets=mod(_nobs,5);
  if _full_buckets=0 then _full_buckets=5;
  _rows_in_full_buckets=ceil(_nobs/5);
  _currow=1;

  do _bucket =1 to 5;
    do _point=_currow to (_currow + ifn(_bucket >_full_buckets,_rows_in_full_buckets-2,_rows_in_full_buckets-1,0));
      set month nobs=_nobs point=_point;
      indicator='m'||put(_bucket,f1.);
      output;
    end;
    _currow= _point;
  end;
  
  drop _:;
  stop;
run;

For the "complex" code version: The change you've tried clearly shows that you haven't understood yet how it works. The hash table in the code is only required if you want to write the results to different output tables and you don't know in advance which/how many output tables you'll have.

Using the hash output() method is as far as I know the only way how you can create new SAS tables dynamically during SAS data step execution - but as you now only want to write the results to a single output table using the hash object is no longer required.

View solution in original post

lakshmi_74 · Posted 03-25-2017 01:22 AM

Hopefully below code works for you.

data week month;
set have;
if (time eq 'weekly') then output week;
else if (time eq 'Monthly') then output month;
run;
proc sql;
select count(id) into :obs_week from week;
select count(id) into :obs_month from month;
quit;
data d1 d2 d3 d4 d5;
set week;
chk_seq=mod(_n_,5);
select;
when(chk_seq=1) output d1;
when(chk_seq=2) output d2;
when(chk_seq=3) output d3;
when(chk_seq=4) output d4;
otherwise output d5;
end;
drop chk_seq;
run;

data m1 m2 m3 m4 m5 m6;
set month;
chk_seq=mod(_n_,6);
select;
when(chk_seq=1) output m1;
when(chk_seq=2) output m2;
when(chk_seq=3) output m3;
when(chk_seq=4) output m4;
when(chk_seq=5) output m5;
otherwise output m6;
end;
drop chk_seq;
run;

ArtC · Posted 03-25-2017 03:05 AM

The solution by @lakshmi_74 works nicely, but we can still make it a bit more efficient. First the format for TIME needs to be removed to correctly read in the data. Second the macro variables are not used and the SQL step can be eliminated. Finally the two DATA steps can be combined to eliminate another pass of the data. The slightly modified code by @lakshmi_74 is:

data d1 d2 d3 d4 d5
     m1 m2 m3 m4 m5 m6;
   set have;
   drop wcnt wseq mcnt mseq;
   if (time eq 'weekly') then do;
      wcnt+1;
      wseq= mod(wcnt,5);
      select;
         when(wseq=1) output d1;
         when(wseq=2) output d2;
         when(wseq=3) output d3;
         when(wseq=4) output d4;
         otherwise output d5;
      end;
   end;
   else if (time eq 'Monthly') then do;
      mcnt+1;
      mseq=mod(mcnt,6);
      select;
         when(mseq=1) output m1;
         when(mseq=2) output m2;
         when(mseq=3) output m3;
         when(mseq=4) output m4;
         when(mseq=5) output m5;
         otherwise output m6;
      end;
   end;
   run;

These solutions assume that the observations assigned to the subsets do not need to be selected either randomly or sequentially.

As an aside, PROC SURVEYSELECT is a nice procedure for breaking up data sets into subsets under various criteria.

kumarK · Posted 03-25-2017 05:40 AM

Thanks for your reply. i wnat the observations pick row wise inyour code its picking based on reminder. could you please suggest.

kumarK · Posted 03-25-2017 05:36 AM

Thanks Laxmi. Actually i want to pick the obervations in row wise. in d1 i wnat first 2 rows then in d2 next 2 rows like that. your code picks the vaues based on reminder .can you please suggest how to pick the obs in sequence wise.

Patrick · Posted 03-25-2017 08:13 AM

Does the following work for you? Else: Please provide a data step creating the desired output so we understand what you're after.

data have;
  infile cards;
  input id count time :$8. sequence$;
  cards;
101 30 weekly 1a
102 20 weekly 1a
103 10 weekly 1a
104 37 weekly 1a
105 27 weekly 1a
106 17 weekly 1a
107 17 weekly 1a
108 39 Monthly 2a
109 33 Monthly 2a
110 41 Monthly 2a
111 23 Monthly 2a
112 25 Monthly 2a
113 36 Monthly 2a
114 25 Monthly 2a
115 28 Monthly 2a
116 25 Monthly 2a
117 35 Monthly 2a
118 26 Monthly 2b
119 35 Monthly 2b
120 41 Monthly 2b
121 37 Monthly 2b
122 24 Monthly 2b
123 49 Monthly 2b
124 28 Monthly 2b
;
run;

data week month;
  set have;
  if time='weekly' then output week;
  else if time='Monthly' then output month;
run;

%macro bucketIT (inds,buckets,outdsFix);
  data _null_;
    
    if 0 then set &inds;
    dcl hash h1(dataset:"&inds(obs=0)",ordered:'y');
    h1.defineKey('id');
    h1.defineData(all:'y');
    h1.defineDone();
   
    _full_buckets=mod(_nobs,&buckets);
    if _full_buckets=0 then _full_buckets=&buckets;
    _rows_in_full_buckets=ceil(_nobs/&buckets);
    _currow=1;

    do _bucket =1 to &buckets;

      do _point=_currow to (_currow + ifn(_bucket >_full_buckets,_rows_in_full_buckets-2,_rows_in_full_buckets-1,0));
        set &inds nobs=_nobs point=_point;
        h1.add();
        put _all_;
      end;

      _currow= _point;
      h1.output(dataset:cats("&outdsFix",put(_bucket,f3.0)));
      h1.clear();

    end;

    stop;

  run;
%mend;

%bucketIT(week,5,d)
%bucketIT(month,5,m)

rogerjdeangelis · Posted 03-25-2017 10:42 AM

Advantages to this method

 1. Potential to be much faster because hast table much less likely to be cleared
    between invocations. Operating system sees one address space.
 2. Hash table should not be reloaded (if DOSUBL is compilation is smart)
 3. If week fails you can stop and decide if you want to run month (use pop up window?)
 4. You can compile and store the  program.
 5. No macros

Howvever it looks like the current
implementation of dosubl is no faster
then separate address spaces.
I would use it anyway because of SAS future enhancements.

data have;
  infile cards;
  input id count time :$8. sequence$;
cards4;
101 30 weekly 1a
102 20 weekly 1a
103 10 weekly 1a
104 37 weekly 1a
105 27 weekly 1a
106 17 weekly 1a
107 17 weekly 1a
108 39 Monthly 2a
109 33 Monthly 2a
110 41 Monthly 2a
111 23 Monthly 2a
112 25 Monthly 2a
113 36 Monthly 2a
114 25 Monthly 2a
115 28 Monthly 2a
116 25 Monthly 2a
117 35 Monthly 2a
118 26 Monthly 2b
119 35 Monthly 2b
120 41 Monthly 2b
121 37 Monthly 2b
122 24 Monthly 2b
123 49 Monthly 2b
124 28 Monthly 2b
;;;;
run;quit;

data week month;
  set have;
  if time='weekly' then output week;
  else if time='Monthly' then output month;
run;quit;

data _null_;

 array indsx[2]    $5    ("week","month");
 array outdsFixx[2] $1   ("d","m");

 do i=1 to 2;
   call symputx("inds",indsx[i]);
   call symputx("outdsFix",outdsFixx[i]);
   call symputx("buckets","5");
  rc=dosubl('
   data _null_;
    if 0 then set &inds;
    dcl hash h1(dataset:"&inds(obs=0)",ordered:"y");
    h1.defineKey("id");
    h1.defineData(all:"y");
    h1.defineDone();

    _full_buckets=mod(_nobs,&buckets);
    if _full_buckets=0 then _full_buckets=&buckets;
    _rows_in_full_buckets=ceil(_nobs/&buckets);
    _currow=1;
    do _bucket =1 to &buckets;
      do _point=_currow to (_currow + ifn(_bucket >_full_buckets
         ,_rows_in_full_buckets-2,_rows_in_full_buckets-1,0));
        set &inds nobs=_nobs point=_point;
        h1.add();
        put _all_;
      end;
      _currow= _point;
      h1.output(dataset:cats("&outdsFix",put(_bucket,f3.0)));
      h1.clear();
    end;
    stop;
   run;quit;
   ');
 end;

 if rc then stop; * stop if month buckets fails;

run;quit;

kumarK · Posted 03-25-2017 02:31 PM

Thanks for the reply.

I am getting this error.

ERROR 68-185: The function DOSUBL is unknown, or cannot be accessed.

Also i am not understing what's happeing in the code. could you please explian or comment out the code.

Actualy this is just sample data i need to change the code according to my original data. where the number of obs in not fixed i think in this code its taking 2 obs as fixed rows correct me if i was wrong

kumarK · Posted 03-25-2017 02:39 PM

Thanks for the code. This is working fine.

however i am not understanding whats happening in the code. is there any approch with out using hash tables.

if you dont mind can you please comment the code whats happeing by block wise Thanks for your time

Patrick · Posted 03-25-2017 07:43 PM

@kumarK

Using a hash table and a macro just adds a bit of flexibility. Below a simplified and scaled down version for exactly your number of 5 buckets.

In using below code version it should be easier for you to understand what I've added in the hash/macro approach which then can deal dynamically with different numbers of buckets and different names for output data sets.

data have;
  infile cards;
  input id count time :$8. sequence$;
  cards;
101 30 weekly 1a
102 20 weekly 1a
103 10 weekly 1a
104 37 weekly 1a
105 27 weekly 1a
106 17 weekly 1a
107 17 weekly 1a
108 39 Monthly 2a
109 33 Monthly 2a
110 41 Monthly 2a
111 23 Monthly 2a
112 25 Monthly 2a
113 36 Monthly 2a
114 25 Monthly 2a
115 28 Monthly 2a
116 25 Monthly 2a
117 35 Monthly 2a
118 26 Monthly 2b
119 35 Monthly 2b
120 41 Monthly 2b
121 37 Monthly 2b
122 24 Monthly 2b
123 49 Monthly 2b
124 28 Monthly 2b
;
run;

data week month;
  set have;
  if time='weekly' then output week;
  else if time='Monthly' then output month;
run;


data d1 d2 d3 d4 d5;
  _full_buckets=mod(_nobs,5);
  if _full_buckets=0 then _full_buckets=5;
  _rows_in_full_buckets=ceil(_nobs/5);
  _currow=1;

  do _bucket =1 to 5;
    do _point=_currow to (_currow + ifn(_bucket >_full_buckets,_rows_in_full_buckets-2,_rows_in_full_buckets-1,0));
      set week nobs=_nobs point=_point;
      select(_bucket);
        when(1) output d1;
        when(2) output d2;
        when(3) output d3;
        when(4) output d4;
        when(5) output d5;
        otherwise; 
      end;
    end;
    _currow= _point;
  end;
  
  drop _:;
  stop;
run;


data m1 m2 m3 m4 m5;
  _full_buckets=mod(_nobs,5);
  if _full_buckets=0 then _full_buckets=5;
  _rows_in_full_buckets=ceil(_nobs/5);
  _currow=1;

  do _bucket =1 to 5;
    do _point=_currow to (_currow + ifn(_bucket >_full_buckets,_rows_in_full_buckets-2,_rows_in_full_buckets-1,0));
      set month nobs=_nobs point=_point;
      select(_bucket);
        when(1) output m1;
        when(2) output m2;
        when(3) output m3;
        when(4) output m4;
        when(5) output m5;
        otherwise; 
      end;
    end;
    _currow= _point;
  end;
  
  drop _:;
  stop;
run;

kumarK · Posted 03-26-2017 04:09 AM

Thanks for the code.

i am thinking that if we can add one column indicator in have dataset itslef instead of deviding then its easy to break as required.

like this. sorry if i am asking too much info in this.

	DAY INDICATOR
101 30 weekly 1a	d1
102 20 weekly 1a	d1
103 10 weekly 1a	d2
104 37 weekly 1a	d2
105 27 weekly 1a	d3
106 17 weekly 1a	d4
107 17 weekly 1a	d5

Patrick · Posted 03-26-2017 05:02 AM

@kumarK

Sure. Just modify the code I've posted by populating a column with a value instead of writing the row to an output table and you're done.

kumarK · Posted 03-26-2017 05:13 AM

HI i have tried to change the below like this but its not giving any outut. Could you please check and suggest.

%macro bucketIT (inds,buckets,outdsFix);
data Want;

if 0 then set &inds;
dcl hash h1(dataset:"&inds(obs=0)",ordered:'y');
h1.defineKey('id');
h1.defineData(all:'y');
h1.defineDone();

_full_buckets=mod(_nobs,&buckets);
if _full_buckets=0 then _full_buckets=&buckets;
_rows_in_full_buckets=ceil(_nobs/&buckets);
_currow=1;

do _bucket =1 to &buckets;

do _point=_currow to (_currow + ifn(_bucket >_full_buckets,_rows_in_full_buckets-2,_rows_in_full_buckets-1,0));
set &inds nobs=_nobs point=_point;
h1.add();
put _all_;
end;

_currow= _point;
Indicator = cats("&outdsFix",put(_bucket,f3.0)) ; /* i have added this indicator */
/* h1.output(dataset:cats("&outdsFix",put(_bucket,f3.0)));*/
h1.clear();

end;

stop;

run;
%mend;

%bucketIT(week,5,d)
%bucketIT(month,5,m)

Patrick · Posted 03-26-2017 07:18 PM

I've been referring to the simplified code version I've posted as there the change is really simple. See below:


data want_week;
  _full_buckets=mod(_nobs,5);
  if _full_buckets=0 then _full_buckets=5;
  _rows_in_full_buckets=ceil(_nobs/5);
  _currow=1;

  do _bucket =1 to 5;
    do _point=_currow to (_currow + ifn(_bucket >_full_buckets,_rows_in_full_buckets-2,_rows_in_full_buckets-1,0));
      set week nobs=_nobs point=_point;
      indicator='d'||put(_bucket,f1.);
      output;
    end;
    _currow= _point;
  end;
  
  drop _:;
  stop;
run;


data want_month;
  _full_buckets=mod(_nobs,5);
  if _full_buckets=0 then _full_buckets=5;
  _rows_in_full_buckets=ceil(_nobs/5);
  _currow=1;

  do _bucket =1 to 5;
    do _point=_currow to (_currow + ifn(_bucket >_full_buckets,_rows_in_full_buckets-2,_rows_in_full_buckets-1,0));
      set month nobs=_nobs point=_point;
      indicator='m'||put(_bucket,f1.);
      output;
    end;
    _currow= _point;
  end;
  
  drop _:;
  stop;
run;

For the "complex" code version: The change you've tried clearly shows that you haven't understood yet how it works. The hash table in the code is only required if you want to write the results to different output tables and you don't know in advance which/how many output tables you'll have.

Using the hash output() method is as far as I know the only way how you can create new SAS tables dynamically during SAS data step execution - but as you now only want to write the results to a single output table using the hash object is no longer required.

kumarK · Posted 03-27-2017 01:16 PM

@Patrick Thanks alot.

Registration is open

SAS Training: Just a Click Away