I have a table "a" with the following data
data A;
informat datetime datetime21.;
input info $7. datetime Nr ;
format datetime datetime21.;
cards;
GC_5_0 30APR2020:00:30:00 1
GC_5_2 29JAN2016:16:00:00 2
GC_50_4 24NOV2010:21:00:00 3
GC_5_1 28OCT2003:13:35:00 4
GC_8_1 01AUG2016:11:20:00 5
GC_5_3 09AUG2016:18:10:00 6
GC_10_2 06SEP2019:06:40:00 7
GC_5_0 14MAY2015:00:20:00 8
GC_15_2 27NOV2006:09:45:00 9
GC_5_2 06FEB2019:12:55:00 10
;
In this case, 3 big data sets: GC_5_0, GC_5_1, GC_5_2 (variable info from table A), each approximately 1 GB.
example:
data GC_5_0;
informat datetime datetime21.;
input datetime high_temp low_temp end_temp;
format datetime datetime21.;
cards;
30APR2020:00:00:00 31.3 25.6 29.4
30APR2020:00:00:30 29.9 28.4 29.2
30APR2020:00:01:00 29.5 28.9 29.2
30APR2020:00:01:30 29.5 28.6 29
30APR2020:00:02:00 30 28.8 29.9
30APR2020:00:02:30 30.3 29.5 29.9
30APR2020:00:03:00 30.3 29.2 29.9
30APR2020:00:03:30 30.3 29 30.1
30APR2020:00:04:00 30.2 29.3 29.3
30APR2020:00:04:30 30.6 29.4 29.4
30APR2020:00:05:00 30.8 29.6 29.6
30APR2020:00:05:30 31 30 30.3
30APR2020:00:06:00 31 29.7 30.2
30APR2020:00:06:30 31 29.7 30.5
30APR2020:00:07:00 30.4 29.4 29.6
30APR2020:00:07:30 30 27.6 28.4
30APR2020:00:08:00 29.6 28.1 28.9
30APR2020:00:08:30 30.2 28.6 29.4
30APR2020:00:09:00 30 28.9 29.3
30APR2020:00:09:30 30.5 28.4 28.4
30APR2020:00:10:00 31.7 30.9 31.2
30APR2020:00:10:30 32 30.6 30.6
30APR2020:00:11:00 31.4 29.7 30.8
30APR2020:00:11:30 31.1 29.3 29.8
30APR2020:00:12:00 29.4 28.6 29
30APR2020:00:12:30 29.4 28.2 29
30APR2020:00:13:00 30.4 28.7 30.4
30APR2020:00:13:30 31.7 29.5 30.8
30APR2020:00:14:00 33.8 31.8 33
30APR2020:00:14:30 33 30.5 31.2
30APR2020:00:15:00 31.2 30.4 30.4
30APR2020:00:15:30 30.5 30 30
30APR2020:00:16:00 30.6 29 30.4
30APR2020:00:16:30 30 28.9 29.7
30APR2020:00:17:00 29.9 29.3 29.5
30APR2020:00:17:30 30.6 29.3 29.3
30APR2020:00:18:00 30.2 29.3 29.3
30APR2020:00:18:30 29.6 28.5 28.5
30APR2020:00:19:00 29.6 27.9 28.4
30APR2020:00:19:30 29.4 28.5 28.5
30APR2020:00:20:30 28.8 28 28
30APR2020:00:21:00 31.1 24.2 31
30APR2020:00:21:30 31.1 29.6 31.1
30APR2020:00:22:00 30.7 29.7 30.2
30APR2020:00:22:30 30.4 29.7 29.7
30APR2020:00:23:00 30.4 29.5 30.3
30APR2020:00:23:30 30.5 29.4 30.5
30APR2020:00:24:00 31 29.1 30.6
30APR2020:00:24:30 30.4 29.3 30.4
30APR2020:00:25:00 31.9 29.9 30.6
30APR2020:00:25:30 31.6 30.2 31.2
30APR2020:00:26:00 31 29.6 29.6
30APR2020:00:26:30 30.6 28.4 28.4
30APR2020:00:27:00 30.7 29.7 30.4
30APR2020:00:27:30 30.8 29.7 30.4
30APR2020:00:28:00 31.2 30 30.7
30APR2020:00:29:00 31 29.5 30.1
30APR2020:00:29:30 30.4 29.4 29.7
30APR2020:00:30:00 30.3 28.7 30.3
30APR2020:00:30:30 30.1 29.6 30.1
30APR2020:00:31:00 30 28.6 28.7
30APR2020:00:31:30 29.9 28.1 28.1
30APR2020:00:32:00 30.3 29.6 29.6
30APR2020:00:32:30 30.7 29.2 29.8
30APR2020:00:33:00 29.4 28.5 28.5
30APR2020:00:33:30 29.7 28.7 29.2
30APR2020:00:34:00 30.2 28.9 30.2
30APR2020:00:34:30 29.8 29 29.8
30APR2020:00:35:00 30.4 28.8 29.6
30APR2020:00:35:30 30.2 29 29.6
30APR2020:00:36:00 30.1 29.3 29.5
30APR2020:00:36:30 30.4 28.6 29.6
30APR2020:00:37:00 30.7 29.4 30
30APR2020:00:37:30 30.9 29.2 29.2
30APR2020:00:38:00 30.2 28.8 29.6
30APR2020:00:38:30 29.8 28.9 29.3
30APR2020:00:39:00 29.6 28.7 28.7
30APR2020:00:40:00 30.7 28.6 30.7
30APR2020:00:40:30 30.9 29.3 30.4
30APR2020:00:41:00 30.5 29.5 30.2
30APR2020:00:41:30 30.6 28.6 29.9
30APR2020:00:42:00 31.3 29.4 31.3
30APR2020:00:42:30 30.9 29.3 29.9
30APR2020:00:43:00 30.1 28.9 29.7
30APR2020:00:43:30 30.5 28.8 30.5
30APR2020:00:44:00 30.2 28.4 28.4
30APR2020:00:44:30 30.2 28 29.5
30APR2020:00:45:00 30.4 29.3 29.3
30APR2020:00:45:30 30.7 28.9 30.2
30APR2020:00:46:00 30.1 28.8 29.9
30APR2020:00:46:30 29.6 28.8 29.3
30APR2020:00:47:00 29 26.6 27.8
30APR2020:00:47:30 27.4 26.4 27.4
30APR2020:00:48:00 29.8 27.3 29.2
30APR2020:00:48:30 29.9 29.2 29.2
30APR2020:00:49:00 29.9 28.8 28.8
30APR2020:00:49:30 30.3 28.7 29.3
30APR2020:00:50:00 30 28.6 29.2
30APR2020:00:51:00 29.4 28.8 29.4
30APR2020:00:51:30 29.9 28.5 29.9
run;
data GC_5_2;
informat datetime datetime21.;
input datetime high_temp low_temp end_temp;
format datetime datetime21.;
cards;
30APR2020:00:00:00 31.3 25.6 29.4
30APR2020:00:00:30 29.9 28.4 29.2
30APR2020:00:01:00 29.5 28.9 29.2
30APR2020:00:01:30 29.5 28.6 29
30APR2020:00:02:00 30 28.8 29.9
30APR2020:00:02:30 30.3 29.5 29.9
30APR2020:00:03:00 30.3 29.2 29.9
30APR2020:00:03:30 30.3 29 30.1
30APR2020:00:04:00 30.2 29.3 29.3
30APR2020:00:04:30 30.6 29.4 29.4
30APR2020:00:05:00 30.8 29.6 29.6
30APR2020:00:05:30 31 30 30.3
30APR2020:00:06:00 31 29.7 30.2
30APR2020:00:06:30 31 29.7 30.5
30APR2020:00:07:00 30.4 29.4 29.6
30APR2020:00:07:30 30 27.6 28.4
30APR2020:00:08:00 29.6 28.1 28.9
30APR2020:00:08:30 30.2 28.6 29.4
30APR2020:00:09:00 30 28.9 29.3
30APR2020:00:09:30 30.5 28.4 28.4
30APR2020:00:10:00 31.7 30.9 31.2
30APR2020:00:10:30 32 30.6 30.6
30APR2020:00:11:00 31.4 29.7 30.8
30APR2020:00:11:30 31.1 29.3 29.8
30APR2020:00:12:00 29.4 28.6 29
30APR2020:00:12:30 29.4 28.2 29
30APR2020:00:13:00 30.4 28.7 30.4
30APR2020:00:13:30 31.7 29.5 30.8
30APR2020:00:14:00 33.8 31.8 33
30APR2020:00:14:30 33 30.5 31.2
30APR2020:00:15:00 31.2 30.4 30.4
30APR2020:00:15:30 30.5 30 30
30APR2020:00:16:00 30.6 29 30.4
30APR2020:00:16:30 30 28.9 29.7
30APR2020:00:17:00 29.9 29.3 29.5
30APR2020:00:17:30 30.6 29.3 29.3
30APR2020:00:18:00 30.2 29.3 29.3
30APR2020:00:18:30 29.6 28.5 28.5
30APR2020:00:19:00 29.6 27.9 28.4
30APR2020:00:19:30 29.4 28.5 28.5
30APR2020:00:20:30 28.8 28 28
30APR2020:00:21:00 31.1 24.2 31
30APR2020:00:21:30 31.1 29.6 31.1
30APR2020:00:22:00 30.7 29.7 30.2
30APR2020:00:22:30 30.4 29.7 29.7
30APR2020:00:23:00 30.4 29.5 30.3
30APR2020:00:23:30 30.5 29.4 30.5
30APR2020:00:24:00 31 29.1 30.6
30APR2020:00:24:30 30.4 29.3 30.4
30APR2020:00:25:00 31.9 29.9 30.6
30APR2020:00:25:30 31.6 30.2 31.2
30APR2020:00:26:00 31 29.6 29.6
30APR2020:00:26:30 30.6 28.4 28.4
30APR2020:00:27:00 30.7 29.7 30.4
30APR2020:00:27:30 30.8 29.7 30.4
30APR2020:00:28:00 31.2 30 30.7
30APR2020:00:29:00 31 29.5 30.1
30APR2020:00:29:30 30.4 29.4 29.7
30APR2020:00:30:00 30.3 28.7 30.3
30APR2020:00:30:30 30.1 29.6 30.1
30APR2020:00:31:00 30 28.6 28.7
30APR2020:00:31:30 29.9 28.1 28.1
30APR2020:00:32:00 30.3 29.6 29.6
30APR2020:00:32:30 30.7 29.2 29.8
30APR2020:00:33:00 29.4 28.5 28.5
30APR2020:00:33:30 29.7 28.7 29.2
30APR2020:00:34:00 30.2 28.9 30.2
30APR2020:00:34:30 29.8 29 29.8
30APR2020:00:35:00 30.4 28.8 29.6
30APR2020:00:35:30 30.2 29 29.6
30APR2020:00:36:00 30.1 29.3 29.5
30APR2020:00:36:30 30.4 28.6 29.6
30APR2020:00:37:00 30.7 29.4 30
30APR2020:00:37:30 30.9 29.2 29.2
30APR2020:00:38:00 30.2 28.8 29.6
30APR2020:00:38:30 29.8 28.9 29.3
30APR2020:00:39:00 29.6 28.7 28.7
30APR2020:00:40:00 30.7 28.6 30.7
30APR2020:00:40:30 30.9 29.3 30.4
30APR2020:00:41:00 30.5 29.5 30.2
30APR2020:00:41:30 30.6 28.6 29.9
30APR2020:00:42:00 31.3 29.4 31.3
30APR2020:00:42:30 30.9 29.3 29.9
30APR2020:00:43:00 30.1 28.9 29.7
30APR2020:00:43:30 30.5 28.8 30.5
30APR2020:00:44:00 30.2 28.4 28.4
30APR2020:00:44:30 30.2 28 29.5
30APR2020:00:45:00 30.4 29.3 29.3
30APR2020:00:45:30 30.7 28.9 30.2
30APR2020:00:46:00 30.1 28.8 29.9
30APR2020:00:46:30 29.6 28.8 29.3
30APR2020:00:47:00 29 26.6 27.8
30APR2020:00:47:30 27.4 26.4 27.4
30APR2020:00:48:00 29.8 27.3 29.2
30APR2020:00:48:30 29.9 29.2 29.2
30APR2020:00:49:00 29.9 28.8 28.8
30APR2020:00:49:30 30.3 28.7 29.3
30APR2020:00:50:00 30 28.6 29.2
30APR2020:00:51:00 29.4 28.8 29.4
30APR2020:00:51:30 29.9 28.5 29.9
run;
I want to reach the following table
data wont;
informat datetime datetime21.;
input datetime high_temp low_temp end_temp;
format datetime datetime21.;
cards;
/* from GC_5_0;/**/
30APR2020:00:19:30 29.4 28.5 28.5
30APR2020:00:20:30 28.8 28 28
30APR2020:00:21:00 31.1 24.2 31
30APR2020:00:21:30 31.1 29.6 31.1
30APR2020:00:22:00 30.7 29.7 30.2
30APR2020:00:22:30 30.4 29.7 29.7
30APR2020:00:23:00 30.4 29.5 30.3
30APR2020:00:23:30 30.5 29.4 30.5
30APR2020:00:24:00 31 29.1 30.6
30APR2020:00:24:30 30.4 29.3 30.4
30APR2020:00:25:00 31.9 29.9 30.6
30APR2020:00:25:30 31.6 30.2 31.2
30APR2020:00:26:00 31 29.6 29.6
30APR2020:00:26:30 30.6 28.4 28.4
30APR2020:00:27:00 30.7 29.7 30.4
30APR2020:00:27:30 30.8 29.7 30.4
30APR2020:00:28:00 31.2 30 30.7
30APR2020:00:29:00 31 29.5 30.1
30APR2020:00:29:30 30.4 29.4 29.7
30APR2020:00:30:00 30.3 28.7 30.3 /*<------ datetime from 'A'/**/
30APR2020:00:30:30 30.1 29.6 30.1
30APR2020:00:31:00 30 28.6 28.7
30APR2020:00:31:30 29.9 28.1 28.1
30APR2020:00:32:00 30.3 29.6 29.6
30APR2020:00:32:30 30.7 29.2 29.8
30APR2020:00:33:00 29.4 28.5 28.5
30APR2020:00:33:30 29.7 28.7 29.2
30APR2020:00:34:00 30.2 28.9 30.2
30APR2020:00:34:30 29.8 29 29.8
30APR2020:00:35:00 30.4 28.8 29.6
30APR2020:00:35:30 30.2 29 29.6
30APR2020:00:36:00 30.1 29.3 29.5
30APR2020:00:36:30 30.4 28.6 29.6
30APR2020:00:37:00 30.7 29.4 30
30APR2020:00:37:30 30.9 29.2 29.2
30APR2020:00:38:00 30.2 28.8 29.6
30APR2020:00:38:30 29.8 28.9 29.3
30APR2020:00:39:00 29.6 28.7 28.7
30APR2020:00:40:00 30.7 28.6 30.7
30APR2020:00:40:30 30.9 29.3 30.4
/* from GC_5_2;/**/
30APR2020:00:30:00 30.3 28.7 30.3
30APR2020:00:30:30 30.1 29.6 30.1
30APR2020:00:31:00 30 28.6 28.7
30APR2020:00:31:30 29.9 28.1 28.1
30APR2020:00:32:00 30.3 29.6 29.6
30APR2020:00:32:30 30.7 29.2 29.8
30APR2020:00:33:00 29.4 28.5 28.5
30APR2020:00:33:30 29.7 28.7 29.2
30APR2020:00:34:00 30.2 28.9 30.2
30APR2020:00:34:30 29.8 29 29.8
30APR2020:00:35:00 30.4 28.8 29.6
30APR2020:00:35:30 30.2 29 29.6
30APR2020:00:36:00 30.1 29.3 29.5
30APR2020:00:36:30 30.4 28.6 29.6
30APR2020:00:37:00 30.7 29.4 30
30APR2020:00:37:30 30.9 29.2 29.2
30APR2020:00:38:00 30.2 28.8 29.6
30APR2020:00:38:30 29.8 28.9 29.3
30APR2020:00:39:00 29.6 28.7 28.7
30APR2020:00:40:00 30.7 28.6 30.7 /*<------ datetime from 'A'/**/
30APR2020:00:40:30 30.9 29.3 30.4
30APR2020:00:41:00 30.5 29.5 30.2
30APR2020:00:41:30 30.6 28.6 29.9
30APR2020:00:42:00 31.3 29.4 31.3
30APR2020:00:42:30 30.9 29.3 29.9
30APR2020:00:43:00 30.1 28.9 29.7
30APR2020:00:43:30 30.5 28.8 30.5
30APR2020:00:44:00 30.2 28.4 28.4
30APR2020:00:44:30 30.2 28 29.5
30APR2020:00:45:00 30.4 29.3 29.3
30APR2020:00:45:30 30.7 28.9 30.2
30APR2020:00:46:00 30.1 28.8 29.9
30APR2020:00:46:30 29.6 28.8 29.3
30APR2020:00:47:00 29 26.6 27.8
30APR2020:00:47:30 27.4 26.4 27.4
30APR2020:00:48:00 29.8 27.3 29.2
30APR2020:00:48:30 29.9 29.2 29.2
30APR2020:00:49:00 29.9 28.8 28.8
30APR2020:00:49:30 30.3 28.7 29.3
30APR2020:00:50:00 30 28.6 29.2
/*and so on/**/
run;
Each subset has 40 observations in table 'wont' form table gc_5_0 gc_5_1 and so on (sometimes there is no temperature measurement therefore datetime - 600 < datetime < datetime + 600 It will not work).
Sometimes datetime may not be exactly from set 'A'
_n_ = 1 GC_5_0 30APR2020:00:30:00
then we look for
30APR2020:00:30:00 +- 30
Obviously table A is much bigger and there are more different sets in the variable 'info'.
I tried to do this with the hash table but it fails. And I can't handle it. I am asking for a quick way because the source tables exceed 1 GB.
I hope that I have presented the problem clearly and understandably.
Best regards and thank you in advance for your help.
@Reeza wrote:
1GB isn't a large data set for SAS in most cases. What makes you think it is?
A hundred files of 1GB is a lot, especially if the application is to work more or less in real time (10 seconds is a lot, 1 min is forever).
@Reeza wrote:
Are you trying to get 40 observations or a window of time (30 minutes it seems) relative to the time in table 1? I think its the latter but that isn't clear to me.
40 observation.
data A;
informat datetime datetime21.;
input info $7. datetime Nr ;
format datetime datetime21.;
cards;
GC_5_0 30APR2020:00:30:00 1
GC_5_2 30APR2020:00:40:00 2
GC_50_4 24NOV2010:21:00:00 3
GC_5_1 28OCT2003:13:35:00 4
GC_8_1 01AUG2016:11:20:00 5
GC_5_3 09AUG2016:18:10:00 6
GC_10_2 06SEP2019:06:40:00 7
GC_5_0 30APR2020:00:39:30 8
GC_15_2 27NOV2006:09:45:00 9
GC_5_2 30APR2020:00:17:00 10
run;
data GC_5_0;
set GC_5_0;
nrdb = _n_;
run;
data GC_5_2;
set GC_5_2;
nrdb = _n_;
run;
%macro test();
* for simplicity;
data AA;
set A;
if info = 'GC_5_0' | info = 'GC_5_2';
run;
proc sort data = AA out = AA_count nodupkey;
key info;
run;
* Obs_count variable needed for the loop;
data _NULL_;
if 0 then set AA_count nobs = n;
call symputx('Obs_count',n);
stop;
run;
%put no. of observations = &Obs_count;
* I need to search by number of observations, not by datetime;
data _null_;
if _n_ = 1 then do;
declare hash H (multidata:"y");
H.definekey("info");
H.defineData( "datetime","Nr");
H.definedone();
end;
do until (last.info);
set AA;
by info;
H.add();
end;
H.output (dataset: catx("_", "AA_", info));
H.clear();
run;
%do i = 1 %to &Obs_count %by 1;
* extracting the source file name;
data _NULL_;
set AA_count;
if _n_ = &i then do;
call symputx('info',info);
end;
run;
%put &info;
data aa__&info;
set aa__&info;
if _n_ = 1 then do;
declare hash H1 (dataset:"&info");
H1.definekey("datetime");
H1.definedata("nrdb");
H1.definedone();
end;
if H1.find() = 0 then nrdb = nrdb;
run;
* for simplicity, remove undiscovered data;
data aa__&info;
set aa__&info;
if nrdb ne .;
run;
data wont_&i;
set aa__&info;
if _n_ = 1
then do;
declare hash H (dataset:"&info");
H.definekey("nrdb");
H.definedata("datetime","high_temp","low_temp","end_temp");
H.definedone();
end;
do nrdb = nrdb - 20 to nrdb + 19 by 1;
if H.find() = 0 then output;
high_temp = high_temp;
low_temp = low_temp;
end_temp = end_temp;
end;
run;
* merging partial tables into the final table;
%if &i = 1 %then %do;
data wont;
set wont_&i (drop = nrdb);
run;
%end;
%else %do;
data wont;
set wont wont_&i (drop = nrdb);
run;
%end;
%end;
proc sort data = wont out = wont;
key nr;
run;
%mend test;
%test;
Maybe it can be done faster and smarter.
It's finally time to hack! Remember to visit the SAS Hacker's Hub regularly for news and updates.
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.