Hello all,
I have a large csv file blc_20220602_20220601220525.csv that I split into different files of 350,000 lines. However, after splitting the files,
I am left with these names:
Instead of 'aa', 'ab', 'ac', How can I name my split files into
blc_20220602_20220601220525_1.csv
blc_20220602_20220601220525_2.csv
blc_20220602_20220601220525_3.csv
etc...
%let blc = c:/myfiles/blc; /* ************* Construction of macro "SPLIT" ************* */ %macro split_unix(fichier_complet=, taille_split=, out=); %let fichier = %scan(&fichier_complet,-1,/); %let path = %substr(&fichier_complet,1,%index(&fichier_complet,&fichier)-1); %let fic = %scan(&fichier,1,.); %let extension = %scan(&fichier,2,.); %global cr_split; %let cr_split=0; %put path = &path; %put fichier = &fichier; %put fic = &fic; %put extension = &extension; %put &fic..&extension; /* *********** Verify if the file is empty *********** */ %let list_fic_vide =; %let flag_vide =; filename rep_cn pipe "wc -l &path./&fichier"; /* ************ Verify if the file exceeds the limit ************ */ %let count_row = 0; data rep_cn; infile rep_cn dsd truncover; input var:$250.; if not find(lowcase(var),"total"); count = input(scan(var,1," "),20.); fic = scan(var,-1,"/"); run; proc sql noprint; select "KO", fic into :flag_vide, :list_fic_vide separated by " - " from rep_cn where count <= 1; select max(count) into: count_row from rep_cn group by fic; quit; %put List of empty files : &list_fic_vide.; %put Max volume is : &count_row. nombre de lignes; /* **************************** */ %if &flag_vide ne KO and &count_row. > %scan(&taille_split,2) %then %do; x "cd &path"; systask command "split &taille_split &fic..&extension &fic" wait status=fg_split; %put CR Unix du split est : &fg_split; %if not &fg_split > 0 %then %do; filename list_fic pipe "find . ! -name '*.&extension' -a -name '*&fic*'"; data temp; infile list_fic missover truncover dsd; input var:$50.; var = scan(var,2,"/"); run; data _null_; set temp end=fin; call symputx(compress('split'||put(_n_,3.)),var,'g'); if fin then call symputx('nbre',put(_n_,3.)); run; %do i = 1 %to &nbre; %put split&i = &&split&i; x "mv &&split&i ./&out/&&split&i...&extension"; %let cr_split=%eval(&cr_split+&sysrc); %end; %end; %end; /* Fin %if &flag_vide ne KO */ %else %do; %let cr_split=1; %end; %mend; /* ************************************************************** */ %macro boucle_lancement(chemin=,fic_a_split=); filename fic pipe "ls &chemin/&fic_a_split."; data _null_; infile fic missover truncover end=fin; input fic:$100.; call symputx(compress('fic'||put(_n_,3.)),fic,'g'); if fin then call symputx('nbre_fic',put(_n_,3.),'g'); run; %do nb = 1 %to &nbre_fic; %split_unix(fichier_complet=&&fic&nb, taille_split=-l 350000, out=); %if &cr_split eq 0 %then %do; x "rm &&fic&nb"; %end; %end; %mend; %boucle_lancement(chemin=&blc., fic_a_split=blc_*.csv);
Why are they getting named that way to begin with? Is it because that is how the Unix command split names the files? Is there an option to change how it names the files so it numbers them instead appending letters?
If not why not just split the files yourself instead of calling that Unix command.
%let infile=big.txt ;
%let outbase=small ;
data _null_;
fileno+1;
infile "&infile" ;
outfile=cats("&outbase._",fileno,'.txt');
do row=1 to 350000;
input ;
file out filevar=outfile ;
put _infile_;
end;
run;
Yes, I think the systask command split just names the files with alphabets by default.
That's why I'm trying to find a way to change the name or maybe use another method.
I already showed you how to generate the split files with numeric suffixes using a different method.
If you want to rename the existing files then first get the list of files into data. Say you have the list in a dataset named FILES and the filename is in a variable named FILENAME. Then you could use code like this to generate NEW_NAME variable.
proc sql;
create table rename_files
select substr(filename,1,length(filename)-6) as basename
, count(*) as n_files
, filename
group by 1
having count(*) > 1
;
quit;
data rename_files;
do suffix=1 by 1 until(last.basename);
set rename_files;
by basename;
new_name = cats(basename,putn(suffix,cats(Z,length(cats(n_files)),'.'),'.csv');
output;
end;
run;
You could then use that list to generate code to move, rename or copy the files.
/*Assuming you are under Windows OS.
If your OS is Linux/Unix ,try to use LS command to repalce DIR command
*/
%let path_in= c:\temp\ ;
filename x pipe %sysfunc(quote(dir "&path_in."\*.csv /s /b));
data x;
infile x truncover;
input old $2000.;
if prxmatch('/^_\d+/',scan(old,-1,'/\')) then do;
n+1;
new=prxchange('s/[\.a-z]+$//i',1,strip(old));
new=cats("&path_in.\blc",scan(new,-1,'/\'),'_',n,'.csv');
rc=rename(strip(old),strip(new),'file');
end;
run;
It's finally time to hack! Remember to visit the SAS Hacker's Hub regularly for news and updates.
Learn how use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.