Solved: Re: How to Split a name string with no delimiters into columns/variabl...

Rakesh93 · Posted 02-02-2020 01:05 PM

DineshReddy
RajeshRaoReddy
PraveenKumarRaoKumar

ans needed as

Var1 Var2 Var3 Var4
1. Dinesh Reddy
2. Rajesh Rao Reddy
3. Praveen Kumar Rao Kumar

total we need 4 variables

novinosrin · Posted 02-02-2020 02:15 PM


data have;
	input name :$50.;
	datalines;
DineshReddy
RajeshRaoReddy
PraveenKumarRaoKumar
;
run;

data want;
 set have;
 array names {4} $ var1-var4;
 do _pos=anyupper(name)by 0 while(_pos);
  _p=_pos;
  _pos=anyupper(name,_pos+1);
  _n=sum(_n,1);
  if _pos then  names(_n)=substr(name,_p,_pos-_p);
  else names(_n)=substr(name,_p);
 end;
 drop _:;
run;
proc print noobs;run;

name	var1	var2	var3	var4
DineshReddy	Dinesh	Reddy
RajeshRaoReddy	Rajesh	Rao	Reddy
PraveenKumarRaoKumar	Praveen	Kumar	Rao	Kumar

View solution in original post

MargoBlue · Posted 02-02-2020 01:37 PM

Maybe it helps:

https://communities.sas.com/t5/SAS-Programming/Split-the-string-into-two-parts/td-p/133091

unison · Posted 02-02-2020 01:39 PM

Something like this? Your delimiter is an UPPER case letter.

data have;
	input name :$50.;
	datalines;
DineshReddy
RajeshRaoReddy
PraveenKumarRaoKumar
;
run;

data want;
	set have;
	array names {4} $ var1-var4;

	do _i=1 to dim(names);
		_this_upper=anyupper(name, _this_upper+1);
		_next_upper=anyupper(name, _this_upper+1);

		if _next_upper=0 then
			_next_upper=length(name)+1;

		if _this_upper>0 then names[_i]=substr(name, _this_upper, _next_upper-_this_upper);
		else
			leave;
	end;
	drop _:;
run;

-unison

Rakesh93 · Posted 02-02-2020 03:27 PM

Can you please explain how does this code works??

unison · Posted 02-02-2020 03:34 PM

Here's a commented version -- tried to describe as best as possible..:

data want;
	set have;
	array names {4} $ var1-var4; *Create an array var1-var4;

	do _i=1 to dim(names); *Work across arrays i=1 to 4 in this case;
		*Use anyupper function to get positions of uppercase characters;
		*First parameter is string to search, second parameter is starting position to search;
		*-- returns 0 if there are no more uppercase characters after specified starting position.;
		_this_upper=anyupper(name, _this_upper+1); *position (index) of THIS upper character;
		_next_upper=anyupper(name, _this_upper+1); *position (index) of NEXT upper character;

		if _next_upper=0 then 
			_next_upper=length(name)+1; *If next upper is 0 (i.e. no more uppercase characters, then change to ending index +1 (helps to get very last chunk out properly);

		if _this_upper>0 then names[_i]=substr(name, _this_upper, _next_upper-_this_upper); *If this upper index is > 0 then capture the substr between this upper and the next one;
		else
			leave; *Else leave and move to next observation (i.e. go from row 1 to row 2);
	end;
	drop _:; *I've named all of my helper variables starting with '_', this drops all of them.;
run;

-unison

Rakesh93 · Posted 02-02-2020 04:25 PM

Thanks

ed_sas_member · Posted 02-02-2020 01:51 PM

Hi @Rakesh93

here is a way to achieve this, using pearl regular expressions.

NB: the proc sql step enables you not to specify arbitrary 4 as the maximum count of words.

data have;
	input name $80.;
	datalines;
DineshReddy
RajeshRaoReddy
PraveenKumarRaoKumar
;
run;

data have1;
	set have;
	_name= prxchange('s/([A-Z])/ $1/',-1,name);
run;

proc sql noprint;
	select max(countw(_name)) into:nb_max from have1;
quit;

data want;
	set have1;
	array var(&nb_max) $20.;
	do i=1 to countw(_name);
		var(i)=scan(_name,i," ");
	end;
	drop i _name;
run;

novinosrin · Posted 02-02-2020 02:15 PM


data have;
	input name :$50.;
	datalines;
DineshReddy
RajeshRaoReddy
PraveenKumarRaoKumar
;
run;

data want;
 set have;
 array names {4} $ var1-var4;
 do _pos=anyupper(name)by 0 while(_pos);
  _p=_pos;
  _pos=anyupper(name,_pos+1);
  _n=sum(_n,1);
  if _pos then  names(_n)=substr(name,_p,_pos-_p);
  else names(_n)=substr(name,_p);
 end;
 drop _:;
run;
proc print noobs;run;

name	var1	var2	var3	var4
DineshReddy	Dinesh	Reddy
RajeshRaoReddy	Rajesh	Rao	Reddy
PraveenKumarRaoKumar	Praveen	Kumar	Rao	Kumar

novinosrin · Posted 02-02-2020 02:35 PM

And a variant of the previous with COMPRESS and FINDC

data want;
 set have;
 array names {4} $ var1-var4;
  _t=strip(compress(name,,'kU'));
 do _pos=findc(compress(name),_t)by 0 while(_pos);
  _p=_pos;
  _pos=findc(compress(name),_t,_pos+1);
  _n=sum(_n,1);
  if _pos then  names(_n)=substr(name,_p,_pos-_p);
  else names(_n)=substr(name,_p);
 end;
 drop _:;
run;
proc print noobs;run;

name	var1	var2	var3	var4
DineshReddy	Dinesh	Reddy
RajeshRaoReddy	Rajesh	Rao	Reddy
PraveenKumarRaoKumar	Praveen	Kumar	Rao	Kumar

mkeintz · Posted 02-03-2020 12:00 AM

@Rakesh93

This code loops through the original NAME letter by letter, appending the individual letter to the "current" name component. Every time the loop encounters an uppercase letter, it just increments which word is considered current:

data have;
  input name :$30.;
datalines;
DineshReddy
RajeshRaoReddy
PraveenKumarRaoKumar
;

data want (drop=L w);
  set have;
  array n {4} $10 name1-name4;
  do L=1 to length(name);
    if char(name,L) = upcase(char(name,L)) then w=sum(w,1);
    n{w}=cats(n{w},char(name,L));
  end;
run;

--------------------------
The hash OUTPUT method will overwrite a SAS data set, but not append. That can be costly. Consider voting for Add a HASH object method which would append a hash object to an existing SAS data set

Would enabling PROC SORT to simultaneously output multiple datasets be useful? Then vote for
Allow PROC SORT to output multiple datasets

--------------------------

Ksharp · Posted 02-04-2020 01:32 AM

data have;
  input name :$30.;
datalines;
DineshReddy
RajeshRaoReddy
PraveenKumarRaoKumar
;
data temp;
set have;
id+1;
pid=prxparse('/[A-Z][a-z]+/');
s=1;e=length(name);
call prxnext(pid,s,e,name,p,l);
do while(p>0);
 var=substr(name,p,l);
 output;
call prxnext(pid,s,e,name,p,l);
end;
keep id var;
run;
proc transpose data=temp out=want prefix=var_;
by id;
var var;
run;

Satish_Parida · Posted 02-04-2020 02:26 AM

This works

data have;
input names:$50.;
cards;
DineshReddy
RajeshRaoReddy
PraveenKumarRaoKumar
;
run;


data temp(drop=i);
set have;
 do i=1 to length(names)+1;
 	if (rank(substr(names,i,1)) gt 64 and rank(substr(names,i,1)) lt 91 and i ne 1) or i=length(names)+1 then do;
 		output;
 		name=substr(names,i,1);
 	end;
 	else name=cats(name,substr(names,i,1));
 end;
run;

proc sort data=temp;
by names;
run;

proc transpose data=temp out=want(drop=_name_);
by names;
var name;
run;

Please let us know if it worked for you.

How to Split a name string with no delimiters into columns/variables

Re: How to Split a name string with no delimiters into columns/variables

Re: How to Split a name string with no delimiters into columns/variables

Re: How to Split a name string with no delimiters into columns/variables

Re: How to Split a name string with no delimiters into columns/variables

Re: How to Split a name string with no delimiters into columns/variables

Re: How to Split a name string with no delimiters into columns/variables

Re: How to Split a name string with no delimiters into columns/variables

Re: How to Split a name string with no delimiters into columns/variables

Re: How to Split a name string with no delimiters into columns/variables

Re: How to Split a name string with no delimiters into columns/variables

Re: How to Split a name string with no delimiters into columns/variables

Re: How to Split a name string with no delimiters into columns/variables

Registration is open