Solved: Re: Question about checking and cleaning data

hellorc · Posted 10-03-2022 12:11 PM

Closed

SubbuPaz · Posted 10-03-2022 01:17 PM

Try this code:

/* Build the sample data set HAVE with list input: ID and score are numeric,
   state and city are character. Each data line holds exactly one observation,
   so no line-hold specifier is used. With a trailing @@ a stray extra value on
   any line would be carried into the next observation and silently shift
   every record after it; without it, extra values on a line are discarded. */
data have;
  input ID state $ city $ score;
datalines;
1 A A 100
1 A B 100
1 A C 101
1 B D 102
2 B E 99
2 B F 99
2 B G 99
3 A C 88
4 C H 120
4 D J 110
4 E H 111
4 E I 121
;
run;

/* Number the rows within each ID (1, 2, ...) and build the label
   State1, State2, ... that is later used as the transposed column name.
   The BY statement requires HAVE to be in ID / state / city order. */
data tmp_want_state;
  set have;
  by id state city;

  /* Restart the row counter for each new ID; the sum statement retains it. */
  if first.id then state_num = 0;
  state_num + 1;

  /* CATS converts the number and strips blanks itself, so there is no
     implicit numeric-to-character conversion NOTE in the log. The explicit
     length keeps the 17-byte width (5 + 12) the concatenation produced. */
  length state_id $ 17;
  state_id = cats('State', state_num);
run;

/* Pivot to one row per ID: each input row's state value goes into the
   output column named by that row's state_id value. */
PROC TRANSPOSE DATA=tmp_want_state OUT=tmp_want;
    VAR state;
    ID state_id;
    BY ID;
RUN;

/* Attach a per-ID consistency flag to the transposed table.
   want1: same_state is 'yes' when the ID has exactly one distinct state, else 'no'.
   want2: same_score is 'yes' when the ID has exactly one distinct score, else 'no'.
   The num_distinct_* columns exist only inside the inline views; the outer
   queries do not select them. */
proc sql;
    create table want1 as
    select
        wide.*,
        chk.same_state
    from tmp_want as wide
    left join (
        select
            ID,
            count(distinct state) as num_distinct_states,
            case
                when count(distinct state) = 1 then 'yes'
                else 'no'
            end as same_state
        from have
        group by ID
    ) as chk
        on wide.ID = chk.ID;

    create table want2 as
    select
        wide.*,
        chk.same_score
    from tmp_want as wide
    left join (
        select
            ID,
            count(distinct score) as num_distinct_scores,
            case
                when count(distinct score) = 1 then 'yes'
                else 'no'
            end as same_score
        from have
        group by ID
    ) as chk
        on wide.ID = chk.ID;
quit;

View solution in original post

ballardw · Posted 10-03-2022 12:20 PM

Cleaning by creating those data sets, in my opinion, isn't particularly helpful.

I would start with something like

/* List each ID / state / score combination with its frequency count. */
PROC FREQ DATA=have;
    TABLES id * state * score / LIST;
RUN;

This will give counts of each combination and show the differing values near each other.

Actually, an output data set could be made with the counts and filtered to only those rows where the count indicates a problem.

Or consider REPORTS instead of data sets

/* Cross-tabulation report: IDs down the rows, states across the columns.
   MISSTEXT=' ' prints a blank in cells that have a missing value. */
PROC TABULATE DATA=have;
    CLASS id state;
    TABLE id, state / MISSTEXT=' ';
RUN;

SubbuPaz · Posted 10-03-2022 01:17 PM

Try this code:

/* Build the sample data set HAVE with list input: ID and score are numeric,
   state and city are character. Each data line holds exactly one observation,
   so no line-hold specifier is used. With a trailing @@ a stray extra value on
   any line would be carried into the next observation and silently shift
   every record after it; without it, extra values on a line are discarded. */
data have;
  input ID state $ city $ score;
datalines;
1 A A 100
1 A B 100
1 A C 101
1 B D 102
2 B E 99
2 B F 99
2 B G 99
3 A C 88
4 C H 120
4 D J 110
4 E H 111
4 E I 121
;
run;

/* Number the rows within each ID (1, 2, ...) and build the label
   State1, State2, ... that is later used as the transposed column name.
   The BY statement requires HAVE to be in ID / state / city order. */
data tmp_want_state;
  set have;
  by id state city;

  /* Restart the row counter for each new ID; the sum statement retains it. */
  if first.id then state_num = 0;
  state_num + 1;

  /* CATS converts the number and strips blanks itself, so there is no
     implicit numeric-to-character conversion NOTE in the log. The explicit
     length keeps the 17-byte width (5 + 12) the concatenation produced. */
  length state_id $ 17;
  state_id = cats('State', state_num);
run;

/* Pivot to one row per ID: each input row's state value goes into the
   output column named by that row's state_id value. */
PROC TRANSPOSE DATA=tmp_want_state OUT=tmp_want;
    VAR state;
    ID state_id;
    BY ID;
RUN;

/* Attach a per-ID consistency flag to the transposed table.
   want1: same_state is 'yes' when the ID has exactly one distinct state, else 'no'.
   want2: same_score is 'yes' when the ID has exactly one distinct score, else 'no'.
   The num_distinct_* columns exist only inside the inline views; the outer
   queries do not select them. */
proc sql;
    create table want1 as
    select
        wide.*,
        chk.same_state
    from tmp_want as wide
    left join (
        select
            ID,
            count(distinct state) as num_distinct_states,
            case
                when count(distinct state) = 1 then 'yes'
                else 'no'
            end as same_state
        from have
        group by ID
    ) as chk
        on wide.ID = chk.ID;

    create table want2 as
    select
        wide.*,
        chk.same_score
    from tmp_want as wide
    left join (
        select
            ID,
            count(distinct score) as num_distinct_scores,
            case
                when count(distinct score) = 1 then 'yes'
                else 'no'
            end as same_score
        from have
        group by ID
    ) as chk
        on wide.ID = chk.ID;
quit;

Quentin · Posted 10-03-2022 01:37 PM

One way to approach this would be to count the number of unique values for STATE for each subject.

With a data step, you could do this using BY-group processing, like:

/* Build the sample data set HAVE with list input: ID and score are numeric,
   state and city are character. Each data line holds exactly one observation,
   so no line-hold specifier is used. With a trailing @@ a stray extra value on
   any line would be carried into the next observation and silently shift
   every record after it; without it, extra values on a line are discarded. */
data have;
  input ID state $ city $ score;
datalines;
1 A A 100
1 A B 100
1 A C 101
1 B D 102
2 B E 99
2 B F 99
2 B G 99
3 A C 88
4 C H 120
4 D J 110
4 E H 111
4 E I 121
;
run;

/* Count the distinct STATE values per ID using BY-group processing.
   Output: one row per ID with the variables id and statecount.
   HAVE must be in ID / state order for the BY statement. */
data want (keep=id statecount);
  set have (keep=id state);
  by id state;

  /* A new ID restarts the tally. */
  if first.id then statecount = 0;

  /* first.state is 1 on the first row of each state within the ID and 0
     otherwise, so adding it bumps the tally once per distinct state. */
  statecount + first.state;

  /* Only the final row of each ID carries the complete tally. */
  if not last.id then delete;

  /* Echo the result to the log. */
  put (id statecount) (=);
run;

The Boston Area SAS Users Group is hosting free webinars!

Register now at https://www.basug.org/events.

Question about checking and cleaning data

Re: Question about checking and cleaning data

Re: Question about checking and cleaning data

Re: Question about checking and cleaning data

Re: Question about checking and cleaning data

Catch up on SAS Innovate 2026