Hi,
I have a data set with several variables. From it, I want to create a separate data set containing some of the existing variables plus some new variables that are derived from an existing variable. This works fine with the following SAS code:
/* Summarise medication per id and year; rows are grouped on (id, year). */
PROC SQL;
  CREATE TABLE dataset_new AS
  SELECT
    id,
    year,
    /* number of different medication codes seen in the group */
    COUNT(DISTINCT medication) AS n_medication,
    /* exactly one code: report that code; no ELSE, so other groups get a missing value */
    CASE
      WHEN COUNT(DISTINCT medication) = 1 THEN MAX(medication)
    END AS medication_count_1,
    /* exactly two codes: report them as "min, max" text */
    CASE
      WHEN COUNT(DISTINCT medication) = 2 THEN CATX(', ', PUT(MIN(medication), 8.), PUT(MAX(medication), 8.))
    END AS medication_count_2,
    /* exactly three codes: fixed label */
    CASE
      WHEN COUNT(DISTINCT medication) = 3 THEN '1, 2, 3'
    END AS medication_count_3,
    /* any other count of distinct codes */
    CASE
      WHEN COUNT(DISTINCT medication) NOT IN (1, 2, 3) THEN 'Unknown'
    END AS medication_count_other
  FROM dataset_have
  GROUP BY id, year;
QUIT;
medication is either 1, 2, or 3.
Now I would like to include another variable ("factor") in dataset_new, next to "id" and "year". I tried to adapt the code as follows:
/* Second attempt: FACTOR is added to the SELECT list, but the GROUP BY list is unchanged. */
/* This is the version that yields more rows than expected (see the discussion below).   */
proc sql;
create table dataset_new as
select id,
year,
/* factor is neither wrapped in an aggregate function nor listed in GROUP BY */
factor,
count(distinct medication) as n_medication,
case
when count(distinct medication) = 1 then max(medication)
end as medication_count_1,
case
when count(distinct medication) = 2 then catx(', ', put(min(medication), 8.), put(max(medication), 8.))
end as medication_count_2,
case
when count(distinct medication) = 3 then '1, 2, 3'
end as medication_count_3,
case
when count(distinct medication) not in (1, 2, 3) then 'Unknown'
end as medication_count_other
from dataset_have
/* Groups are still formed on id and year only. Because FACTOR is selected without */
/* being grouped or aggregated, PROC SQL remerges the summary values with the      */
/* detail rows and writes one output row per input row.                            */
group by id, year;
quit;
But, now the dataset_new has way more records / rows than before and "id"s appear more than once per year (should only be once per year, as before).
Why is that, where is the fault? How can I fix the code?
Thank you very much!
You need to either include FACTOR in the GROUP BY list or apply an aggregate function to it (like the COUNT() function you already use).
Which value of FACTOR do you want when there are multiple observations per GROUP BY grouping?
Assuming that you want to include it in the GROUP BY list, the code would look like this:
/* Build the sample input table DATASET_HAVE from inline, pipe-delimited records. */
/* Columns: year (numeric), id (character, up to 20), factor (numeric),           */
/* medication (numeric).                                                          */
/* NOTE(review): the two 2022 rows after x9 use id x10. The poster's listing      */
/* showed them as "x1 |0 6.03128464", i.e. the trailing 0 of the id had slipped   */
/* into the next field; the ids otherwise run x9, x10, x11.                       */
data dataset_have;
  infile datalines dlm='|' ;
  /* the colon modifier reads the delimited id value with the $20. informat */
  input year id :$20. factor medication ;
datalines;
2020|x1|5.19721817|1
2017|x2|14.06790123|1
2019|x3|9.03109676|2
2017|x4|5.15445063|1
2015|x5|5.94598581|1
2013|x6|4.43984891|1
2014|x6|4.38786664|1
2015|x6|4.41836734|1
2018|x6|4.6132294|1
2017|x7|6.76264097|1
2020|x8|11.8294265|1
2021|x9|6.2484863|1
2021|x9|6.2484863|1
2022|x10|6.03128464|1
2022|x10|6.03128464|2
2012|x11|8.47462116|1
2020|x12|4.30458199|1
2018|x13|4.39752762|2
2018|x13|4.39752762|2
2022|x14|6.3191628|2
2022|x14|6.3191628|2
2018|x15|9.42133597|1
2018|x15|9.42133597|1
2019|x15|8.4617257|1
2020|x15|6.8252753|1
2022|x16|6.84502614|2
2017|x17|11.11335597|1
2020|x18|8.3209465|2
2022|x18|8.06962168|2
2022|x19|4.79468183|2
2015|x20|6.82342541|1
2016|x20|7.47662275|1
2016|x20|7.47662275|1
2016|x20|7.47662275|1
2016|x20|7.47662275|1
2016|x20|7.47662275|1
2022|x21|8.73427034|1
2016|x22|6.13107083|1
2017|x22|6.7473995|1
2019|x23|6.24992605|1
2018|x24|11.54881266|2
2018|x24|11.54881266|2
2021|x25|9.28757739|2
2012|x26|8.32126661|1
2020|x27|6.48118077|2
2022|x27|6.60381036|2
;
/* Summarise medication per (id, year, factor) group. FACTOR is part of the */
/* GROUP BY list, so every selected column is either grouped or aggregated.  */
PROC SQL;
  CREATE TABLE dataset_new AS
  SELECT
    id,
    year,
    factor,
    /* number of different medication codes in the group */
    COUNT(DISTINCT medication) AS n_medication,
    /* one code: report it; otherwise numeric missing */
    CASE
      WHEN COUNT(DISTINCT medication) = 1 THEN MAX(medication)
      ELSE .
    END AS medication_count_1,
    /* two codes: "min, max" as text; otherwise blank */
    CASE
      WHEN COUNT(DISTINCT medication) = 2
        THEN CATX(', ', PUT(MIN(medication), 8.), PUT(MAX(medication), 8.))
      ELSE ' '
    END AS medication_count_2,
    /* three codes: fixed label; otherwise blank */
    CASE
      WHEN COUNT(DISTINCT medication) = 3 THEN '1, 2, 3'
      ELSE ' '
    END AS medication_count_3,
    /* any other count of distinct codes; otherwise blank */
    CASE
      WHEN COUNT(DISTINCT medication) NOT IN (1, 2, 3) THEN 'Unknown'
      ELSE ' '
    END AS medication_count_other
  FROM dataset_have
  GROUP BY id, year, factor
  ;
QUIT;
Please provide (a portion of) the data in data set DATASET_HAVE, following these examples and instructions. Do NOT attach files.
/* Sample of DATASET_HAVE as inline records: year | id | factor | medication.   */
/* Both '|' and blank are declared as delimiters, so the padding around the     */
/* pipes is skipped (consecutive delimiters count as one without DSD).          */
/* NOTE(review): the two 2022 rows after x9 were posted as "x1 |0 6.03128464";  */
/* the stray 0 is taken to belong to the id (x10), matching the x9, x10, x11    */
/* sequence - confirm against the real data.                                    */
data dataset_have;
  infile datalines dlm='| ';
  /* list input throughout; the colon modifier applies $20. to the delimited id */
  input year id :$20. factor medication;
datalines;
2020 | x1 | 5.19721817 | 1
2017 | x2 | 14.06790123 | 1
2019 | x3 | 9.03109676 | 2
2017 | x4 | 5.15445063 | 1
2015 | x5 | 5.94598581 | 1
2013 | x6 | 4.43984891 | 1
2014 | x6 | 4.38786664 | 1
2015 | x6 | 4.41836734 | 1
2018 | x6 | 4.6132294 | 1
2017 | x7 | 6.76264097 | 1
2020 | x8 | 11.8294265 | 1
2021 | x9 | 6.2484863 | 1
2021 | x9 | 6.2484863 | 1
2022 | x10 | 6.03128464 | 1
2022 | x10 | 6.03128464 | 2
2012 | x11 | 8.47462116 | 1
2020 | x12 | 4.30458199 | 1
2018 | x13 | 4.39752762 | 2
2018 | x13 | 4.39752762 | 2
2022 | x14 | 6.3191628 | 2
2022 | x14 | 6.3191628 | 2
2018 | x15 | 9.42133597 | 1
2018 | x15 | 9.42133597 | 1
2019 | x15 | 8.4617257 | 1
2020 | x15 | 6.8252753 | 1
2022 | x16 | 6.84502614 | 2
2017 | x17 | 11.11335597 | 1
2020 | x18 | 8.3209465 | 2
2022 | x18 | 8.06962168 | 2
2022 | x19 | 4.79468183 | 2
2015 | x20 | 6.82342541 | 1
2016 | x20 | 7.47662275 | 1
2016 | x20 | 7.47662275 | 1
2016 | x20 | 7.47662275 | 1
2016 | x20 | 7.47662275 | 1
2016 | x20 | 7.47662275 | 1
2022 | x21 | 8.73427034 | 1
2016 | x22 | 6.13107083 | 1
2017 | x22 | 6.7473995 | 1
2019 | x23 | 6.24992605 | 1
2018 | x24 | 11.54881266 | 2
2018 | x24 | 11.54881266 | 2
2021 | x25 | 9.28757739 | 2
2012 | x26 | 8.32126661 | 1
2020 | x27 | 6.48118077 | 2
2022 | x27 | 6.60381036 | 2
;
I hope this is fine. Thank you
You need to either include FACTOR in the GROUP BY list or apply an aggregate function to it (like the COUNT() function you already use).
Which value of FACTOR do you want when there are multiple observations per GROUP BY grouping?
Assuming that you want to include it in the GROUP BY list, the code would look like this:
/* Build the sample input table DATASET_HAVE from inline, pipe-delimited records. */
/* Columns: year (numeric), id (character, up to 20), factor (numeric),           */
/* medication (numeric).                                                          */
/* NOTE(review): the two 2022 rows after x9 use id x10. The poster's listing      */
/* showed them as "x1 |0 6.03128464", i.e. the trailing 0 of the id had slipped   */
/* into the next field; the ids otherwise run x9, x10, x11.                       */
data dataset_have;
  infile datalines dlm='|' ;
  /* the colon modifier reads the delimited id value with the $20. informat */
  input year id :$20. factor medication ;
datalines;
2020|x1|5.19721817|1
2017|x2|14.06790123|1
2019|x3|9.03109676|2
2017|x4|5.15445063|1
2015|x5|5.94598581|1
2013|x6|4.43984891|1
2014|x6|4.38786664|1
2015|x6|4.41836734|1
2018|x6|4.6132294|1
2017|x7|6.76264097|1
2020|x8|11.8294265|1
2021|x9|6.2484863|1
2021|x9|6.2484863|1
2022|x10|6.03128464|1
2022|x10|6.03128464|2
2012|x11|8.47462116|1
2020|x12|4.30458199|1
2018|x13|4.39752762|2
2018|x13|4.39752762|2
2022|x14|6.3191628|2
2022|x14|6.3191628|2
2018|x15|9.42133597|1
2018|x15|9.42133597|1
2019|x15|8.4617257|1
2020|x15|6.8252753|1
2022|x16|6.84502614|2
2017|x17|11.11335597|1
2020|x18|8.3209465|2
2022|x18|8.06962168|2
2022|x19|4.79468183|2
2015|x20|6.82342541|1
2016|x20|7.47662275|1
2016|x20|7.47662275|1
2016|x20|7.47662275|1
2016|x20|7.47662275|1
2016|x20|7.47662275|1
2022|x21|8.73427034|1
2016|x22|6.13107083|1
2017|x22|6.7473995|1
2019|x23|6.24992605|1
2018|x24|11.54881266|2
2018|x24|11.54881266|2
2021|x25|9.28757739|2
2012|x26|8.32126661|1
2020|x27|6.48118077|2
2022|x27|6.60381036|2
;
/* Summarise medication per (id, year, factor) group. FACTOR is part of the */
/* GROUP BY list, so every selected column is either grouped or aggregated.  */
PROC SQL;
  CREATE TABLE dataset_new AS
  SELECT
    id,
    year,
    factor,
    /* number of different medication codes in the group */
    COUNT(DISTINCT medication) AS n_medication,
    /* one code: report it; otherwise numeric missing */
    CASE
      WHEN COUNT(DISTINCT medication) = 1 THEN MAX(medication)
      ELSE .
    END AS medication_count_1,
    /* two codes: "min, max" as text; otherwise blank */
    CASE
      WHEN COUNT(DISTINCT medication) = 2
        THEN CATX(', ', PUT(MIN(medication), 8.), PUT(MAX(medication), 8.))
      ELSE ' '
    END AS medication_count_2,
    /* three codes: fixed label; otherwise blank */
    CASE
      WHEN COUNT(DISTINCT medication) = 3 THEN '1, 2, 3'
      ELSE ' '
    END AS medication_count_3,
    /* any other count of distinct codes; otherwise blank */
    CASE
      WHEN COUNT(DISTINCT medication) NOT IN (1, 2, 3) THEN 'Unknown'
      ELSE ' '
    END AS medication_count_other
  FROM dataset_have
  GROUP BY id, year, factor
  ;
QUIT;
Thank you!
By including FACTOR in the GROUP BY list, I get the output I was looking for.
Don’t miss the livestream kicking off May 7. It’s free. It’s easy. And it’s the best seat in the house.
Join us virtually with our complimentary SAS Innovate Digital Pass. Watch live or on-demand in multiple languages, with translations available to help you get the most out of every session.
Learn how to use the CAT functions in SAS to join values from multiple variables into a single value.
Find more tutorials on the SAS Users YouTube channel.
Ready to level-up your skills? Choose your own adventure.