Solved: Re: Filling in observations from a different dataset

TrueTears · Posted 12-12-2020 06:12 AM

As a minimal working example, I have dataset similar to the following:



DATA have ;
  input group_name $ fips	year	quarter	NA;
DATALINES;
sam	1055	1978	1	0
sam	1055	1978	2	0
sam	1055	1978	3	0
sam	1055	1978	4	1
sam	1055	1979	1	0
sam	1055	1979	2	1
sam	1055	1979	3	1
sam	1055	1979	4	1
sam	1055	1980	1	0
sam	1055	1980	2	0
sam	1055	1980	3	0
sam	1055	1980	4	0
sam	1055	1981	1	0
sam	1055	1981	2	1
sam	1055	1981	3	1
sam	1055	1981	4	1
sam	1066	1978	1	0
sam	1066	1978	2	1
sam	1066	1978	3	0
sam	1066	1978	4	0
sam	1066	1979	1	0
sam	1066	1979	2	0
sam	1066	1979	3	0
sam	1066	1979	4	0
sam	1066	1980	1	0
sam	1066	1980	2	0
sam	1066	1980	3	0
sam	1066	1980	4	0
sam	1066	1981	1	0
sam	1066	1981	2	0
sam	1066	1981	3	0
sam	1066	1981	4	0
harry	1066	1978	1	0
harry	1066	1978	2	0
harry	1066	1978	3	0
harry	1066	1978	4	0
harry	1066	1979	1	0
harry	1066	1979	2	0
harry	1066	1979	3	0
harry	1066	1979	4	0
harry	1066	1980	1	1
harry	1066	1980	2	1
harry	1066	1980	3	1
harry	1066	1980	4	1
harry	1066	1981	1	1
harry	1066	1981	2	1
harry	1066	1981	3	1
harry	1066	1981	4	1
;
RUN;

I have another dataset that lists the unique "fips":


DATA fips ;
  input fips;
DATALINES;
1001
1044
1055
1066
;
RUN;

I want to create the following dataset:



DATA want ;
  input group_name $ fips	year	quarter	NA;
DATALINES;
sam	1001	1978	1	0
sam	1001	1978	2	0
sam	1001	1978	3	0
sam	1001	1978	4	0
sam	1001	1979	1	0
sam	1001	1979	2	0
sam	1001	1979	3	0
sam	1001	1979	4	0
sam	1001	1980	1	0
sam	1001	1980	2	0
sam	1001	1980	3	0
sam	1001	1980	4	0
sam	1001	1981	1	0
sam	1001	1981	2	0
sam	1001	1981	3	0
sam	1001	1981	4	0
sam	1044	1978	1	0
sam	1044	1978	2	0
sam	1044	1978	3	0
sam	1044	1978	4	0
sam	1044	1979	1	0
sam	1044	1979	2	0
sam	1044	1979	3	0
sam	1044	1979	4	0
sam	1044	1980	1	0
sam	1044	1980	2	0
sam	1044	1980	3	0
sam	1044	1980	4	0
sam	1044	1981	1	0
sam	1044	1981	2	0
sam	1044	1981	3	0
sam	1044	1981	4	0
sam	1055	1978	1	0
sam	1055	1978	2	0
sam	1055	1978	3	0
sam	1055	1978	4	1
sam	1055	1979	1	0
sam	1055	1979	2	1
sam	1055	1979	3	1
sam	1055	1979	4	1
sam	1055	1980	1	0
sam	1055	1980	2	0
sam	1055	1980	3	0
sam	1055	1980	4	0
sam	1055	1981	1	0
sam	1055	1981	2	1
sam	1055	1981	3	1
sam	1055	1981	4	1
sam	1066	1978	1	0
sam	1066	1978	2	1
sam	1066	1978	3	0
sam	1066	1978	4	0
sam	1066	1979	1	0
sam	1066	1979	2	0
sam	1066	1979	3	0
sam	1066	1979	4	0
sam	1066	1980	1	0
sam	1066	1980	2	0
sam	1066	1980	3	0
sam	1066	1980	4	0
sam	1066	1981	1	0
sam	1066	1981	2	0
sam	1066	1981	3	0
sam	1066	1981	4	0
harry	1001	1978	1	0
harry	1001	1978	2	0
harry	1001	1978	3	0
harry	1001	1978	4	0
harry	1001	1979	1	0
harry	1001	1979	2	0
harry	1001	1979	3	0
harry	1001	1979	4	0
harry	1001	1980	1	0
harry	1001	1980	2	0
harry	1001	1980	3	0
harry	1001	1980	4	0
harry	1001	1981	1	0
harry	1001	1981	2	0
harry	1001	1981	3	0
harry	1001	1981	4	0
harry	1044	1978	1	0
harry	1044	1978	2	0
harry	1044	1978	3	0
harry	1044	1978	4	0
harry	1044	1979	1	0
harry	1044	1979	2	0
harry	1044	1979	3	0
harry	1044	1979	4	0
harry	1044	1980	1	0
harry	1044	1980	2	0
harry	1044	1980	3	0
harry	1044	1980	4	0
harry	1044	1981	1	0
harry	1044	1981	2	0
harry	1044	1981	3	0
harry	1044	1981	4	0
harry	1055	1978	1	0
harry	1055	1978	2	0
harry	1055	1978	3	0
harry	1055	1978	4	0
harry	1055	1979	1	0
harry	1055	1979	2	0
harry	1055	1979	3	0
harry	1055	1979	4	0
harry	1055	1980	1	0
harry	1055	1980	2	0
harry	1055	1980	3	0
harry	1055	1980	4	0
harry	1055	1981	1	0
harry	1055	1981	2	0
harry	1055	1981	3	0
harry	1055	1981	4	0
harry	1066	1978	1	0
harry	1066	1978	2	0
harry	1066	1978	3	0
harry	1066	1978	4	0
harry	1066	1979	1	0
harry	1066	1979	2	0
harry	1066	1979	3	0
harry	1066	1979	4	0
harry	1066	1980	1	1
harry	1066	1980	2	1
harry	1066	1980	3	1
harry	1066	1980	4	1
harry	1066	1981	1	1
harry	1066	1981	2	1
harry	1066	1981	3	1
harry	1066	1981	4	1
;
RUN;

The idea is simple: For each group_name, if the fips does not appear in "have" but is in "fips", then fill in NA = 0 from 1978Q1 to 1981Q4. So for instance, fips=1001 and 1044 do not appear for group_name = sam, so these are filled in. Similarly, for group_name = harry, it is missing 1001, 1044, and 1055, so these are filled in.

Ksharp · Posted 12-12-2020 06:40 AM

Assuming I understood what you mean.


DATA have ;
  input group_name $ fips	year	quarter	NA;
DATALINES;
sam	1055	1978	1	0
sam	1055	1978	2	0
sam	1055	1978	3	0
sam	1055	1978	4	1
sam	1055	1979	1	0
sam	1055	1979	2	1
sam	1055	1979	3	1
sam	1055	1979	4	1
sam	1055	1980	1	0
sam	1055	1980	2	0
sam	1055	1980	3	0
sam	1055	1980	4	0
sam	1055	1981	1	0
sam	1055	1981	2	1
sam	1055	1981	3	1
sam	1055	1981	4	1
sam	1066	1978	1	0
sam	1066	1978	2	1
sam	1066	1978	3	0
sam	1066	1978	4	0
sam	1066	1979	1	0
sam	1066	1979	2	0
sam	1066	1979	3	0
sam	1066	1979	4	0
sam	1066	1980	1	0
sam	1066	1980	2	0
sam	1066	1980	3	0
sam	1066	1980	4	0
sam	1066	1981	1	0
sam	1066	1981	2	0
sam	1066	1981	3	0
sam	1066	1981	4	0
harry	1066	1978	1	0
harry	1066	1978	2	0
harry	1066	1978	3	0
harry	1066	1978	4	0
harry	1066	1979	1	0
harry	1066	1979	2	0
harry	1066	1979	3	0
harry	1066	1979	4	0
harry	1066	1980	1	1
harry	1066	1980	2	1
harry	1066	1980	3	1
harry	1066	1980	4	1
harry	1066	1981	1	1
harry	1066	1981	2	1
harry	1066	1981	3	1
harry	1066	1981	4	1
;
RUN;


DATA fips ;
  input fips;
DATALINES;
1001
1044
1055
1066
;
RUN;

proc sql;
create table want as
select a.*,coalesce(b.NA,0) as 	NA  from
(
select * from
   (select distinct group_name from have),
   (select distinct fips from fips),
   (select distinct year from have),
   (select distinct quarter from have)
) as a left join 	have as b
on 	a.group_name=b.group_name and a.fips=b.fips  
and a.year=b.year and a.quarter=b.quarter ;
quit;

View solution in original post

Ksharp · Posted 12-12-2020 06:40 AM

Assuming I understood what you mean.


DATA have ;
  input group_name $ fips	year	quarter	NA;
DATALINES;
sam	1055	1978	1	0
sam	1055	1978	2	0
sam	1055	1978	3	0
sam	1055	1978	4	1
sam	1055	1979	1	0
sam	1055	1979	2	1
sam	1055	1979	3	1
sam	1055	1979	4	1
sam	1055	1980	1	0
sam	1055	1980	2	0
sam	1055	1980	3	0
sam	1055	1980	4	0
sam	1055	1981	1	0
sam	1055	1981	2	1
sam	1055	1981	3	1
sam	1055	1981	4	1
sam	1066	1978	1	0
sam	1066	1978	2	1
sam	1066	1978	3	0
sam	1066	1978	4	0
sam	1066	1979	1	0
sam	1066	1979	2	0
sam	1066	1979	3	0
sam	1066	1979	4	0
sam	1066	1980	1	0
sam	1066	1980	2	0
sam	1066	1980	3	0
sam	1066	1980	4	0
sam	1066	1981	1	0
sam	1066	1981	2	0
sam	1066	1981	3	0
sam	1066	1981	4	0
harry	1066	1978	1	0
harry	1066	1978	2	0
harry	1066	1978	3	0
harry	1066	1978	4	0
harry	1066	1979	1	0
harry	1066	1979	2	0
harry	1066	1979	3	0
harry	1066	1979	4	0
harry	1066	1980	1	1
harry	1066	1980	2	1
harry	1066	1980	3	1
harry	1066	1980	4	1
harry	1066	1981	1	1
harry	1066	1981	2	1
harry	1066	1981	3	1
harry	1066	1981	4	1
;
RUN;


DATA fips ;
  input fips;
DATALINES;
1001
1044
1055
1066
;
RUN;

proc sql;
create table want as
select a.*,coalesce(b.NA,0) as 	NA  from
(
select * from
   (select distinct group_name from have),
   (select distinct fips from fips),
   (select distinct year from have),
   (select distinct quarter from have)
) as a left join 	have as b
on 	a.group_name=b.group_name and a.fips=b.fips  
and a.year=b.year and a.quarter=b.quarter ;
quit;

PGStats · Posted 12-12-2020 04:10 PM

If you only want the years already present for a given group_name (and not years that may only be present in other group_names) :

proc sql;
create table want as
select
    b.group_name,
    a.fips,
    b.year,
    b.quarter,
    coalesce(c.NA, 0) as NA
from
    fips as a cross join
    (select distinct group_name, year, quarter from have) as b left join
    have as c on b.group_name=c.group_name and a.fips=c.fips and b.year=c.year and b.quarter=c.quarter
order by group_name desc, fips, year, quarter;
quit;

PG

TrueTears · Posted 12-13-2020 05:55 AM

Thanks to you both! But I was wondering what is the difference between your solution and Ksharp's? It seems both provide the same outcome. To be more specific about the rule: For each group_name in "have", I want to look through the fips variable, and any value that does not appear in the master list of fips in the "fips" dataset will need to be filled in with 0's for NA. So for instance, for the group_name = sam, the corresponding fips are 1055 and 1066. From the list of fips in "fips" dataset, the only missing fips are 1001 and 1044, so these are filled in with 0's. Similarly for group_name = harry, it only has fips = 1066 and it is missing 1001, 1044, and 1055 from the masterlist, so these are filled in for harry. The "fips" dataset is a masterlist, it will always contain all possible fips values.

PGStats · Posted 12-13-2020 05:09 PM

They differ in the way they treat the years that are not present in every group. Add the following observation to the data :

harry   1066    1972    1   0

And run the queries :

proc sql;
create table wantPG as
select
    c.group_name,
    a.fips,
    c.year,
    b.quarter,
    coalesce(d.NA, 0) as NA
from
    fips as a cross join
    (select distinct quarter from have) as b cross join
    (select distinct group_name, year from have) as c left join
    have as d on c.group_name=d.group_name and a.fips=d.fips and c.year=d.year and b.quarter=d.quarter
order by group_name desc, fips, year, quarter;
quit;

proc sql;
create table wantKS as
select a.*,coalesce(b.NA,0) as  NA  from
(
select * from
   (select distinct group_name from have),
   (select distinct fips from fips),
   (select distinct year from have),
   (select distinct quarter from have)
) as a left join    have as b
on  a.group_name=b.group_name and a.fips=b.fips  
and a.year=b.year and a.quarter=b.quarter;
quit;

proc sql;
title "Year = 1972 obs. - PG";
select * from wantPG where year=1972;
title "Year = 1972 obs. - KS";
select * from wantKS where year=1972;
quit;

PG

hhinohar · Posted 12-12-2020 10:39 PM

DATA have ;
  infile datalines dlm="09"x;
  input group_name $ fips year quarter NA;
DATALINES;
sam	1055	1978	1	0
sam	1055	1978	2	0
sam	1055	1978	3	0
sam	1055	1978	4	1
sam	1055	1979	1	0
sam	1055	1979	2	1
sam	1055	1979	3	1
sam	1055	1979	4	1
sam	1055	1980	1	0
sam	1055	1980	2	0
sam	1055	1980	3	0
sam	1055	1980	4	0
sam	1055	1981	1	0
sam	1055	1981	2	1
sam	1055	1981	3	1
sam	1055	1981	4	1
sam	1066	1978	1	0
sam	1066	1978	2	1
sam	1066	1978	3	0
sam	1066	1978	4	0
sam	1066	1979	1	0
sam	1066	1979	2	0
sam	1066	1979	3	0
sam	1066	1979	4	0
sam	1066	1980	1	0
sam	1066	1980	2	0
sam	1066	1980	3	0
sam	1066	1980	4	0
sam	1066	1981	1	0
sam	1066	1981	2	0
sam	1066	1981	3	0
sam	1066	1981	4	0
harry	1066	1978	1	0
harry	1066	1978	2	0
harry	1066	1978	3	0
harry	1066	1978	4	0
harry	1066	1979	1	0
harry	1066	1979	2	0
harry	1066	1979	3	0
harry	1066	1979	4	0
harry	1066	1980	1	1
harry	1066	1980	2	1
harry	1066	1980	3	1
harry	1066	1980	4	1
harry	1066	1981	1	1
harry	1066	1981	2	1
harry	1066	1981	3	1
harry	1066	1981	4	1
;
RUN;

DATA fips ;
  infile datalines dlm="09"x;
  input fips;
DATALINES;
1001
1044
1055
1066
;
RUN;

data want;
  length group_name $ 8 fips year quarter NA 8;
	set fips;
	do i=1 to nobs;
		set have(rename=(fips=_fips)) point=i nobs=nobs;
		if fips ne _fips then do;
			na=0;
		end;
		output;
	end;
	drop _fips;
run;

proc sort data=want;
	by descending group_name fips year quarter descending NA;
run;

proc sort data=want nodupkey;
	by descending group_name fips year quarter;
run;

SAS Innovate 2025: Call for Content

Classroom Training Available!