Re: How to split Census tracts by 50% in code?

LMP · Posted 07-27-2018 03:34 PM

Hello,

I am trying to split partial census tracts by 50%. I am using American Fact Finder data that I am defining neighborhoods and there are neighborhoods that have partial tracts that need to be split in half to have a better idea of the population for that neighborhood. I really don't know what to do moving forward. Please help.

I have attached my code thus far.

Thanks

data mydata.Cortheneighborhoodsn2000v3;
set mydata.Cortheneighborhoodsn2000v2;

tract = scan(geography,1,",");
tract2 = compress(left(scan(tract,3," ")));
censustract = input(tract2,best12.);

if censustract in(34,66,67,68,69) then Neighborhood = "Avondale                            ";
if censustract in(63,64) then Neighborhood = "Bond Hill";
if censustract in(45) then neighborhood = "California";
if censustract in(28) then neighborhood = "Camp Washington";
if censustract in(61) then neighborhood = "Carthage";
if censustract in(6,7) then neighborhood = "Downtown";
if censustract in(70,71,72) then neighborhood = "Clifton";
if censustract in(81,82.01,82.02,84,111,218.01,221.02) then neighborhood = "College Hill";
if censustract in(32,33) then neighborhood = "Corryville";
if censustract in(43,44) then neighborhood = "East End";
if censustract in(92,93,94,95,96) then neighborhood = "East Price Hill";
if censustract in(20,42) then neighborhood = "East Walnut Hills";
if censustract in(38,39,40) then neighborhood = "Evanston";
if censustract in(41) then neighborhood = "Evanston/East Walnut Hills";
if censustract in(25,26,27,29,30) then neighborhood = "CUF";
if censustract in(85.02) then neighborhood = "Roll Hill";
if censustract in(60,222) then neighborhood = "Hartwell";
if censustract in(49,50,51) then neighborhood = "Hyde Park";
if censustract in(58) then neighborhood = "Kennedy Heights";
if censustract in(47.02) then neighborhood = "Linwood";
if censustract in(91, 1) then neighborhood = "Lower Price Hill/Queensgate";
if censustract in(55,56,108,242,244) then neighborhood = "Madisonville";
if censustract in(12, 13) then neighborhood = "Mt. Adams";
if censustract in(83,85.01,208.11) then neighborhood = "Mt. Airy";
if censustract in(18,22,23) then neighborhood = "Mt. Auburn";
if censustract in(48) then neighborhood = "Mt. Lookout";
if censustract in(47.01) then neighborhood = "Mt. Lookout/Columbia Tusculum";
if censustract in(46.01,46.02,46.03,250.01) then neighborhood = "Mt. Washington";
if censustract in(65) then neighborhood = "North Avondale/Paddock Hills";
if censustract in(86.01) then neighborhood = "North Fairmount/English Woods";
if censustract in(74,75,78,79) then neighborhood = "Northside";
if censustract in(52,53,54,253) then neighborhood = "Oakley";
if censustract in(9,10,11,16,17) then neighborhood = "Over-the-Rhine";
if censustract in(57.01,57.02,59) then neighborhood = "Pleasant Ridge";
*if censustract in(1) then neighborhood = "Lower Price Hill";
if censustract in(104) then neighborhood = "Riverside-Sayler Park";
if censustract in(62.01,62.02,110,232.01) then neighborhood = "Roselawn";
if censustract in(105,106) then neighborhood = "Sayler Park";
if censustract in(103) then neighborhood = "Sedamsville/Riverside";
if censustract in(73) then neighborhood = "Spring Grove Village";
if censustract in(77) then neighborhood = "South Cumminsville/Millvale";
if censustract in(87,89) then neighborhood = "South Fairmount";
if censustract in(19,21,35,36,37) then neighborhood = "Walnut Hills";
if censustract in(2,3.01,3.02,4,8,14,15) then neighborhood = "West End";
if censustract in(97,98,99.01,99.02,107,212.02,214.01) then neighborhood = "West Price Hill";
if censustract in(88,100.01,100.02,101,102.01,102.02,109,209.01,209.02) then neighborhood = "Westwood/ East Westwood";
if censustract in(80) then neighborhood = "Winton Hills";

*Need to split the partials in half...NEED to FIX the following;
if censustract in(45, 218.01, 221.02, 222, 242, 244, 208.11, 250.01, 253, 232.01, 99.01, 99.02, 107, 212.02, 214.01, 209.01, 209.02) then partial = '1';

if neighborhood =' ' or tot_popn =0 then delete; 

*drop tract tract2 geography partial;
drop tract tract2 geography ;
run;

,

LMP

Reeza · Posted 07-27-2018 03:44 PM

Is it possible to use a smaller area to help supplement this instead? Ie use CT for the biggest areas and then fill in with county subdivisions or a smaller area if it exists?

Sorry, not too familiar with US census data though I often do this my Census data.

There's also usually a mapping of regions to names that would allow you to use PROC FORMAT instead of a lot of if then statements.

If you know where it goes you can do something like the following:

data have;
set duplicates;

*these need to be split;
if N>1 then do;
pop=pop/2; *split pop in half;
area=area1; *assign name of one area;
output;
area=area2; *assign name of second area;
output;
end;
else output; *output individual data;

run;

And a last option - split weighted by the area overlap. If only 10% of the area overlaps just assign 10% of the population. This is the type of work I'd usually do in ArcGIS or QGIS though, not necessarily SAS.

@LMP wrote:

Hello,

I am trying to split partial census tracts by 50%. I am using American Fact Finder data that I am defining neighborhoods and there are neighborhoods that have partial tracts that need to be split in half to have a better idea of the population for that neighborhood. I really don't know what to do moving forward. Please help.

I have attached my code thus far.

Thanks

data mydata.Cortheneighborhoodsn2000v3;
set mydata.Cortheneighborhoodsn2000v2;

tract = scan(geography,1,",");
tract2 = compress(left(scan(tract,3," ")));
censustract = input(tract2,best12.);

if censustract in(34,66,67,68,69) then Neighborhood = "Avondale                            ";
if censustract in(63,64) then Neighborhood = "Bond Hill";
if censustract in(45) then neighborhood = "California";
if censustract in(28) then neighborhood = "Camp Washington";
if censustract in(61) then neighborhood = "Carthage";
if censustract in(6,7) then neighborhood = "Downtown";
if censustract in(70,71,72) then neighborhood = "Clifton";
if censustract in(81,82.01,82.02,84,111,218.01,221.02) then neighborhood = "College Hill";
if censustract in(32,33) then neighborhood = "Corryville";
if censustract in(43,44) then neighborhood = "East End";
if censustract in(92,93,94,95,96) then neighborhood = "East Price Hill";
if censustract in(20,42) then neighborhood = "East Walnut Hills";
if censustract in(38,39,40) then neighborhood = "Evanston";
if censustract in(41) then neighborhood = "Evanston/East Walnut Hills";
if censustract in(25,26,27,29,30) then neighborhood = "CUF";
if censustract in(85.02) then neighborhood = "Roll Hill";
if censustract in(60,222) then neighborhood = "Hartwell";
if censustract in(49,50,51) then neighborhood = "Hyde Park";
if censustract in(58) then neighborhood = "Kennedy Heights";
if censustract in(47.02) then neighborhood = "Linwood";
if censustract in(91, 1) then neighborhood = "Lower Price Hill/Queensgate";
if censustract in(55,56,108,242,244) then neighborhood = "Madisonville";
if censustract in(12, 13) then neighborhood = "Mt. Adams";
if censustract in(83,85.01,208.11) then neighborhood = "Mt. Airy";
if censustract in(18,22,23) then neighborhood = "Mt. Auburn";
if censustract in(48) then neighborhood = "Mt. Lookout";
if censustract in(47.01) then neighborhood = "Mt. Lookout/Columbia Tusculum";
if censustract in(46.01,46.02,46.03,250.01) then neighborhood = "Mt. Washington";
if censustract in(65) then neighborhood = "North Avondale/Paddock Hills";
if censustract in(86.01) then neighborhood = "North Fairmount/English Woods";
if censustract in(74,75,78,79) then neighborhood = "Northside";
if censustract in(52,53,54,253) then neighborhood = "Oakley";
if censustract in(9,10,11,16,17) then neighborhood = "Over-the-Rhine";
if censustract in(57.01,57.02,59) then neighborhood = "Pleasant Ridge";
*if censustract in(1) then neighborhood = "Lower Price Hill";
if censustract in(104) then neighborhood = "Riverside-Sayler Park";
if censustract in(62.01,62.02,110,232.01) then neighborhood = "Roselawn";
if censustract in(105,106) then neighborhood = "Sayler Park";
if censustract in(103) then neighborhood = "Sedamsville/Riverside";
if censustract in(73) then neighborhood = "Spring Grove Village";
if censustract in(77) then neighborhood = "South Cumminsville/Millvale";
if censustract in(87,89) then neighborhood = "South Fairmount";
if censustract in(19,21,35,36,37) then neighborhood = "Walnut Hills";
if censustract in(2,3.01,3.02,4,8,14,15) then neighborhood = "West End";
if censustract in(97,98,99.01,99.02,107,212.02,214.01) then neighborhood = "West Price Hill";
if censustract in(88,100.01,100.02,101,102.01,102.02,109,209.01,209.02) then neighborhood = "Westwood/ East Westwood";
if censustract in(80) then neighborhood = "Winton Hills";

*Need to split the partials in half...NEED to FIX the following;
if censustract in(45, 218.01, 221.02, 222, 242, 244, 208.11, 250.01, 253, 232.01, 99.01, 99.02, 107, 212.02, 214.01, 209.01, 209.02) then partial = '1';

if neighborhood =' ' or tot_popn =0 then delete; 

*drop tract tract2 geography partial;
drop tract tract2 geography ;
run;

,

LMP

ballardw · Posted 07-27-2018 03:55 PM

It might help demonstrate what you mean by "split" if you show a very small starting data set with the minimum of variables needed to demonstrate the issue, identify at least one "split" case and on non-split and show what the final result is supposed to be as a dataset for that example input.

LMP · Posted 07-27-2018 04:28 PM

I need the census tracts that are partials to be split in half.... I just don't know how phrase this in my code. I need only half the observations for these census tracts with partials to be counted by 50%. I already defined that some of these partial census tract belong to particular neighborhoods but keeping them the way they are makes the populations larger than they actually are because they are contain partial census tracts.

Census tracts that have partials:

if censustract in(45, 218.01, 221.02, 222, 242, 244, 208.11, 250.01, 253, 232.01, 99.01, 99.02, 107, 212.02, 214.01, 209.01, 209.02) then %half split;

Would this work? I'm trying to say divide the observations under these census tracts by 2 (half) --> I only want half the value of these census tracts.

%DO censustract=(45, 218.01, 221.02, 222, 242, 244, 208.11, 250.01, 253, 232.01, 99.01, 99.02, 107, 212.02, 214.01, 209.01, 209.02) <%BY 2> ;

(I've never done macro yet so, I'm not sure if this is right).

Thanks!

Reeza · Posted 07-27-2018 05:03 PM

@LMP did you try the code I suggested, with the split/2 and then assigning the new names and explicit output?

LMP · Posted 07-27-2018 05:10 PM

Unfortunately,
It is a little more complicated then that because the data is broken down by age category ages 0-110 and it has the total population. I need all the observations for these variables split in half as well.

LMP · Posted 07-27-2018 05:11 PM

And thanks for the suggestion!

Reeza · Posted 07-27-2018 05:13 PM

You can use an array and a loop to split them all in two then.

For census data I usually flip it long anyways to avoid those issues until the summaries are complete. Wide data is really only good for displaying.

ballardw · Posted 07-27-2018 05:47 PM

Showing some input values that are to be "split" and the desired result after splitting that you can do by hand should be practical.

If you can't do it manually then you are very unlikely to program it successfully.

SuzanneDorinski · Posted 08-02-2018 12:57 AM

I'm guessing that this post is about neighborhoods in Cincinnati, Ohio. I used American FactFinder on the Census web site to find data from the 2010 Census with ages up to 110.

The files I downloaded from American FactFinder for Hamilton County, Ohio are attached. Cincinnati is located in Hamilton County, but not all the Hamilton County records are within Cincinnati.

There are 871 variables when you import the DEC_10_SF1_QTP2.csv. I dropped the variables that I didn't use, and renamed the rest to include the age categories.

* metadata CSV in download from American Fact Finder explains
* what the variable names mean. using the QT-P2 table from the 
* 2010 Census for Hamilton County Ohio in this example. ;

FILENAME REFFILE1 '/folders/myfolders/Hamilton County Ohio 2010 age data/DEC_10_SF1_QTP2_metadata.csv';

data WORK.VARIABLE_LIST    ;
%let _EFIERR_ = 0; /* set the ERROR detection macro variable */
infile REFFILE1 delimiter = ',' MISSOVER DSD  ;
   informat VAR1 $17. ;
   informat VAR2 $79. ;
   format VAR1 $17. ;
   format VAR2 $79. ;
input
            VAR1  $
            VAR2  $
;
if _ERROR_ then call symputx('_EFIERR_',1);  /* set ERROR detection macro variable */
run;


* table from American Fact Finder has lots of data.  
* HD03_Snn is males per 100 females for various age groups.
* SUBHD0101_Snn has the number for both sexes for various age groups.
* SUBHD0102_Snn is the number of males for various age groups.
* SUBHD0103_Snn is the number of females for various age groups.
* SUBHD0201_Snn is the percent of the total population for various age groups.
* SUBHD0202_Snn is the percent male of the total population for various age groups.
* SUBHD0203_Snn is the percent female of the total population for various age groups. 
* Assume we are only interested in some of the SUBHD0101_Snn variables ;

* want to rename the population counts ;

proc sql noprint;
  create table age_vars as
  select var1,
         var2
    from variable_list
      where var1 contains 'SUBHD0101' and
            var1 not in ('SUBHD0101_S02',
                         'SUBHD0101_S08',
                         'SUBHD0101_S14',
                         'SUBHD0101_S20',
                         'SUBHD0101_S26',
                         'SUBHD0101_S32',
                         'SUBHD0101_S38',
                         'SUBHD0101_S44',
                         'SUBHD0101_S50',
                         'SUBHD0101_S56',
                         'SUBHD0101_S62',
                         'SUBHD0101_S68',
                         'SUBHD0101_S74',
                         'SUBHD0101_S80',
                         'SUBHD0101_S86',
                         'SUBHD0101_S92',
                         'SUBHD0101_S98',
                         'SUBHD0101_S104',
                         'SUBHD0101_S110',
                         'SUBHD0101_S116');
                         
  select var1
    into :keep_list separated by ' '
      from variable_list
        where var1 contains 'SUBHD0101' and
            var1 not in ('SUBHD0101_S02',
                         'SUBHD0101_S08',
                         'SUBHD0101_S14',
                         'SUBHD0101_S20',
                         'SUBHD0101_S26',
                         'SUBHD0101_S32',
                         'SUBHD0101_S38',
                         'SUBHD0101_S44',
                         'SUBHD0101_S50',
                         'SUBHD0101_S56',
                         'SUBHD0101_S62',
                         'SUBHD0101_S68',
                         'SUBHD0101_S74',
                         'SUBHD0101_S80',
                         'SUBHD0101_S86',
                         'SUBHD0101_S92',
                         'SUBHD0101_S98',
                         'SUBHD0101_S104',
                         'SUBHD0101_S110',
                         'SUBHD0101_S116');
quit;

data age_vars;
  length new_name $ 32;
  set age_vars;
  if var1='SUBHD0101_S01' then new_name='tot_pop';
  else new_name='tot_pop'||tranwrd(trim(scan(var2,-1,'-')),' ','_');
run;

proc sql noprint;
  select var1||" = "||new_name
    into :rename_count_list separated by ' '
      from age_vars;
quit;

* annotation CSV in download from American Fact Finder has geo_id, geo_id2, and 
* geo_display_label, along with information about all the other variables.  row 2 of 
* the CSV has information that we don't want, so start on row 3.  will have to keep
* and rename variables.  we can easily get the census tract from geo_display_label. ;

FILENAME REFFILE2 '/folders/myfolders/Hamilton County Ohio 2010 age data/DEC_10_SF1_QTP2_ann.csv';

data geo_info;
%let _EFIERR_ = 0; /* set the ERROR detection macro variable */
infile REFFILE2 delimiter = ',' MISSOVER DSD firstobs=3 ;
   informat geo_id $20. ;
   informat geo_id2 $11. ;
   informat geo_display_label $42. ;
   format geo_id $20. ;
   format geo_id2 $11. ;
   format geo_display_label $42. ;
input
            geo_id  $
            geo_id2  $
            geo_display_label $
;
if _ERROR_ then call symputx('_EFIERR_',1);  /* set ERROR detection macro variable */
run;

* data step below assumes that all the neighborhoods in Cincinnati have been listed.
* drop census tracts in Hamilton County but not in Cincinnati ;

data geo_info_cincy_neighborhoods;
  length Neighborhood $ 29;
  set geo_info;
  censustract=input(tranwrd((scan(geo_display_label,1,',')),'Census Tract ',''),best.);
  if censustract in(34,66,67,68,69) then Neighborhood = "Avondale                            ";
  if censustract in(63,64) then Neighborhood = "Bond Hill";
  if censustract in(45) then neighborhood = "California";
  if censustract in(28) then neighborhood = "Camp Washington";
  if censustract in(61) then neighborhood = "Carthage";
  if censustract in(6,7) then neighborhood = "Downtown";
  if censustract in(70,71,72) then neighborhood = "Clifton";
  if censustract in(81,82.01,82.02,84,111,218.01,221.02) then neighborhood = "College Hill";
  if censustract in(32,33) then neighborhood = "Corryville";
  if censustract in(43,44) then neighborhood = "East End";
  if censustract in(92,93,94,95,96) then neighborhood = "East Price Hill";
  if censustract in(20,42) then neighborhood = "East Walnut Hills";
  if censustract in(38,39,40) then neighborhood = "Evanston";
  if censustract in(41) then neighborhood = "Evanston/East Walnut Hills";
  if censustract in(25,26,27,29,30) then neighborhood = "CUF";
  if censustract in(85.02) then neighborhood = "Roll Hill";
  if censustract in(60,222) then neighborhood = "Hartwell";
  if censustract in(49,50,51) then neighborhood = "Hyde Park";
  if censustract in(58) then neighborhood = "Kennedy Heights";
  if censustract in(47.02) then neighborhood = "Linwood";
  if censustract in(91, 1) then neighborhood = "Lower Price Hill/Queensgate";
  if censustract in(55,56,108,242,244) then neighborhood = "Madisonville";
  if censustract in(12, 13) then neighborhood = "Mt. Adams";
  if censustract in(83,85.01,208.11) then neighborhood = "Mt. Airy";
  if censustract in(18,22,23) then neighborhood = "Mt. Auburn";
  if censustract in(48) then neighborhood = "Mt. Lookout";
  if censustract in(47.01) then neighborhood = "Mt. Lookout/Columbia Tusculum";
  if censustract in(46.01,46.02,46.03,250.01) then neighborhood = "Mt. Washington";
  if censustract in(65) then neighborhood = "North Avondale/Paddock Hills";
  if censustract in(86.01) then neighborhood = "North Fairmount/English Woods";
  if censustract in(74,75,78,79) then neighborhood = "Northside";
  if censustract in(52,53,54,253) then neighborhood = "Oakley";
  if censustract in(9,10,11,16,17) then neighborhood = "Over-the-Rhine";
  if censustract in(57.01,57.02,59) then neighborhood = "Pleasant Ridge";
  if censustract in(104) then neighborhood = "Riverside-Sayler Park";
  if censustract in(62.01,62.02,110,232.01) then neighborhood = "Roselawn";
  if censustract in(105,106) then neighborhood = "Sayler Park";
  if censustract in(103) then neighborhood = "Sedamsville/Riverside";
  if censustract in(73) then neighborhood = "Spring Grove Village";
  if censustract in(77) then neighborhood = "South Cumminsville/Millvale";
  if censustract in(87,89) then neighborhood = "South Fairmount";
  if censustract in(19,21,35,36,37) then neighborhood = "Walnut Hills";
  if censustract in(2,3.01,3.02,4,8,14,15) then neighborhood = "West End";
  if censustract in(97,98,99.01,99.02,107,212.02,214.01) then neighborhood = "West Price Hill";
  if censustract in(88,100.01,100.02,101,102.01,102.02,109,209.01,209.02) then neighborhood = "Westwood/East Westwood";
  if censustract in(80) then neighborhood = "Winton Hills";
  if neighborhood ne '';
run;

FILENAME REFFILE3 '/folders/myfolders/Hamilton County Ohio 2010 age data/DEC_10_SF1_QTP2.csv';

PROC IMPORT DATAFILE=REFFILE3
	DBMS=CSV
	OUT=population
	replace;
	GETNAMES=YES;
RUN;

data population(rename=(&rename_count_list));
  set population(keep=geo_id &keep_list);
run;

proc sql;
  create table cincy_neighborhood_pop as
    select a.neighborhood,
           a.censustract,
           b.*
      from geo_info_cincy_neighborhoods as a LEFT JOIN
           population as b
        on a.geo_id=b.geo_id
          order by geo_id; 
quit;

proc print data=cincy_neighborhood_pop split='_';
  where censustract in (45, 218.01, 221.02, 222, 242, 
                        244, 208.11, 250.01, 253, 232.01, 
                        99.01, 99.02, 107, 212.02, 214.01, 
                        209.01, 209.02);
  title 'These are partial tracts in neighborhoods -- need to split data to avoid overcount';
run;

proc sql noprint;
  select name 
    into :pop_counts_list separated by ' '
      from sashelp.vcolumn 
        where libname='WORK' and
              memname='CINCY_NEIGHBORHOOD_POP' and
              upcase(name) contains 'TOT_POP';
quit;

%macro set_up_array;
  array pop_counts [*] &pop_counts_list;
%mend set_up_array;

options mprint;

data cincy_partial_tracts_split_pop(drop=i);
  %set_up_array ;
  set cincy_neighborhood_pop;
  if censustract in (45, 218.01, 221.02, 222, 242, 
                     244, 208.11, 250.01, 253, 232.01, 
                     99.01, 99.02, 107, 212.02, 214.01, 
                     209.01, 209.02);
  do i=1 to dim(pop_counts);
    pop_counts[i] = round(pop_counts[i]/2);
  end;
run;  

proc print data=cincy_partial_tracts_split_pop split='_';
  var neighborhood censustract geo_id &pop_counts_list ;
  title 'After halving population counts in partial tracts';
run;

title;

data corrected_cincy_pop_counts;
  update cincy_neighborhood_pop
         cincy_partial_tracts_split_pop;
  by censustract;
run;

proc sql;
  select sum(tot_pop) as sum_before label=
    'Total population in Cincinnati neighborhoods before splitting partial census tracts'
      format=comma10.
    from cincy_neighborhood_pop;
quit;

proc sql;
  select sum(tot_pop) as sum_after label=
    'Total population in Cincinnati neighborhoods AFTER splitting partial census tracts'
      format=comma10.
    from corrected_cincy_pop_counts;
quit;

The screen shot below from the log shows the array with the population counts.

Array with all the population counts

Registration is open

SAS Training: Just a Click Away