Skip to content

Commit 1d3ae22

Browse files
Merge pull request #54 from PolicyEngine/fix/remove-us-data-folder
Add ZIP code generation scripts, restructure `__init__.py`s
2 parents f873bd6 + c8447eb commit 1d3ae22

10 files changed

Lines changed: 111 additions & 7 deletions

File tree

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [1.4.0] - 2024-09-18 03:05:11
9+
10+
### Added
11+
12+
- Geography generation module (previously in US package)
13+
14+
### Changed
15+
16+
- Fixed export structure within __init__ files
17+
818
## [1.3.1] - 2024-09-17 19:37:44
919

1020
### Added
@@ -65,6 +75,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
6575

6676

6777

78+
[1.4.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.3.1...1.4.0
6879
[1.3.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.3.0...1.3.1
6980
[1.3.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.2.1...1.3.0
7081
[1.2.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.2.0...1.2.1

changelog.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,10 @@
4545
added:
4646
- Jupyter Book documentation.
4747
date: 2024-09-17 19:37:44
48+
- bump: minor
49+
changes:
50+
added:
51+
- Geography generation module (previously in US package)
52+
changed:
53+
- Fixed export structure within __init__ files
54+
date: 2024-09-18 03:05:11

policyengine_us_data/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
from .datasets import *
2+
from .geography import ZIP_CODE_DATASET
Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,19 @@
1-
from .cps import *
2-
from .puf import *
1+
from .cps import (
2+
CPS_2019,
3+
CPS_2020,
4+
CPS_2021,
5+
CPS_2022,
6+
CPS_2023,
7+
CPS_2024,
8+
CensusCPS_2018,
9+
CensusCPS_2019,
10+
CensusCPS_2020,
11+
CensusCPS_2021,
12+
CensusCPS_2022,
13+
CensusCPS_2023,
14+
EnhancedCPS_2024,
15+
ReweightedCPS_2024,
16+
)
17+
from .puf import PUF_2015, PUF_2021, PUF_2024, IRS_PUF_2015
318

419
DATASETS = [CPS_2022, PUF_2021, CPS_2024, EnhancedCPS_2024]

policyengine_us_data/datasets/cps/enhanced_cps.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
CPS_2024,
1616
)
1717
import torch
18-
import os
1918

2019

2120
def reweight(
@@ -52,9 +51,7 @@ def loss(weights):
5251
optimizer = torch.optim.Adam([weights], lr=1e-2)
5352
from tqdm import trange
5453

55-
iterator = (
56-
trange(10_000) if not os.environ.get("TEST_LITE") else trange(100)
57-
)
54+
iterator = trange(10_000)
5855
for i in iterator:
5956
optimizer.zero_grad()
6057
l, worst_name, worst_val = loss(torch.exp(weights))
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# data/geography
2+
3+
This folder contains scripts and datasets used for geographic mappings and imputations.
4+
5+
## `create_zip_code_dataset.py`
6+
7+
This script generates the dataset `zip_codes.csv.gz`, which contains one row for every ZIP code that maps to a ZCTA with known population and county, providing its:
8+
* ZIP code tabulation area
9+
* County
10+
* Population
11+
* State
12+
13+
The dataset is pre-generated and stored in the repo, so it only needs to be regenerated when new ACS 5-year estimates are released.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from pathlib import Path
2+
import pandas as pd
3+
4+
ZIP_CODE_DATASET_PATH = (
5+
Path(__file__).parent.parent / "geography" / "zip_codes.csv.gz"
6+
)
7+
8+
ZIP_CODE_DATASET = pd.read_csv(ZIP_CODE_DATASET_PATH, compression="gzip")
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import pandas as pd
2+
3+
# Per-ZCTA population dataset
4+
# ACS 5-year estimates, download URL: https://data.census.gov/cedsci/table?q=DP05%3A%20ACS%20DEMOGRAPHIC%20AND%20HOUSING%20ESTIMATES&g=0100000US%248600000&tid=ACSDP5Y2020.DP05
5+
6+
zcta = pd.read_csv(
7+
"zcta_2020_population.csv",
8+
low_memory=False,
9+
usecols=["DP05_0001E", "NAME"],
10+
skiprows=1,
11+
header=0,
12+
names=["population", "zcta"],
13+
)
14+
zcta.zcta = zcta.zcta.apply(lambda x: x.split(" ")[1])
15+
zcta.zcta = zcta.zcta.astype(int)
16+
zcta.population = zcta.population.astype(int)
17+
18+
# ZCTA-county dataset
19+
# 2020 ZCTA to County Relationship File, download URL: https://www.census.gov/geographies/reference-files/time-series/geo/relationship-files.html#zcta
20+
zcta_to_county = pd.read_csv(
21+
"zcta_2020_to_county_2020.csv",
22+
delimiter="|",
23+
usecols=["GEOID_ZCTA5_20", "NAMELSAD_COUNTY_20"],
24+
names=["zcta", "county"],
25+
)
26+
zcta_to_county = zcta_to_county.dropna()
27+
zcta_to_county.zcta = zcta_to_county.zcta.astype(int)
28+
# Some ZCTAs have more than one county - select a random one
29+
# Draw exactly one county per ZCTA. A fixed seed makes regenerating the
# dataset reproducible, and GroupBy.sample keeps "zcta" as a regular column
# with a flat index instead of relying on groupby.apply passing it through.
zcta_to_county = zcta_to_county.groupby("zcta").sample(n=1, random_state=0)
30+
31+
# ZIP code-ZCTA dataset
32+
# Download URL: https://udsmapper.org/zip-code-to-zcta-crosswalk/
33+
zip_code = pd.read_csv(
34+
"zip_code_to_zcta.csv",
35+
usecols=["ZIP_CODE", "ZCTA", "STATE"],
36+
names=["zip_code", "zcta", "state"],
37+
)
38+
zip_code.zip_code = zip_code.zip_code.astype(int)
39+
zip_code = zip_code[zip_code.zcta != "No ZCTA"]
40+
zip_code.zcta = zip_code.zcta.astype(int)
41+
zip_code = zip_code[zip_code.zcta.isin(zcta.zcta)]
42+
zip_code = zip_code[zip_code.zcta.isin(zcta_to_county.zcta)]
43+
44+
# ZCTAs have multiple ZIP codes - split each ZCTA population equally into its component ZIP codes
45+
zip_code["population"] = (
46+
zcta.set_index("zcta").population[zip_code.zcta].values
47+
/ zip_code.groupby("zcta").zip_code.count()[zip_code.zcta].values
48+
)
49+
zip_code["county"] = (
50+
zcta_to_county.set_index("zcta").county[zip_code.zcta].values
51+
)
52+
# Write the gzip-compressed output under the ".csv.gz" name: the README and
# the package loader both refer to the dataset as `zip_codes.csv.gz`.
zip_code.to_csv("zip_codes.csv.gz", compression="gzip")
400 KB
Binary file not shown.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "policyengine_us_data"
7-
version = "1.3.1"
7+
version = "1.4.0"
88
description = "A package to create representative microdata for the US."
99
readme = "README.md"
1010
authors = [

0 commit comments

Comments
 (0)