Merge pull request #54 from PolicyEngine/fix/remove-us-data-folder

nikhilwoodruff · web-flow · commit 1d3ae223198f · 2024-09-18T14:44:24.000+01:00
Add ZIP code generation scripts, restructure `__init__.py`s
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.4.0] - 2024-09-18 03:05:11
+
+### Added
+
+- Geography generation module (previously in US package)
+
+### Changed
+
+- Fixed export structure within `__init__` files
+
 ## [1.3.1] - 2024-09-17 19:37:44
 
 ### Added
@@ -65,6 +75,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 
 
+[1.4.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.3.1...1.4.0
 [1.3.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.3.0...1.3.1
 [1.3.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.2.1...1.3.0
 [1.2.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.2.0...1.2.1
diff --git a/changelog.yaml b/changelog.yaml
@@ -45,3 +45,10 @@
     added:
     - Jupyter Book documentation.
   date: 2024-09-17 19:37:44
+- bump: minor
+  changes:
+    added:
+    - Geography generation module (previously in US package)
+    changed:
+    - Fixed export structure within `__init__` files
+  date: 2024-09-18 03:05:11
diff --git a/policyengine_us_data/__init__.py b/policyengine_us_data/__init__.py
@@ -1 +1,2 @@
 from .datasets import *
+from .geography import ZIP_CODE_DATASET
diff --git a/policyengine_us_data/datasets/__init__.py b/policyengine_us_data/datasets/__init__.py
@@ -1,4 +1,19 @@
-from .cps import *
-from .puf import *
+from .cps import (
+    CPS_2019,
+    CPS_2020,
+    CPS_2021,
+    CPS_2022,
+    CPS_2023,
+    CPS_2024,
+    CensusCPS_2018,
+    CensusCPS_2019,
+    CensusCPS_2020,
+    CensusCPS_2021,
+    CensusCPS_2022,
+    CensusCPS_2023,
+    EnhancedCPS_2024,
+    ReweightedCPS_2024,
+)
+from .puf import PUF_2015, PUF_2021, PUF_2024, IRS_PUF_2015
 
 DATASETS = [CPS_2022, PUF_2021, CPS_2024, EnhancedCPS_2024]
diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -15,7 +15,6 @@
     CPS_2024,
 )
 import torch
-import os
 
 
 def reweight(
@@ -52,9 +51,7 @@ def loss(weights):
     optimizer = torch.optim.Adam([weights], lr=1e-2)
     from tqdm import trange
 
-    iterator = (
-        trange(10_000) if not os.environ.get("TEST_LITE") else trange(100)
-    )
+    iterator = trange(10_000)
     for i in iterator:
         optimizer.zero_grad()
         l, worst_name, worst_val = loss(torch.exp(weights))
diff --git a/policyengine_us_data/geography/README.md b/policyengine_us_data/geography/README.md
@@ -0,0 +1,13 @@
+# policyengine_us_data/geography
+
+This folder contains scripts and datasets used for geographic mappings and imputations.
+
+## `create_zip_code_dataset.py`
+
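+This script generates the dataset `zip_codes.csv.gz`, which contains a row for every ZIP code that maps to a ZCTA with population and county data, providing its:
+* ZIP code tabulation area (ZCTA)
+* County (chosen at random when a ZCTA spans several counties)
+* Population (the ZCTA's population split equally among its ZIP codes)
+* State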
+
+The dataset is pre-generated and stored in the repo, so it only needs to be regenerated when new ACS 5-year estimates are released.
diff --git a/policyengine_us_data/geography/__init__.py b/policyengine_us_data/geography/__init__.py
@@ -0,0 +1,8 @@
+from pathlib import Path
+import pandas as pd
+
+ZIP_CODE_DATASET_PATH = (
+    Path(__file__).parent.parent / "geography" / "zip_codes.csv.gz"
+)
+
+ZIP_CODE_DATASET = pd.read_csv(ZIP_CODE_DATASET_PATH, compression="gzip")
diff --git a/policyengine_us_data/geography/create_zip_code_dataset.py b/policyengine_us_data/geography/create_zip_code_dataset.py
@@ -0,0 +1,52 @@
+import pandas as pd
+
+# Per-ZCTA population dataset
+# ACS 5-year estimates, download URL: https://data.census.gov/cedsci/table?q=DP05%3A%20ACS%20DEMOGRAPHIC%20AND%20HOUSING%20ESTIMATES&g=0100000US%248600000&tid=ACSDP5Y2020.DP05
+
+zcta = pd.read_csv(
+    "zcta_2020_population.csv",
+    low_memory=False,
+    usecols=["DP05_0001E", "NAME"],
+    skiprows=1,
+    header=0,
+    names=["population", "zcta"],
+)
+zcta.zcta = zcta.zcta.apply(lambda x: x.split(" ")[1])
+zcta.zcta = zcta.zcta.astype(int)
+zcta.population = zcta.population.astype(int)
+
+# ZCTA-county dataset
+# 2020 ZCTA to County Relationship File, download URL: https://www.census.gov/geographies/reference-files/time-series/geo/relationship-files.html#zcta
+zcta_to_county = pd.read_csv(
+    "zcta_2020_to_county_2020.csv",
+    delimiter="|",
+    usecols=["GEOID_ZCTA5_20", "NAMELSAD_COUNTY_20"],
+    names=["zcta", "county"],
+)
+zcta_to_county = zcta_to_county.dropna()
+zcta_to_county.zcta = zcta_to_county.zcta.astype(int)
+# Some ZCTAs have more than one county - select a random one
+zcta_to_county = zcta_to_county.groupby("zcta").apply(lambda x: x.sample(1))
+
+# ZIP code-ZCTA dataset
+# Download URL: https://udsmapper.org/zip-code-to-zcta-crosswalk/
+zip_code = pd.read_csv(
+    "zip_code_to_zcta.csv",
+    usecols=["ZIP_CODE", "ZCTA", "STATE"],
+    names=["zip_code", "zcta", "state"],
+)
+zip_code.zip_code = zip_code.zip_code.astype(int)
+zip_code = zip_code[zip_code.zcta != "No ZCTA"]
+zip_code.zcta = zip_code.zcta.astype(int)
+zip_code = zip_code[zip_code.zcta.isin(zcta.zcta)]
+zip_code = zip_code[zip_code.zcta.isin(zcta_to_county.zcta)]
+
+# ZCTAs have multiple ZIP codes - split each ZCTA population equally into its component ZIP codes
+zip_code["population"] = (
+    zcta.set_index("zcta").population[zip_code.zcta].values
+    / zip_code.groupby("zcta").zip_code.count()[zip_code.zcta].values
+)
+zip_code["county"] = (
+    zcta_to_county.set_index("zcta").county[zip_code.zcta].values
+)
+zip_code.to_csv("zip_codes.csv", compression="gzip")
diff --git a/policyengine_us_data/geography/zip_codes.csv.gz b/policyengine_us_data/geography/zip_codes.csv.gz
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "policyengine_us_data"
-version = "1.3.1"
+version = "1.4.0"
 description = "A package to create representative microdata for the US."
 readme = "README.md"
 authors = [

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`	`1`	`from .datasets import *`
	`2`	`+from .geography import ZIP_CODE_DATASET`