Skip to content

Commit 4e1d1e0

Browse files
Add ACS, rent and property taxes and 3-year CPS (#35)
* Migrate ACS from policyengine-us Fixes #31 * populate acs * Update PolicyEngine US data * format * data fix * test * changelog * Update PolicyEngine US data * remove extra * chagelog * Update PolicyEngine US data * readme file * property tax * changelog * Update PolicyEngine US data * format * changelog * Pool 3 CPS years Fixes #66 * Upload ECPS result in PRs * Feed into ECPS * Bump version and ECPS file * changelog * Move back to old ECPS * init * storage * Fix imports * Move versioning back * Add URL for ACS 2022 * Add QRF rewrite and full imputations * Add calibration * Shift to branch of US * Make optional install * Generate ACS before CPS * What a silly error * Minor improvements * Fix bugs * Adjust QRF to enable single-output predictions * Fix bug in QRF --------- Co-authored-by: Github Actions[bot] <PavelMakarchuk@users.noreply.github.com> Co-authored-by: Nikhil Woodruff <nikhil.woodruff@outlook.com>
1 parent 659fac0 commit 4e1d1e0

20 files changed

Lines changed: 634 additions & 66 deletions

.github/workflows/pull_request.yaml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,12 @@ jobs:
6767
POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
6868
- name: Build datasets
6969
run: make data
70-
env:
71-
TEST_LITE: true
7270
- name: Run tests
7371
run: pytest
7472
- name: Test documentation builds
75-
run: make documentation
73+
run: make documentation
74+
- name: Upload ECPS 2024
75+
uses: actions/upload-artifact@v4
76+
with:
77+
name: enhanced_cps_2024.h5
78+
path: policyengine_us_data/storage/enhanced_cps_2024.h5

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@ documentation:
3232
jb clean docs && jb build docs
3333

3434
data:
35+
python policyengine_us_data/datasets/acs/acs.py
3536
python policyengine_us_data/datasets/cps/cps.py
37+
python policyengine_us_data/datasets/cps/extended_cps.py
3638
python policyengine_us_data/datasets/cps/enhanced_cps.py
3739

3840
clean:

changelog_entry.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
- bump: minor
2+
changes:
3+
added:
4+
- Migrate the ACS from the US-repository.
5+
changed:
6+
- Enhanced CPS now uses a 3-year pooled CPS.

policyengine_us_data/datasets/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
CPS_2022,
66
CPS_2023,
77
CPS_2024,
8+
Pooled_3_Year_CPS_2023,
89
CensusCPS_2018,
910
CensusCPS_2019,
1011
CensusCPS_2020,
@@ -15,5 +16,6 @@
1516
ReweightedCPS_2024,
1617
)
1718
from .puf import PUF_2015, PUF_2021, PUF_2024, IRS_PUF_2015
19+
from .acs import ACS_2022
1820

19-
DATASETS = [CPS_2022, PUF_2021, CPS_2024, EnhancedCPS_2024]
21+
DATASETS = [CPS_2022, PUF_2021, CPS_2024, EnhancedCPS_2024, ACS_2022]
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
2022 ACS 1 Year Data Dictionary:
2+
https://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMS_Data_Dictionary_2022.pdf
3+
User Guide:
4+
https://www2.census.gov/programs-surveys/acs/tech_docs/pums/2022ACS_PUMS_User_Guide.pdf
5+
PUMS Documentation:
6+
https://www.census.gov/programs-surveys/acs/microdata/documentation.html
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from .acs import *
2+
from .census_acs import *
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
import logging
2+
from policyengine_core.data import Dataset
3+
import h5py
4+
from policyengine_us_data.datasets.acs.census_acs import CensusACS_2022
5+
from policyengine_us_data.storage import STORAGE_FOLDER
6+
from pandas import DataFrame
7+
import numpy as np
8+
import pandas as pd
9+
10+
11+
class ACS(Dataset):
    """Base dataset converting raw Census ACS tables into named arrays.

    Subclasses set ``time_period`` and ``census_acs`` (the raw dataset class;
    the object returned by its ``load()`` must provide ``"person"`` and
    ``"household"`` tables and a ``close()`` method).
    """

    data_format = Dataset.ARRAYS
    time_period = None
    census_acs = None

    def generate(self) -> None:
        """Generates the ACS dataset.

        Loads the raw person and household tables, then writes the ID,
        person and household variables to an HDF5 file at ``self.file_path``.
        """
        raw_data = self.census_acs(require=True).load()
        try:
            person, household = [
                raw_data[entity] for entity in ("person", "household")
            ]
            # The context manager closes the output file even when one of
            # the steps below raises, instead of leaking the open handle.
            with h5py.File(self.file_path, mode="w") as acs:
                self.add_id_variables(acs, person, household)
                self.add_person_variables(acs, person, household)
                self.add_household_variables(acs, household)
        finally:
            raw_data.close()

    @staticmethod
    def add_id_variables(
        acs: h5py.File,
        person: DataFrame,
        household: DataFrame,
    ) -> None:
        """Write entity IDs, person-to-entity links and household weights.

        Adds a ``household_id`` column to both ``person`` and ``household``
        and a ``person_id`` column to ``person`` (the frames are modified in
        place; ``add_person_variables`` relies on these columns).

        Every group entity (SPM unit, tax unit, family, marital unit) reuses
        the household ID, i.e. each household is treated as a single unit of
        each kind.
        """
        # Create numeric IDs based on SERIALNO
        h_id_to_number = pd.Series(
            np.arange(len(household)), index=household["SERIALNO"]
        )
        household["household_id"] = h_id_to_number[
            household["SERIALNO"]
        ].values
        person["household_id"] = h_id_to_number[person["SERIALNO"]].values
        # Number people by position rather than by index label: the raw
        # table may carry a non-unique index (e.g. when it was built by
        # concatenating several CSV parts), which would duplicate IDs.
        person["person_id"] = np.arange(1, len(person) + 1)

        acs["person_id"] = person["person_id"]
        acs["household_id"] = household["household_id"]
        acs["spm_unit_id"] = acs["household_id"]
        acs["tax_unit_id"] = acs["household_id"]
        acs["family_id"] = acs["household_id"]
        acs["marital_unit_id"] = acs["household_id"]
        acs["person_household_id"] = person["household_id"]
        acs["person_spm_unit_id"] = person["household_id"]
        acs["person_tax_unit_id"] = person["household_id"]
        acs["person_family_id"] = person["household_id"]
        acs["person_marital_unit_id"] = person["household_id"]
        acs["household_weight"] = household.WGTP

    @staticmethod
    def add_person_variables(
        acs: h5py.File, person: DataFrame, household: DataFrame
    ) -> None:
        """Write person-level variables, plus rent, taxes and tenure.

        Requires the ``household_id`` columns created by
        ``add_id_variables``. Adds ``rent`` and ``real_estate_taxes``
        columns to ``person`` in place.
        """
        acs["age"] = person.AGEP
        acs["is_male"] = person.SEX == 1
        acs["employment_income"] = person.WAGP
        acs["self_employment_income"] = person.SEMP
        acs["social_security"] = person.SSP
        acs["taxable_private_pension_income"] = person.RETP
        # Copy the household-level rent and property tax onto each member.
        person[["rent", "real_estate_taxes"]] = (
            household.set_index("household_id")
            .loc[person["household_id"]][["RNTP", "TAXAMT"]]
            .values
        )
        acs["is_household_head"] = person.SPORDER == 1
        # Keep the amounts only for the first-listed person so that summing
        # over people does not count a household's costs once per member.
        factor = person.SPORDER == 1
        # RNTP is a monthly amount; multiply by 12 to annualise it.
        person["rent"] *= factor * 12
        person["real_estate_taxes"] *= factor
        acs["rent"] = person.rent
        acs["real_estate_taxes"] = person.real_estate_taxes
        # Tenure comes from the household table (one value per household);
        # codes other than 1-3 map to "NONE".
        acs["tenure_type"] = (
            household.TEN.astype(int)
            .map(
                {
                    1: "OWNED_WITH_MORTGAGE",
                    2: "OWNED_OUTRIGHT",
                    3: "RENTED",
                }
            )
            .fillna("NONE")
            .astype("S")
        )

    @staticmethod
    def add_spm_variables(acs: h5py.File, spm_unit: DataFrame) -> None:
        """Write SPM-unit resources and poverty threshold.

        Not called by ``generate``; expects ``spm_unit`` to provide
        ``SPM_RESOURCES`` and ``SPM_POVTHRESHOLD`` columns.
        """
        acs["spm_unit_net_income_reported"] = spm_unit.SPM_RESOURCES
        acs["spm_unit_spm_threshold"] = spm_unit.SPM_POVTHRESHOLD

    @staticmethod
    def add_household_variables(acs: h5py.File, household: DataFrame) -> None:
        """Write household-level variables (vehicles owned and state FIPS)."""
        acs["household_vehicles_owned"] = household.VEH
        # The same state codes are stored under both variable names.
        acs["state_fips"] = acs["household_state_fips"] = household.ST.astype(
            int
        )
106+
107+
108+
class ACS_2022(ACS):
    """ACS dataset for 2022, generated from the ``CensusACS_2022`` tables."""

    # Source data and period.
    time_period = 2022
    census_acs = CensusACS_2022

    # Dataset identity.
    name = "acs_2022"
    label = "ACS 2022"

    # Local storage location and the release asset it corresponds to.
    file_path = STORAGE_FOLDER / "acs_2022.h5"
    url = "release://PolicyEngine/policyengine-us-data/release/acs_2022.h5"
115+
116+
117+
if __name__ == "__main__":
    # Running this module as a script builds the 2022 ACS file.
    acs_2022 = ACS_2022()
    acs_2022.generate()
Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
from io import BytesIO
2+
import logging
3+
from typing import List
4+
from zipfile import ZipFile
5+
import pandas as pd
6+
from policyengine_core.data import Dataset
7+
import requests
8+
from tqdm import tqdm
9+
from policyengine_us_data.storage import STORAGE_FOLDER
10+
11+
# Set the root logger's level to INFO as a side effect of importing this
# module.
logging.getLogger().setLevel(logging.INFO)

# Person-level PUMS columns to load; passed as ``usecols`` when the person
# CSV files are read, so only these columns are kept.
PERSON_COLUMNS = [
    "SERIALNO",  # Household ID (shared with the household table)
    "SPORDER",  # Person number within household
    "PWGTP",  # Person weight
    "AGEP",  # Age
    "CIT",  # Citizenship
    "MAR",  # Marital status
    "WAGP",  # Wage/salary
    "SSP",  # Social security income
    "SSIP",  # Supplemental security income
    "SEX",  # Sex
    "SEMP",  # Self-employment income
    "SCHL",  # Educational attainment
    "RETP",  # Retirement income
    "PAP",  # Public assistance income
    "OIP",  # Other income
    "PERNP",  # Total earnings
    "PINCP",  # Total income
    "POVPIP",  # Income-to-poverty line percentage
    "RAC1P",  # Race
]
34+
35+
# Household-level PUMS columns to load; passed as ``usecols`` when the
# household CSV files are read, so only these columns are kept.
HOUSEHOLD_COLUMNS = [
    "SERIALNO",  # Household ID (shared with the person table)
    "PUMA",  # PUMA area code
    "ST",  # State code
    "ADJHSG",  # Adjustment factor for housing dollar amounts
    "ADJINC",  # Adjustment factor for income
    "WGTP",  # Household weight
    "NP",  # Number of persons in household
    "BDSP",  # Number of bedrooms
    "ELEP",  # Electricity monthly cost
    # NOTE(review): the PUMS data dictionary appears to define FULP as a
    # yearly (not monthly) cost for other fuels - confirm.
    "FULP",  # Fuel monthly cost
    "GASP",  # Gas monthly cost
    "RMSP",  # Number of rooms
    "RNTP",  # Monthly rent
    "TEN",  # Tenure
    "VEH",  # Number of vehicles
    # NOTE(review): FINCP is presumably *family* income rather than total
    # household income - confirm against the PUMS data dictionary.
    "FINCP",  # Total income
    "GRNTP",  # Gross rent
    "TAXAMT",  # Property taxes
]
55+
56+
57+
class CensusACS(Dataset):
    """Raw Census ACS 1-year PUMS person and household tables.

    ``generate`` downloads the person and household CSV archives for
    ``self.time_period`` and stores them in an HDF file at
    ``self.file_path`` under the keys ``"person"`` and ``"household"``.
    """

    data_format = Dataset.TABLES

    def generate(self) -> None:
        """Download the PUMS archives and store the matched tables.

        Raises:
            requests.HTTPError: if either download returns an error status.
        """
        base_url = (
            "https://www2.census.gov/programs-surveys/acs/data/pums/"
            f"{self.time_period}/1-Year"
        )
        person_url = f"{base_url}/csv_pus.zip"
        household_url = f"{base_url}/csv_hus.zip"

        # Download and parse everything before opening the output store, so
        # a failed download does not leave a truncated file at file_path.
        household = self.process_household_data(
            household_url, "psam_hus", HOUSEHOLD_COLUMNS
        )
        person = self.process_person_data(
            person_url, "psam_pus", PERSON_COLUMNS
        )
        # Keep only people with a household record, then only households
        # that still have at least one person.
        person = person[person.SERIALNO.isin(household.SERIALNO)]
        household = household[household.SERIALNO.isin(person.SERIALNO)]

        with pd.HDFStore(self.file_path, mode="w") as storage:
            storage["household"] = household
            storage["person"] = person

    @staticmethod
    def _read_pums_tables(
        url: str, prefix: str, columns: List[str]
    ) -> pd.DataFrame:
        """Download a PUMS zip archive and return its two CSV parts combined.

        Args:
            url: Address of the zip archive.
            prefix: File-name prefix inside the archive; the members read
                are ``<prefix>a.csv`` and ``<prefix>b.csv``.
            columns: Columns to load from each CSV.

        Returns:
            The rows of both parts with a fresh 0..n-1 index, missing values
            replaced by 0 and upper-cased column names.

        Raises:
            requests.HTTPError: if the server responds with an error status.
        """
        chunk_size = 1024 * 1024
        with BytesIO() as buffer:
            with requests.get(url, stream=True) as response:
                # Fail here with a clear HTTP error rather than later when
                # an error page is handed to ZipFile.
                response.raise_for_status()
                with tqdm(unit="B", unit_scale=True) as pbar:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        if chunk:
                            pbar.update(len(chunk))
                            buffer.write(chunk)
            buffer.seek(0)
            parts = []
            with ZipFile(buffer) as archive:
                for suffix in ("a", "b"):
                    with archive.open(f"{prefix}{suffix}.csv") as member:
                        parts.append(
                            pd.read_csv(
                                member,
                                usecols=columns,
                                dtype={"SERIALNO": str},
                            )
                        )
        # ignore_index avoids repeating the row labels of part "a" in part
        # "b", which would make index-derived identifiers non-unique.
        res = pd.concat(parts, ignore_index=True).fillna(0)
        res.columns = res.columns.str.upper()
        return res

    @staticmethod
    def process_household_data(
        url: str, prefix: str, columns: List[str]
    ) -> pd.DataFrame:
        """Download the household archive and return it as one table.

        See ``_read_pums_tables`` for the arguments; ``ST`` is additionally
        cast to ``int``.
        """
        res = CensusACS._read_pums_tables(url, prefix, columns)

        # Ensure correct data types
        res["ST"] = res["ST"].astype(int)

        return res

    @staticmethod
    def process_person_data(
        url: str, prefix: str, columns: List[str]
    ) -> pd.DataFrame:
        """Download the person archive and return it as one table.

        See ``_read_pums_tables`` for the arguments; ``SPORDER`` is
        additionally cast to ``int``.
        """
        res = CensusACS._read_pums_tables(url, prefix, columns)

        # Ensure correct data types
        res["SPORDER"] = res["SPORDER"].astype(int)

        return res

    @staticmethod
    def create_spm_unit_table(
        storage: pd.HDFStore, person: pd.DataFrame
    ) -> None:
        """Store an SPM-unit table and a person table tagged with ``SPM_ID``.

        Not called by ``generate``. ``person`` must carry the ``SPM_*``
        columns listed below plus ``SERIALNO`` and ``SPORDER``; its
        ``SERIALNO`` and ``SPORDER`` columns are re-typed in place.

        Writes ``storage["person_matched"]`` (the stored person table
        inner-joined to ``person`` on ``SERIALNO`` and ``SPORDER``) and
        ``storage["spm_unit"]`` (first row of each ``SPM_ID`` group).
        """
        SPM_UNIT_COLUMNS = [
            "CAPHOUSESUB",
            "CAPWKCCXPNS",
            "CHILDCAREXPNS",
            "EITC",
            "ENGVAL",
            "EQUIVSCALE",
            "FEDTAX",
            "FEDTAXBC",
            "FICA",
            "GEOADJ",
            "MEDXPNS",
            "NUMADULTS",
            "NUMKIDS",
            "NUMPER",
            "POOR",
            "POVTHRESHOLD",
            "RESOURCES",
            "SCHLUNCH",
            "SNAPSUB",
            "STTAX",
            "TENMORTSTATUS",
            "TOTVAL",
            "WCOHABIT",
            "WICVAL",
            "WKXPNS",
            "WUI_LT15",
            "ID",
        ]
        spm_table = (
            person[["SPM_" + column for column in SPM_UNIT_COLUMNS]]
            .groupby(person.SPM_ID)
            .first()
        )

        original_person_table = storage["person"]
        # NOTE(review): these two dumps write to the current working
        # directory and look like leftover debugging output - confirm
        # whether anything depends on them before removing.
        original_person_table.to_csv("person.csv")
        person.to_csv("spm_person.csv")

        # Ensure SERIALNO is treated as string
        JOIN_COLUMNS = ["SERIALNO", "SPORDER"]
        original_person_table["SERIALNO"] = original_person_table[
            "SERIALNO"
        ].astype(str)
        original_person_table["SPORDER"] = original_person_table[
            "SPORDER"
        ].astype(int)
        person["SERIALNO"] = person["SERIALNO"].astype(str)
        person["SPORDER"] = person["SPORDER"].astype(int)

        # Add SPM_ID from the SPM person table to the original person table.
        combined_person_table = pd.merge(
            original_person_table,
            person[JOIN_COLUMNS + ["SPM_ID"]],
            on=JOIN_COLUMNS,
        )

        storage["person_matched"] = combined_person_table
        storage["spm_unit"] = spm_table
202+
203+
204+
class CensusACS_2022(CensusACS):
    """Raw Census ACS PUMS tables for 2022."""

    time_period = 2022
    # NOTE(review): unlike the "acs_2022" dataset name, this name carries a
    # ".h5" suffix - confirm whether that is intentional.
    name = "census_acs_2022.h5"
    label = "Census ACS (2022)"
    file_path = STORAGE_FOLDER / "census_acs_2022.h5"

0 commit comments

Comments
 (0)