Skip to content

Commit 0a6a4e2

Browse files
Add State and household size as rent predictors (#82)
* Add household size and state to predictors
* Adjust reweighting algorithm
* Adjust learning rate
* Remove <10k AGI taxable targets and update documentation
* Update dataset locations
1 parent b9ca068 commit 0a6a4e2

5 files changed

Lines changed: 110 additions & 87 deletions

File tree

changelog_entry.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
- bump: minor
2+
changes:
3+
added:
4+
- State and household size as predictors for rent and property taxes.

docs/validation.ipynb

Lines changed: 72 additions & 72 deletions
Large diffs are not rendered by default.

policyengine_us_data/datasets/cps/cps.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame):
8181
"self_employment_income",
8282
"social_security",
8383
"pension_income",
84+
"state_code_str",
85+
"household_size",
8486
]
8587
IMPUTATIONS = ["rent", "real_estate_taxes"]
8688
train_df = acs.calculate_dataframe(PREDICTORS + IMPUTATIONS)
@@ -593,7 +595,7 @@ class CPS_2021(CPS):
593595
label = "CPS 2021"
594596
raw_cps = CensusCPS_2021
595597
previous_year_raw_cps = CensusCPS_2020
596-
file_path = STORAGE_FOLDER / "cps_2021.h5"
598+
file_path = STORAGE_FOLDER / "cps_2021_v1_6_1.h5"
597599
time_period = 2021
598600

599601

@@ -602,7 +604,7 @@ class CPS_2022(CPS):
602604
label = "CPS 2022"
603605
raw_cps = CensusCPS_2022
604606
previous_year_raw_cps = CensusCPS_2021
605-
file_path = STORAGE_FOLDER / "cps_2022.h5"
607+
file_path = STORAGE_FOLDER / "cps_2022_v1_6_1.h5"
606608
time_period = 2022
607609

608610

@@ -618,9 +620,9 @@ class CPS_2023(CPS):
618620
class CPS_2024(CPS):
619621
name = "cps_2024"
620622
label = "CPS 2024 (2022-based)"
621-
file_path = STORAGE_FOLDER / "cps_2024.h5"
623+
file_path = STORAGE_FOLDER / "cps_2024_v1_6_1.h5"
622624
time_period = 2024
623-
url = "release://policyengine/policyengine-us-data/release/cps_2024.h5"
625+
url = "release://policyengine/policyengine-us-data/release/cps_2024_v1_6_1.h5"
624626

625627

626628
class PooledCPS(Dataset):
@@ -672,14 +674,14 @@ def generate(self):
672674
class Pooled_3_Year_CPS_2023(PooledCPS):
673675
label = "CPS 2023 (3-year pooled)"
674676
name = "pooled_3_year_cps_2023"
675-
file_path = STORAGE_FOLDER / "pooled_3_year_cps_2023.h5"
677+
file_path = STORAGE_FOLDER / "pooled_3_year_cps_2023_v1_6_1.h5"
676678
input_datasets = [
677679
CPS_2021,
678680
CPS_2022,
679681
CPS_2023,
680682
]
681683
time_period = 2023
682-
url = "release://PolicyEngine/policyengine-us-data/release/pooled_3_year_cps_2023.h5"
684+
url = "release://PolicyEngine/policyengine-us-data/release/pooled_3_year_cps_2023_v1_6_1.h5"
683685

684686

685687
if __name__ == "__main__":

policyengine_us_data/datasets/cps/enhanced_cps.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -48,20 +48,23 @@ def loss(weights):
4848
) ** 2
4949
if torch.isnan(rel_error).any():
5050
raise ValueError("Relative error contains NaNs")
51-
worst_name = target_names[torch.argmax(rel_error)]
52-
worst_val = rel_error[torch.argmax(rel_error)].item()
53-
return rel_error.mean(), worst_name, worst_val
51+
return rel_error.mean()
5452

55-
optimizer = torch.optim.Adam([weights], lr=1e-2)
53+
optimizer = torch.optim.Adam([weights], lr=1e-1)
5654
from tqdm import trange
5755

58-
iterator = trange(10_000)
56+
start_loss = None
57+
58+
iterator = trange(1_000)
5959
for i in iterator:
6060
optimizer.zero_grad()
61-
l, worst_name, worst_val = loss(torch.exp(weights))
61+
l = loss(torch.exp(weights))
62+
if start_loss is None:
63+
start_loss = l.item()
64+
loss_rel_change = (l.item() - start_loss) / start_loss
6265
l.backward()
6366
iterator.set_postfix(
64-
{"loss": l.item(), "worst": worst_name, "val": worst_val}
67+
{"loss": l.item(), "loss_rel_change": loss_rel_change}
6568
)
6669
optimizer.step()
6770

@@ -173,8 +176,8 @@ class EnhancedCPS_2024(EnhancedCPS):
173176
end_year = 2024
174177
name = "enhanced_cps_2024"
175178
label = "Enhanced CPS 2024"
176-
file_path = STORAGE_FOLDER / "enhanced_cps_2024.h5"
177-
url = "release://policyengine/policyengine-us-data/release/enhanced_cps_2024.h5"
179+
file_path = STORAGE_FOLDER / "enhanced_cps_2024_v1_6_1.h5"
180+
url = "release://policyengine/policyengine-us-data/release/enhanced_cps_2024_v1_6_1.h5"
178181

179182

180183
if __name__ == "__main__":

policyengine_us_data/utils/loss.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ def build_loss_matrix(dataset: type, time_period):
7171
if not row["Taxable only"]:
7272
continue # exclude non "taxable returns" statistics
7373

74+
if row["AGI upper bound"] <= 10_000:
75+
continue
76+
7477
mask = (
7578
(agi >= row["AGI lower bound"])
7679
* (agi < row["AGI upper bound"])
@@ -180,6 +183,17 @@ def build_loss_matrix(dataset: type, time_period):
180183
).calibration.gov.cbo._children[variable_name]
181184
)
182185

186+
# Treasury EITC
187+
188+
loss_matrix["treasury/eitc"] = sim.calculate(
189+
"eitc", map_to="household"
190+
).values
191+
targets_array.append(
192+
sim.tax_benefit_system.parameters(
193+
time_period
194+
).calibration.gov.treasury.tax_expenditures.eitc
195+
)
196+
183197
# CPS-derived statistics
184198
# Medical expenses, sum of spm thresholds
185199
# Child support expenses

0 commit comments

Comments
 (0)