Skip to content

Commit 48e0011

Browse files
Calibrate EITC by number of qualifying dependents and apply dropout (#95)
* Add dropout * Add dropout and EITC calibration * Format * Add install catch * Change download folder * Flip order of US install and bump US * Add EITC targets * Update data links
1 parent 371f77a commit 48e0011

10 files changed

Lines changed: 123 additions & 71 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,6 @@
66
!uprating_factors.csv
77
!uprating_growth_factors.csv
88
!healthcare_spending.csv
9+
!eitc.csv
910
!spm_threshold_agi.csv
1011
**/_build

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ test:
99
pytest
1010

1111
install:
12-
pip install -e ".[dev]"
13-
pip install policyengine-us==1.100.0
12+
pip install policyengine-us==1.109.0
13+
pip install -e ".[dev]" --config-settings editable_mode=compat
1414

1515
changelog:
1616
build-changelog changelog.yaml --output changelog.yaml --update-last-date --start-from 1.0.0 --append-file changelog_entry.yaml

changelog_entry.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
- bump: minor
2+
changes:
3+
added:
4+
- EITC calibration by child counts.
5+
- 10% dropout during weight calibration.

docs/validation.ipynb

Lines changed: 55 additions & 55 deletions
Large diffs are not rendered by default.

policyengine_us_data/datasets/acs/acs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ class ACS_2022(ACS):
111111
time_period = 2022
112112
file_path = STORAGE_FOLDER / "acs_2022.h5"
113113
census_acs = CensusACS_2022
114-
url = "release://PolicyEngine/policyengine-us-data/1.8.0/acs_2022.h5"
114+
url = "release://PolicyEngine/policyengine-us-data/1.9.0/acs_2022.h5"
115115

116116

117117
if __name__ == "__main__":

policyengine_us_data/datasets/cps/cps.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -622,7 +622,7 @@ class CPS_2024(CPS):
622622
label = "CPS 2024 (2022-based)"
623623
file_path = STORAGE_FOLDER / "cps_2024.h5"
624624
time_period = 2024
625-
url = "release://policyengine/policyengine-us-data/1.8.0/cps_2024.h5"
625+
url = "release://policyengine/policyengine-us-data/1.9.0/cps_2024.h5"
626626

627627

628628
class PooledCPS(Dataset):
@@ -681,7 +681,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS):
681681
CPS_2023,
682682
]
683683
time_period = 2023
684-
url = "release://PolicyEngine/policyengine-us-data/1.8.0/pooled_3_year_cps_2023.h5"
684+
url = "release://PolicyEngine/policyengine-us-data/1.9.0/pooled_3_year_cps_2023.h5"
685685

686686

687687
if __name__ == "__main__":

policyengine_us_data/datasets/cps/enhanced_cps.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def reweight(
2525
original_weights,
2626
loss_matrix,
2727
targets_array,
28+
dropout_rate=0.1,
2829
):
2930
target_names = np.array(loss_matrix.columns)
3031
loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
@@ -50,15 +51,26 @@ def loss(weights):
5051
raise ValueError("Relative error contains NaNs")
5152
return rel_error.mean()
5253

54+
def dropout_weights(weights, p):
55+
if p == 0:
56+
return weights
57+
# Replace each weight, independently with probability p (a fraction, e.g. 0.1), with the mean of the weights that were not selected
58+
mask = torch.rand_like(weights) < p
59+
mean = weights[~mask].mean()
60+
masked_weights = weights.clone()
61+
masked_weights[mask] = mean
62+
return masked_weights
63+
5364
optimizer = torch.optim.Adam([weights], lr=1e-1)
5465
from tqdm import trange
5566

5667
start_loss = None
5768

58-
iterator = trange(1_000)
69+
iterator = trange(5_000)
5970
for i in iterator:
6071
optimizer.zero_grad()
61-
l = loss(torch.exp(weights))
72+
weights_ = dropout_weights(weights, dropout_rate)
73+
l = loss(torch.exp(weights_))
6274
if start_loss is None:
6375
start_loss = l.item()
6476
loss_rel_change = (l.item() - start_loss) / start_loss
@@ -177,7 +189,7 @@ class EnhancedCPS_2024(EnhancedCPS):
177189
name = "enhanced_cps_2024"
178190
label = "Enhanced CPS 2024"
179191
file_path = STORAGE_FOLDER / "enhanced_cps_2024.h5"
180-
url = "release://policyengine/policyengine-us-data/1.8.0/enhanced_cps_2024.h5"
192+
url = "release://policyengine/policyengine-us-data/1.9.0/enhanced_cps_2024.h5"
181193

182194

183195
if __name__ == "__main__":
Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,18 @@
11
from policyengine_us_data.utils.github import download
22
from pathlib import Path
3-
4-
FOLDER = Path(__file__).parent
3+
from policyengine_us_data.storage import STORAGE_FOLDER
54

65
download(
76
"PolicyEngine",
87
"policyengine-us-data",
98
"release",
109
"soi.csv",
11-
FOLDER / "soi.csv",
10+
STORAGE_FOLDER / "soi.csv",
1211
)
1312
download(
1413
"PolicyEngine",
1514
"policyengine-us-data",
1615
"release",
1716
"np2023_d5_mid.csv",
18-
FOLDER / "np2023_d5_mid.csv",
17+
STORAGE_FOLDER / "np2023_d5_mid.csv",
1918
)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
count_children,eitc_returns,eitc_total,year
2+
0,15108515,12427886000,2021
3+
1,8500483,19879365000,2021
4+
2,5542949,20472827000,2021

policyengine_us_data/utils/loss.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -188,11 +188,42 @@ def build_loss_matrix(dataset: type, time_period):
188188
loss_matrix["treasury/eitc"] = sim.calculate(
189189
"eitc", map_to="household"
190190
).values
191-
targets_array.append(
192-
sim.tax_benefit_system.parameters(
193-
time_period
194-
).calibration.gov.treasury.tax_expenditures.eitc
191+
eitc_spending = (
192+
sim.tax_benefit_system.parameters.calibration.gov.treasury.tax_expenditures.eitc
195193
)
194+
targets_array.append(eitc_spending(time_period))
195+
196+
# IRS EITC filers and totals by child counts
197+
eitc_stats = pd.read_csv(STORAGE_FOLDER / "eitc.csv")
198+
199+
eitc_spending_uprating = eitc_spending(time_period) / eitc_spending(2021)
200+
population = (
201+
sim.tax_benefit_system.parameters.calibration.gov.census.populations.total
202+
)
203+
population_uprating = population(time_period) / population(2021)
204+
205+
for _, row in eitc_stats.iterrows():
206+
returns_label = (
207+
f"irs/eitc/returns/count_children_{row['count_children']}"
208+
)
209+
eitc_eligible_children = sim.calculate("eitc_child_count").values
210+
eitc = sim.calculate("eitc").values
211+
loss_matrix[returns_label] = sim.map_result(
212+
(eitc > 0) * (eitc_eligible_children == row["count_children"]),
213+
"tax_unit",
214+
"household",
215+
)
216+
targets_array.append(row["eitc_returns"] * population_uprating)
217+
218+
spending_label = (
219+
f"irs/eitc/spending/count_children_{row['count_children']}"
220+
)
221+
loss_matrix[spending_label] = sim.map_result(
222+
eitc * (eitc_eligible_children == row["count_children"]),
223+
"tax_unit",
224+
"household",
225+
)
226+
targets_array.append(row["eitc_total"] * eitc_spending_uprating)
196227

197228
# CPS-derived statistics
198229
# Medical expenses, sum of spm thresholds

0 commit comments

Comments
 (0)