Calibrate EITC by number of qualifying dependents and apply dropout (#95)

nikhilwoodruff · web-flow · commit 48e0011ba828 · 2024-10-07T12:45:38.000+01:00
* Add dropout

* Add dropout and EITC calibration

* Format

* Add install catch

* Change download folder

* Flip order of US install and bump US

* Add EITC targets

* Update data links
diff --git a/.gitignore b/.gitignore
@@ -6,5 +6,6 @@
 !uprating_factors.csv
 !uprating_growth_factors.csv
 !healthcare_spending.csv
+!eitc.csv
 !spm_threshold_agi.csv
 **/_build
diff --git a/Makefile b/Makefile
@@ -9,8 +9,8 @@ test:
 	pytest
 
 install:
-	pip install -e ".[dev]"
-	pip install policyengine-us==1.100.0
+	pip install policyengine-us==1.109.0
+	pip install -e ".[dev]"  --config-settings editable_mode=compat
 
 changelog:
 	build-changelog changelog.yaml --output changelog.yaml --update-last-date --start-from 1.0.0 --append-file changelog_entry.yaml
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,5 @@
+- bump: minor
+  changes:
+    added:
+    - EITC calibration by child counts.
+    - 10% dropout during weight calibration.
diff --git a/docs/validation.ipynb b/docs/validation.ipynb
diff --git a/policyengine_us_data/datasets/acs/acs.py b/policyengine_us_data/datasets/acs/acs.py
@@ -111,7 +111,7 @@ class ACS_2022(ACS):
     time_period = 2022
     file_path = STORAGE_FOLDER / "acs_2022.h5"
     census_acs = CensusACS_2022
-    url = "release://PolicyEngine/policyengine-us-data/1.8.0/acs_2022.h5"
+    url = "release://PolicyEngine/policyengine-us-data/1.9.0/acs_2022.h5"
 
 
 if __name__ == "__main__":
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -622,7 +622,7 @@ class CPS_2024(CPS):
     label = "CPS 2024 (2022-based)"
     file_path = STORAGE_FOLDER / "cps_2024.h5"
     time_period = 2024
-    url = "release://policyengine/policyengine-us-data/1.8.0/cps_2024.h5"
+    url = "release://policyengine/policyengine-us-data/1.9.0/cps_2024.h5"
 
 
 class PooledCPS(Dataset):
@@ -681,7 +681,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS):
         CPS_2023,
     ]
     time_period = 2023
-    url = "release://PolicyEngine/policyengine-us-data/1.8.0/pooled_3_year_cps_2023.h5"
+    url = "release://PolicyEngine/policyengine-us-data/1.9.0/pooled_3_year_cps_2023.h5"
 
 
 if __name__ == "__main__":
diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -25,6 +25,7 @@ def reweight(
     original_weights,
     loss_matrix,
     targets_array,
+    dropout_rate=0.1,
 ):
     target_names = np.array(loss_matrix.columns)
     loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
@@ -50,15 +51,26 @@ def loss(weights):
             raise ValueError("Relative error contains NaNs")
         return rel_error.mean()
 
+    def dropout_weights(weights, p):
+        if p == 0:
+            return weights
+        # Replace a random fraction p (0-1) of the pre-exp weights with the mean of the rest
+        mask = torch.rand_like(weights) < p
+        mean = weights[~mask].mean()
+        masked_weights = weights.clone()
+        masked_weights[mask] = mean
+        return masked_weights
+
     optimizer = torch.optim.Adam([weights], lr=1e-1)
     from tqdm import trange
 
     start_loss = None
 
-    iterator = trange(1_000)
+    iterator = trange(5_000)
     for i in iterator:
         optimizer.zero_grad()
-        l = loss(torch.exp(weights))
+        weights_ = dropout_weights(weights, dropout_rate)
+        l = loss(torch.exp(weights_))
         if start_loss is None:
             start_loss = l.item()
         loss_rel_change = (l.item() - start_loss) / start_loss
@@ -177,7 +189,7 @@ class EnhancedCPS_2024(EnhancedCPS):
     name = "enhanced_cps_2024"
     label = "Enhanced CPS 2024"
     file_path = STORAGE_FOLDER / "enhanced_cps_2024.h5"
-    url = "release://policyengine/policyengine-us-data/1.8.0/enhanced_cps_2024.h5"
+    url = "release://policyengine/policyengine-us-data/1.9.0/enhanced_cps_2024.h5"
 
 
 if __name__ == "__main__":
diff --git a/policyengine_us_data/storage/download_public_prerequisites.py b/policyengine_us_data/storage/download_public_prerequisites.py
@@ -1,19 +1,18 @@
 from policyengine_us_data.utils.github import download
 from pathlib import Path
-
-FOLDER = Path(__file__).parent
+from policyengine_us_data.storage import STORAGE_FOLDER
 
 download(
     "PolicyEngine",
     "policyengine-us-data",
     "release",
     "soi.csv",
-    FOLDER / "soi.csv",
+    STORAGE_FOLDER / "soi.csv",
 )
 download(
     "PolicyEngine",
     "policyengine-us-data",
     "release",
     "np2023_d5_mid.csv",
-    FOLDER / "np2023_d5_mid.csv",
+    STORAGE_FOLDER / "np2023_d5_mid.csv",
 )
diff --git a/policyengine_us_data/storage/eitc.csv b/policyengine_us_data/storage/eitc.csv
@@ -0,0 +1,4 @@
+count_children,eitc_returns,eitc_total,year
+0,15108515,12427886000,2021
+1,8500483,19879365000,2021
+2,5542949,20472827000,2021
diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
@@ -188,11 +188,42 @@ def build_loss_matrix(dataset: type, time_period):
     loss_matrix["treasury/eitc"] = sim.calculate(
         "eitc", map_to="household"
     ).values
-    targets_array.append(
-        sim.tax_benefit_system.parameters(
-            time_period
-        ).calibration.gov.treasury.tax_expenditures.eitc
+    eitc_spending = (
+        sim.tax_benefit_system.parameters.calibration.gov.treasury.tax_expenditures.eitc
     )
+    targets_array.append(eitc_spending(time_period))
+
+    # IRS EITC return counts and dollar totals by number of EITC children
+    eitc_stats = pd.read_csv(STORAGE_FOLDER / "eitc.csv")
+
+    eitc_spending_uprating = eitc_spending(time_period) / eitc_spending(2021)
+    population = (
+        sim.tax_benefit_system.parameters.calibration.gov.census.populations.total
+    )
+    population_uprating = population(time_period) / population(2021)
+
+    for _, row in eitc_stats.iterrows():
+        returns_label = (
+            f"irs/eitc/returns/count_children_{row['count_children']}"
+        )
+        eitc_eligible_children = sim.calculate("eitc_child_count").values
+        eitc = sim.calculate("eitc").values
+        loss_matrix[returns_label] = sim.map_result(
+            (eitc > 0) * (eitc_eligible_children == row["count_children"]),
+            "tax_unit",
+            "household",
+        )
+        targets_array.append(row["eitc_returns"] * population_uprating)
+
+        spending_label = (
+            f"irs/eitc/spending/count_children_{row['count_children']}"
+        )
+        loss_matrix[spending_label] = sim.map_result(
+            eitc * (eitc_eligible_children == row["count_children"]),
+            "tax_unit",
+            "household",
+        )
+        targets_array.append(row["eitc_total"] * eitc_spending_uprating)
 
     # CPS-derived statistics
     # Medical expenses, sum of spm thresholds