From de069f7ff8a5c15bdaf6553a422d7a871062d231 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 14:01:19 +0530 Subject: [PATCH 01/33] feat(aggregation): implement stateless stage-based orchestration - Reorganized aggregation logic into a cohesive 'aggregation' package. - Implemented AggregationOrchestrator supporting stage-based parallel execution and wildcards. - Implemented validator utility (CLI & programmatic) validating against schema.json. - Added new stateless /aggregation/initiate and /poll FastAPI endpoints. - Retained legacy /run and /status wrappers for backward compatibility. - Updated spanner-ingestion-workflow.yaml to use the state-passing loop. - Added comprehensive unit and integration test suites (25 tests total, 100% passing). --- .../ingestion-helper/aggregation.yaml | 26 ++ .../ingestion-helper/aggregation/__init__.py | 6 +- .../aggregation/orchestrator.py | 191 ++++++++++++ .../aggregation/orchestrator_test.py | 177 +++++++++++ .../ingestion-helper/aggregation/schema.json | 93 ++++++ .../ingestion-helper/aggregation/validator.py | 132 ++++++++ .../aggregation/validator_test.py | 289 ++++++++++++++++++ .../workflow/ingestion-helper/app_test.py | 72 +++++ .../workflow/ingestion-helper/pyproject.toml | 2 + .../ingestion-helper/routes/aggregation.py | 215 ++++++++++--- .../ingestion-helper/utils/aggregation.py | 98 ------ .../utils/aggregation_test.py | 87 ------ pipeline/workflow/ingestion-helper/uv.lock | 164 ++++++++++ .../workflow/spanner-ingestion-workflow.yaml | 48 +-- 14 files changed, 1356 insertions(+), 244 deletions(-) create mode 100644 pipeline/workflow/ingestion-helper/aggregation.yaml create mode 100644 pipeline/workflow/ingestion-helper/aggregation/orchestrator.py create mode 100644 pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py create mode 100644 pipeline/workflow/ingestion-helper/aggregation/schema.json create mode 100644 pipeline/workflow/ingestion-helper/aggregation/validator.py create mode 
100644 pipeline/workflow/ingestion-helper/aggregation/validator_test.py delete mode 100644 pipeline/workflow/ingestion-helper/utils/aggregation.py delete mode 100644 pipeline/workflow/ingestion-helper/utils/aggregation_test.py diff --git a/pipeline/workflow/ingestion-helper/aggregation.yaml b/pipeline/workflow/ingestion-helper/aggregation.yaml new file mode 100644 index 000000000..b6eb01acb --- /dev/null +++ b/pipeline/workflow/ingestion-helper/aggregation.yaml @@ -0,0 +1,26 @@ +# ============================================================================= +# Data Commons Aggregation Configuration +# ============================================================================= +# By default, this runs the standard global post-processing steps in parallel. +# You can customize this file to add place-based rollups, statistical variable +# aggregations, or to disable specific steps. + +aggregations: + + # --------------------------------------------------------------------------- + # Stage 1: Standard Global Steps (Run in parallel by default) + # --------------------------------------------------------------------------- + # Generates linkedContainedInPlace, linkedMemberOf, etc. 
+ - type: linked_edges + imports: ["*"] + stage: 1 + + # Generates summary statistics in the Cache table + - type: provenance_summary + imports: ["*"] + stage: 1 + + # Generates the Statistical Variable hierarchy/verticals + - type: stat_var_groups + imports: ["*"] + stage: 1 diff --git a/pipeline/workflow/ingestion-helper/aggregation/__init__.py b/pipeline/workflow/ingestion-helper/aggregation/__init__.py index 0b76748ab..5fedbd914 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/__init__.py +++ b/pipeline/workflow/ingestion-helper/aggregation/__init__.py @@ -23,6 +23,8 @@ from .stat_var_aggregator import StatVarAggregator from .place_aggregation_generator import PlaceAggregationGenerator from .stat_var_group_generator import StatVarGroupGenerator +from .orchestrator import AggregationOrchestrator +from .validator import validate_config __all__ = [ 'BigQueryExecutor', @@ -30,5 +32,7 @@ 'ProvenanceSummaryGenerator', 'StatVarAggregator', 'PlaceAggregationGenerator', - 'StatVarGroupGenerator' + 'StatVarGroupGenerator', + 'AggregationOrchestrator', + 'validate_config' ] diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py new file mode 100644 index 000000000..db823a8b4 --- /dev/null +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py @@ -0,0 +1,191 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# NOTE(review): this raises the root logger to INFO as an import-time side
# effect; kept unchanged so existing log output does not change.
logging.getLogger().setLevel(logging.INFO)


class AggregationOrchestrator:
    """Orchestrates the overall aggregation workflow.

    The aggregation configuration is loaded and schema-validated once in
    ``__init__``. ``execute_stage`` then submits every enabled step of one
    stage and returns the submitted BigQuery job IDs, ``has_stage`` reports
    whether a stage would submit anything, and ``check_jobs_status`` polls
    previously returned job IDs through the executor.
    """

    # Step types whose generator exposes a single ``run_all(import_names)``
    # entry point, mapped to the attribute that holds the generator.
    _RUN_ALL_GENERATORS = {
        "linked_edges": "linked_edge_generator",
        "provenance_summary": "provenance_summary_generator",
        "stat_var_groups": "stat_var_group_generator",
    }

    def __init__(self,
                 connection_id: str,
                 project_id: str,
                 instance_id: str,
                 database_id: str,
                 location: Optional[str] = None,
                 is_base_dc: bool = True,
                 config_file_path: Optional[str] = None) -> None:
        """Initializes the orchestrator and loads/validates the configuration.

        Args:
            connection_id: BigQuery connection ID to Spanner.
            project_id: GCP Project ID.
            instance_id: Spanner Instance ID.
            database_id: Spanner Database ID.
            location: BigQuery location.
            is_base_dc: Whether this is running in the base Data Commons
                environment.
            config_file_path: Optional custom path to the aggregation.yaml
                file. If not specified, defaults to the aggregation.yaml in
                the parent directory of this package.

        Raises:
            Whatever ``validate_config`` raises for a missing, malformed or
            schema-invalid configuration.
        """
        # Always run asynchronously at the executor level for stages to run
        # in parallel. Sequential blocking between stages is handled at the
        # workflow/router level.
        self.executor = BigQueryExecutor(connection_id=connection_id,
                                         project_id=project_id,
                                         instance_id=instance_id,
                                         database_id=database_id,
                                         location=location,
                                         run_sequential=False)

        # Initialize all generators
        self.place_generator = PlaceAggregationGenerator(self.executor, is_base_dc)
        self.stat_var_aggregator = StatVarAggregator(self.executor, is_base_dc)
        self.linked_edge_generator = LinkedEdgeGenerator(self.executor, is_base_dc)
        self.provenance_summary_generator = ProvenanceSummaryGenerator(self.executor, is_base_dc)
        self.stat_var_group_generator = StatVarGroupGenerator(self.executor, is_base_dc)

        # Resolve paths for default config and schema
        curr_dir = os.path.dirname(os.path.abspath(__file__))
        if not config_file_path:
            config_file_path = os.path.join(curr_dir, "..", "aggregation.yaml")
        schema_file_path = os.path.join(curr_dir, "schema.json")

        # Load and validate configuration
        self.aggregations = validate_config(config_file_path, schema_file_path)

    def execute_stage(self, stage_num: int, active_imports: List[str]) -> List[str]:
        """Executes all enabled aggregations in the specified stage.

        Args:
            stage_num: The stage number to execute.
            active_imports: The list of active import names in this run.

        Returns:
            A list of BigQuery job IDs submitted for this stage. Steps that
            are disabled, belong to another stage, match no active import, or
            have no generator wired up contribute no job IDs.
        """
        logging.info("Executing Aggregation Stage %s for active imports: %s",
                     stage_num, active_imports)
        jobs: List[str] = []

        for config in self._enabled_steps(stage_num):
            step_type = config["type"]
            applicable_imports = self._get_applicable_imports(config, active_imports)
            if not applicable_imports:
                logging.info("Skipping step '%s' in Stage %s: no matching active imports.",
                             step_type, stage_num)
                continue

            logging.info("Submitting step '%s' in Stage %s for imports: %s",
                         step_type, stage_num, applicable_imports)

            # Generators may hand back None or jobs without an ID; only real
            # job IDs are reported to the caller for polling.
            for job in self._submit_step(config, applicable_imports):
                if job and job.job_id:
                    jobs.append(job.job_id)

        logging.info("Submitted %s jobs in Stage %s: %s", len(jobs), stage_num, jobs)
        return jobs

    def has_stage(self, stage_num: int, active_imports: List[str]) -> bool:
        """Checks if any enabled aggregation in the stage applies to the run.

        Args:
            stage_num: The stage number to check.
            active_imports: The list of active import names.

        Returns:
            True if at least one enabled step of ``stage_num`` matches an
            active import, False otherwise.
        """
        return any(
            self._get_applicable_imports(config, active_imports)
            for config in self._enabled_steps(stage_num))

    def check_jobs_status(self, job_ids: List[str]) -> Dict[str, Any]:
        """Checks the status of the specified BigQuery job IDs.

        Delegates to the BigQueryExecutor's get_jobs_status. Any failure is
        logged and re-raised unchanged.
        """
        try:
            return self.executor.get_jobs_status(job_ids)
        except Exception as e:
            logging.error("Failed to check jobs status: %s", e)
            raise

    def _enabled_steps(self, stage_num: int):
        """Yields the configured steps of ``stage_num`` that are not disabled."""
        for config in self.aggregations:
            if config.get("disabled", False):
                continue
            # A step without an explicit stage belongs to stage 1.
            if config.get("stage", 1) != stage_num:
                continue
            yield config

    def _submit_step(self, config: Dict[str, Any], applicable_imports: List[str]) -> List[Any]:
        """Routes one step to its generator and returns the jobs it submitted."""
        step_type = config["type"]

        if step_type == "place":
            job = self.place_generator.aggregate_places(
                import_names=applicable_imports,
                source_type=config["source_type"],
                destination_type=config["destination_type"],
                allow_multiple_to_places=config.get("allow_multiple_to_places", False))
            return [job] if job else []

        if step_type == "stat_var":
            return list(self.stat_var_aggregator.aggregate_stat_vars(
                ancestor_sv=config["ancestor_sv_id"],
                source_svs=config["source_sv_ids"],
                import_names=applicable_imports,
                output_import_name=config.get("output_import_name"),
                skip_all_sources_present_check=config.get("skip_all_sources_present_check", False)))

        generator_attr = self._RUN_ALL_GENERATORS.get(step_type)
        if generator_attr is not None:
            return list(getattr(self, generator_attr).run_all(applicable_imports))

        # Reached for a type that passes validation but has no branch above.
        # Say so instead of silently submitting nothing.
        logging.warning("No generator is wired up for step type '%s'; skipping.", step_type)
        return []

    def _get_applicable_imports(self, config: Dict[str, Any], active_imports: List[str]) -> List[str]:
        """Determines which active imports apply to this aggregation config.

        Returns a new list in the order of ``active_imports``. A "*" entry in
        the config selects every active import; otherwise only the names
        present in both lists are returned, without duplicates.
        """
        configured_imports = config["imports"]

        # Explicit wildcard check. Return a copy so callers and generators
        # never share (and cannot mutate) the caller's list.
        if "*" in configured_imports:
            return list(active_imports)

        # Ordered, de-duplicated intersection: a plain set intersection would
        # make the order of import names vary from run to run.
        wanted = set(configured_imports)
        return [name for name in dict.fromkeys(active_imports) if name in wanted]
+# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import sys +import unittest +from unittest.mock import MagicMock, patch, mock_open + +sys.path.append(os.path.dirname(os.path.dirname(__file__))) + +from aggregation import AggregationOrchestrator + + +# Sample valid YAML config for testing +VALID_CONFIG_YAML = """ +aggregations: + - type: linked_edges + imports: ["*"] + stage: 1 + + - type: place + source_type: County + destination_type: State + allow_multiple_to_places: false + imports: ["USFed_Census"] + stage: 1 + + - type: place + source_type: State + destination_type: Country + imports: ["*"] + stage: 2 + disabled: true + + - type: stat_var + ancestor_sv_id: Count_Person + source_sv_ids: ["Count_Person_Male", "Count_Person_Female"] + skip_all_sources_present_check: true + imports: ["USFed_Census"] + stage: 2 +""" + + + + +@patch('aggregation.orchestrator.BigQueryExecutor') +@patch('aggregation.orchestrator.PlaceAggregationGenerator') +@patch('aggregation.orchestrator.StatVarAggregator') +@patch('aggregation.orchestrator.LinkedEdgeGenerator') +@patch('aggregation.orchestrator.ProvenanceSummaryGenerator') +@patch('aggregation.orchestrator.StatVarGroupGenerator') +class TestAggregationOrchestrator(unittest.TestCase): + + def setUp(self): + # Load the actual schema for validation tests (now inside aggregation/) + schema_path = os.path.join(os.path.dirname(__file__), "schema.json") + with open(schema_path, "r") as f: + self.schema_json = json.load(f) + + def _get_mock_open(self, yaml_content): + """Helper to mock open() calls for both the config YAML and the schema JSON.""" + def side_effect(path, *args, **kwargs): + if "schema.json" in path: + return mock_open(read_data=json.dumps(self.schema_json))().__enter__() + elif "aggregation.yaml" in path: + return mock_open(read_data=yaml_content)().__enter__() + raise FileNotFoundError(f"Mock open not configured for: {path}") + return 
side_effect + + + + @patch('builtins.open') + def test_has_stage(self, mock_file_open, *mocks): + """Tests the has_stage method for active, disabled, and non-matching stages.""" + mock_file_open.side_effect = self._get_mock_open(VALID_CONFIG_YAML) + utils = AggregationOrchestrator(connection_id="conn", project_id="proj", instance_id="inst", database_id="db") + + # Stage 1: Has linked_edges (wildcard) and place (USFed_Census) + self.assertTrue(utils.has_stage(1, ["AnyImport"])) + self.assertTrue(utils.has_stage(1, ["USFed_Census"])) + + # Stage 2: Has place (disabled: true) and stat_var (USFed_Census) + # If active import is "OtherImport", Stage 2 has no active aggregations (stat_var doesn't match, place is disabled) + self.assertFalse(utils.has_stage(2, ["OtherImport"])) + # If active import is "USFed_Census", Stage 2 has stat_var active + self.assertTrue(utils.has_stage(2, ["USFed_Census"])) + + # Stage 3: Does not exist in config + self.assertFalse(utils.has_stage(3, ["USFed_Census"])) + + @patch('builtins.open') + def test_execute_stage_1(self, mock_file_open, + mock_svg_gen, mock_prov_gen, mock_edge_gen, + mock_sv_agg, mock_place_gen, mock_executor): + """Tests executing Stage 1, verifying parallel job submission and wildcard resolution.""" + mock_file_open.side_effect = self._get_mock_open(VALID_CONFIG_YAML) + + # Setup generator mocks to return mock jobs + mock_job1 = MagicMock() + mock_job1.job_id = "job-edge-1" + mock_edge_gen.return_value.run_all.return_value = [mock_job1] + + mock_job2 = MagicMock() + mock_job2.job_id = "job-place-1" + mock_place_gen.return_value.aggregate_places.return_value = mock_job2 + + utils = AggregationOrchestrator(connection_id="conn", project_id="proj", instance_id="inst", database_id="db") + + # Execute Stage 1 for active imports: ["USFed_Census"] + job_ids = utils.execute_stage(stage_num=1, active_imports=["USFed_Census"]) + + # Assertions + self.assertEqual(len(job_ids), 2) + self.assertIn("job-edge-1", job_ids) + 
self.assertIn("job-place-1", job_ids) + + # Verify linked_edges ran for ALL active imports (wildcard '*') + mock_edge_gen.return_value.run_all.assert_called_once_with(["USFed_Census"]) + + # Verify place rollup ran for matching import "USFed_Census" + mock_place_gen.return_value.aggregate_places.assert_called_once_with( + import_names=["USFed_Census"], + source_type="County", + destination_type="State", + allow_multiple_to_places=False + ) + + @patch('builtins.open') + def test_execute_stage_2_with_disabled_and_filtering(self, mock_file_open, + mock_svg_gen, mock_prov_gen, mock_edge_gen, + mock_sv_agg, mock_place_gen, mock_executor): + """Tests Stage 2, verifying that disabled steps are skipped and non-matching imports are filtered.""" + mock_file_open.side_effect = self._get_mock_open(VALID_CONFIG_YAML) + + # Setup mock for Stage 2 stat_var job + mock_job_sv = MagicMock() + mock_job_sv.job_id = "job-sv-1" + mock_sv_agg.return_value.aggregate_stat_vars.return_value = [mock_job_sv] + + utils = AggregationOrchestrator(connection_id="conn", project_id="proj", instance_id="inst", database_id="db") + + # 1. Run for an import that does NOT match stat_var ("OtherImport") + # Since the place rollup in Stage 2 is disabled, no jobs should run at all. + job_ids = utils.execute_stage(stage_num=2, active_imports=["OtherImport"]) + self.assertEqual(len(job_ids), 0) + mock_place_gen.return_value.aggregate_places.assert_not_called() + mock_sv_agg.return_value.aggregate_stat_vars.assert_not_called() + + # 2. Run for matching import "USFed_Census" + # The disabled place rollup should still be skipped, but the stat_var aggregation should execute. 
+ job_ids = utils.execute_stage(stage_num=2, active_imports=["USFed_Census"]) + + self.assertEqual(job_ids, ["job-sv-1"]) + mock_place_gen.return_value.aggregate_places.assert_not_called() # Still skipped because disabled: true + mock_sv_agg.return_value.aggregate_stat_vars.assert_called_once_with( + ancestor_sv="Count_Person", + source_svs=["Count_Person_Male", "Count_Person_Female"], + import_names=["USFed_Census"], + output_import_name=None, + skip_all_sources_present_check=True + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/pipeline/workflow/ingestion-helper/aggregation/schema.json b/pipeline/workflow/ingestion-helper/aggregation/schema.json new file mode 100644 index 000000000..b0deba8f6 --- /dev/null +++ b/pipeline/workflow/ingestion-helper/aggregation/schema.json @@ -0,0 +1,93 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "AggregationConfig", + "type": "object", + "properties": { + "aggregations": { + "type": "array", + "items": { + "type": "object", + "required": ["type", "imports"], + "properties": { + "type": { + "type": "string", + "enum": ["place", "stat_var", "entity", "linked_edges", "provenance_summary", "stat_var_groups"] + }, + "disabled": { + "type": "boolean", + "default": false + }, + "stage": { + "type": "integer", + "minimum": 1, + "default": 1 + }, + "imports": { + "type": "array", + "items": { "type": "string" }, + "minItems": 1 + } + }, + "dependencies": { + "type": { + "oneOf": [ + { + "properties": { + "type": { "const": "place" }, + "source_type": { "type": "string" }, + "destination_type": { "type": "string" }, + "allow_multiple_to_places": { "type": "boolean" } + }, + "required": ["source_type", "destination_type"] + }, + { + "properties": { + "type": { "const": "stat_var" }, + "ancestor_sv_id": { "type": "string" }, + "source_sv_ids": { + "type": "array", + "items": { "type": "string" }, + "minItems": 1 + }, + "skip_all_sources_present_check": { "type": "boolean" }, + 
"output_import_name": { "type": "string" } + }, + "required": ["ancestor_sv_id", "source_sv_ids"] + }, + { + "properties": { + "type": { "const": "entity" }, + "entity_types": { + "type": "array", + "items": { "type": "string" } + }, + "location_props": { + "type": "array", + "items": { "type": "string" } + }, + "date_prop": { "type": "string" }, + "agg_date_formats": { + "type": "array", + "items": { "type": "string" } + }, + "constraints": { + "type": "array", + "items": { "type": "string" } + } + }, + "required": ["entity_types", "location_props"] + }, + { + "properties": { + "type": { "enum": ["linked_edges", "provenance_summary", "stat_var_groups"] } + } + } + ] + } + } + } + } + }, + "required": ["aggregations"], + "additionalProperties": false +} diff --git a/pipeline/workflow/ingestion-helper/aggregation/validator.py b/pipeline/workflow/ingestion-helper/aggregation/validator.py new file mode 100644 index 000000000..e1ac6d2e1 --- /dev/null +++ b/pipeline/workflow/ingestion-helper/aggregation/validator.py @@ -0,0 +1,132 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# NOTE(review): configures the root logger at import time; kept unchanged so
# existing log output does not change.
logging.basicConfig(level=logging.INFO)

# ANSI escape codes for colored terminal output
GREEN = "\033[92m"
RED = "\033[91m"
RESET = "\033[0m"


def validate_config(config_file_path: str, schema_file_path: str) -> List[Dict[str, Any]]:
    """Loads the aggregation YAML configuration and validates it against the JSON Schema.

    Args:
        config_file_path: Path to the aggregation.yaml configuration file.
        schema_file_path: Path to the schema.json validation file.

    Returns:
        The list stored under the top-level ``aggregations`` key. An empty
        list is returned (with a warning, and without schema validation) when
        the document is empty, is not a mapping, or has no ``aggregations``
        key.

    Raises:
        FileNotFoundError: If either the config or schema file is missing.
        jsonschema.exceptions.ValidationError: If schema validation fails.
        yaml.YAMLError: If the YAML file is malformed.
    """
    if not os.path.exists(config_file_path):
        raise FileNotFoundError(f"Aggregation config file not found: {config_file_path}")
    if not os.path.exists(schema_file_path):
        raise FileNotFoundError(f"JSON Schema file not found: {schema_file_path}")

    # 1. Load and parse YAML
    try:
        with open(config_file_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)
    except yaml.YAMLError as e:
        logging.error("Failed to parse YAML file %s: %s", config_file_path, e)
        raise

    # safe_load can return any YAML value (None, a scalar, a list, ...).
    # Only a mapping can carry the 'aggregations' key; testing membership on
    # e.g. an int would raise TypeError, so check the type first.
    if not isinstance(config, dict) or "aggregations" not in config:
        logging.warning("Aggregation config is empty or missing 'aggregations' key.")
        return []

    # 2. Load JSON Schema
    try:
        with open(schema_file_path, "r", encoding="utf-8") as f:
            schema = json.load(f)
    except Exception as e:
        logging.error("Failed to load JSON Schema file %s: %s", schema_file_path, e)
        raise

    # 3. Validate against Schema
    try:
        jsonschema.validate(instance=config, schema=schema)
    except jsonschema.exceptions.ValidationError as e:
        logging.error("Schema validation failed for config %s: %s", config_file_path, e.message)
        raise

    return config["aggregations"]


def main(argv=None):
    """CLI entry point for standalone configuration validation.

    Args:
        argv: Optional list of command-line arguments. When None (the
            default) argparse reads ``sys.argv[1:]``.

    Always terminates through ``sys.exit``: status 0 when the configuration
    is valid, status 1 on any failure (the reason is printed to stderr).
    """
    parser = argparse.ArgumentParser(description="Validate Data Commons aggregation configuration files against the JSON Schema.")

    # Resolve default paths relative to this script's directory (aggregation/)
    curr_dir = os.path.dirname(os.path.abspath(__file__))
    default_config = os.path.join(curr_dir, "..", "aggregation.yaml")
    default_schema = os.path.join(curr_dir, "schema.json")

    parser.add_argument(
        "--config",
        type=str,
        default=default_config,
        help=f"Path to the aggregation YAML config file (default: {default_config})"
    )
    parser.add_argument(
        "--schema",
        type=str,
        default=default_schema,
        help=f"Path to the JSON Schema validation file (default: {default_schema})"
    )

    args = parser.parse_args(argv)

    print(f"Validating '{os.path.basename(args.config)}' against '{os.path.basename(args.schema)}'...")

    try:
        aggregations = validate_config(args.config, args.schema)
        print(f"{GREEN}[SUCCESS] Configuration is valid!{RESET}")
        print(f"Parsed {len(aggregations)} aggregation steps successfully.")
        # SystemExit is not an Exception subclass, so the handlers below do
        # not intercept this successful exit.
        sys.exit(0)
    except FileNotFoundError as e:
        print(f"{RED}[ERROR] File not found: {e}{RESET}", file=sys.stderr)
        sys.exit(1)
    except jsonschema.exceptions.ValidationError as e:
        print(f"{RED}[ERROR] Schema Validation Failed:{RESET}", file=sys.stderr)
        print(f"{RED} - Path: {'.'.join(str(p) for p in e.path)}{RESET}", file=sys.stderr)
        print(f"{RED} - Message: {e.message}{RESET}", file=sys.stderr)
        sys.exit(1)
    except yaml.YAMLError as e:
        print(f"{RED}[ERROR] YAML Syntax Error: {e}{RESET}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"{RED}[ERROR] Unexpected validation failure: {e}{RESET}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
A perfectly valid config containing all possible types and fields +VALID_ALL_TYPES_YAML = """ +aggregations: + - type: linked_edges + imports: ["*"] + stage: 1 + disabled: false + + - type: place + source_type: County + destination_type: State + allow_multiple_to_places: true + imports: ["ImportA", "ImportB"] + stage: 2 + + - type: stat_var + ancestor_sv_id: Count_Person + source_sv_ids: ["Count_Person_Male", "Count_Person_Female"] + skip_all_sources_present_check: true + output_import_name: "Aggregated_Pop" + imports: ["ImportC"] + stage: 3 + + - type: entity + entity_types: ["MortalityEvent"] + location_props: ["location"] + date_prop: "date" + agg_date_formats: ["%Y"] + imports: ["ImportD"] + + - type: provenance_summary + imports: ["*"] + + - type: stat_var_groups + imports: ["*"] +""" + +# 2. Invalid: Missing required type field +INVALID_MISSING_TYPE_YAML = """ +aggregations: + - imports: ["*"] +""" + +# 3. Invalid: Missing required imports field +INVALID_MISSING_IMPORTS_YAML = """ +aggregations: + - type: linked_edges +""" + +# 4. Invalid: imports is a string instead of an array +INVALID_IMPORTS_TYPE_YAML = """ +aggregations: + - type: linked_edges + imports: "*" +""" + +# 5. Invalid: stage is a string instead of an integer +INVALID_STAGE_TYPE_YAML = """ +aggregations: + - type: linked_edges + imports: ["*"] + stage: "first" +""" + +# 6. Invalid: stage is 0 (minimum is 1) +INVALID_STAGE_VALUE_YAML = """ +aggregations: + - type: linked_edges + imports: ["*"] + stage: 0 +""" + +# 7. Invalid: empty imports list (minItems: 1) +INVALID_EMPTY_IMPORTS_YAML = """ +aggregations: + - type: linked_edges + imports: [] +""" + +# 8. Invalid place rollup: missing required source_type +INVALID_PLACE_MISSING_FIELD_YAML = """ +aggregations: + - type: place + destination_type: State + imports: ["*"] +""" + +# 9. 
Invalid stat var: missing required source_sv_ids +INVALID_STAT_VAR_MISSING_FIELD_YAML = """ +aggregations: + - type: stat_var + ancestor_sv_id: Count_Person + imports: ["*"] +""" + +# 10. Invalid stat var: empty source_sv_ids list +INVALID_STAT_VAR_EMPTY_SVS_YAML = """ +aggregations: + - type: stat_var + ancestor_sv_id: Count_Person + source_sv_ids: [] + imports: ["*"] +""" + +# 11. Invalid entity: missing required location_props +INVALID_ENTITY_MISSING_FIELD_YAML = """ +aggregations: + - type: entity + entity_types: ["Event"] + imports: ["*"] +""" + +# 12. Malformed YAML (Indentation error) +MALFORMED_YAML = """ +aggregations: + - type: linked_edges + imports: + - "*" +""" + + +class TestConfigValidator(unittest.TestCase): + + def setUp(self): + # Load the actual schema from the workspace to ensure tests remain realistic + self.schema_path = os.path.join(os.path.dirname(__file__), "schema.json") + with open(self.schema_path, "r") as f: + self.schema_json = json.load(f) + + def _get_mock_open(self, yaml_content): + """Helper to mock open() for both the schema JSON and the target YAML.""" + def side_effect(path, *args, **kwargs): + if "schema.json" in path: + return mock_open(read_data=json.dumps(self.schema_json))().__enter__() + else: + return mock_open(read_data=yaml_content)().__enter__() + return side_effect + + # ============================================================================= + # Success Test Cases + # ============================================================================= + + @patch('builtins.open') + def test_validate_config_success_all_types(self, mock_file_open): + """Verifies that a comprehensive, valid config with all types passes validation.""" + mock_file_open.side_effect = self._get_mock_open(VALID_ALL_TYPES_YAML) + + aggregations = validate_config("aggregation.yaml", self.schema_path) + + self.assertEqual(len(aggregations), 6) + self.assertEqual(aggregations[0]["type"], "linked_edges") + 
self.assertEqual(aggregations[1]["source_type"], "County") + self.assertEqual(aggregations[2]["ancestor_sv_id"], "Count_Person") + self.assertEqual(aggregations[3]["entity_types"], ["MortalityEvent"]) + + # ============================================================================= + # Schema Constraint Test Cases + # ============================================================================= + + @patch('builtins.open') + def test_validate_config_missing_type(self, mock_file_open): + mock_file_open.side_effect = self._get_mock_open(INVALID_MISSING_TYPE_YAML) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config("aggregation.yaml", self.schema_path) + self.assertIn("'type' is a required property", ctx.exception.message) + + @patch('builtins.open') + def test_validate_config_missing_imports(self, mock_file_open): + mock_file_open.side_effect = self._get_mock_open(INVALID_MISSING_IMPORTS_YAML) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config("aggregation.yaml", self.schema_path) + self.assertIn("'imports' is a required property", ctx.exception.message) + + @patch('builtins.open') + def test_validate_config_invalid_imports_type(self, mock_file_open): + mock_file_open.side_effect = self._get_mock_open(INVALID_IMPORTS_TYPE_YAML) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config("aggregation.yaml", self.schema_path) + self.assertIn("is not of type 'array'", ctx.exception.message) + + @patch('builtins.open') + def test_validate_config_invalid_stage_type(self, mock_file_open): + mock_file_open.side_effect = self._get_mock_open(INVALID_STAGE_TYPE_YAML) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config("aggregation.yaml", self.schema_path) + self.assertIn("is not of type 'integer'", ctx.exception.message) + + @patch('builtins.open') + def test_validate_config_invalid_stage_value(self, mock_file_open): + 
mock_file_open.side_effect = self._get_mock_open(INVALID_STAGE_VALUE_YAML) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config("aggregation.yaml", self.schema_path) + self.assertIn("is less than the minimum of 1", ctx.exception.message) + + @patch('builtins.open') + def test_validate_config_empty_imports_list(self, mock_file_open): + mock_file_open.side_effect = self._get_mock_open(INVALID_EMPTY_IMPORTS_YAML) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config("aggregation.yaml", self.schema_path) + self.assertIn("should be non-empty", ctx.exception.message) + + # ============================================================================= + # Conditional Dependency Test Cases (OneOf/Dependencies) + # ============================================================================= + + @patch('builtins.open') + def test_validate_config_place_missing_field(self, mock_file_open): + mock_file_open.side_effect = self._get_mock_open(INVALID_PLACE_MISSING_FIELD_YAML) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config("aggregation.yaml", self.schema_path) + # Validation fails because place rollup requires source_type + self.assertIn("is not valid under any of the given schemas", ctx.exception.message) + + @patch('builtins.open') + def test_validate_config_stat_var_missing_field(self, mock_file_open): + mock_file_open.side_effect = self._get_mock_open(INVALID_STAT_VAR_MISSING_FIELD_YAML) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config("aggregation.yaml", self.schema_path) + self.assertIn("is not valid under any of the given schemas", ctx.exception.message) + + @patch('builtins.open') + def test_validate_config_stat_var_empty_source_svs(self, mock_file_open): + mock_file_open.side_effect = self._get_mock_open(INVALID_STAT_VAR_EMPTY_SVS_YAML) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + 
validate_config("aggregation.yaml", self.schema_path) + self.assertIn("should be non-empty", ctx.exception.message) + + @patch('builtins.open') + def test_validate_config_entity_missing_field(self, mock_file_open): + mock_file_open.side_effect = self._get_mock_open(INVALID_ENTITY_MISSING_FIELD_YAML) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config("aggregation.yaml", self.schema_path) + self.assertIn("is not valid under any of the given schemas", ctx.exception.message) + + # ============================================================================= + # File System & Syntax Error Test Cases + # ============================================================================= + + @patch('builtins.open') + def test_validate_config_yaml_syntax_error(self, mock_file_open): + mock_file_open.side_effect = self._get_mock_open(MALFORMED_YAML) + with self.assertRaises(yaml.YAMLError): + validate_config("aggregation.yaml", self.schema_path) + + def test_validate_config_missing_config_file(self): + with self.assertRaises(FileNotFoundError): + validate_config("non_existent_config.yaml", "schema.json") + + def test_validate_config_missing_schema_file(self): + # We patch os.path.exists to simulate config existing but schema missing + with patch('os.path.exists', side_effect=lambda path: "aggregation.yaml" in path): + with self.assertRaises(FileNotFoundError): + validate_config("aggregation.yaml", "non_existent_schema.json") + + +if __name__ == '__main__': + unittest.main() diff --git a/pipeline/workflow/ingestion-helper/app_test.py b/pipeline/workflow/ingestion-helper/app_test.py index 8150e5a6b..d19fdfb7e 100644 --- a/pipeline/workflow/ingestion-helper/app_test.py +++ b/pipeline/workflow/ingestion-helper/app_test.py @@ -263,6 +263,78 @@ def test_update_import_version_override_success(self, mock_get_caller_identity): # Verify get_caller_identity was called exactly once outside of the loop mock_get_caller_identity.assert_called_once() + 
@patch('routes.aggregation._get_orchestrator')
+    def test_aggregation_initiate_success(self, mock_get_orchestrator):
+        # Stub orchestrator: only Stage 1 exists and it submits two jobs
+        orchestrator = MagicMock()
+        mock_get_orchestrator.return_value = orchestrator
+        orchestrator.has_stage.side_effect = lambda stage, imports: stage == 1
+        orchestrator.execute_stage.return_value = ["job-1", "job-2"]
+
+        # Call endpoint
+        payload = {
+            "importList": [{"importName": "USFed_Census"}]
+        }
+        response = client.post("/aggregation/initiate", json=payload)
+
+        # Assertions
+        self.assertEqual(response.status_code, 200)
+        state = response.json()
+        self.assertEqual(state["status"], "RUNNING")
+        self.assertEqual(state["current_stage"], 1)
+        self.assertEqual(state["active_job_ids"], ["job-1", "job-2"])
+        self.assertEqual(state["import_list"], [{"importName": "USFed_Census"}])
+
+    @patch('routes.aggregation._get_orchestrator')
+    def test_aggregation_poll_transition(self, mock_get_orchestrator):
+        # Stub orchestrator: Stage 1 jobs have finished and Stage 2 is next
+        orchestrator = MagicMock()
+        mock_get_orchestrator.return_value = orchestrator
+
+        # The job status check reports the Stage 1 jobs as DONE
+        orchestrator.check_jobs_status.return_value = {"status": "DONE"}
+        # Only Stage 2 exists; executing it submits a single job
+        orchestrator.has_stage.side_effect = lambda stage, imports: stage == 2
+        orchestrator.execute_stage.return_value = ["job-stage2-1"]
+
+        # Incoming state: Stage 1 is the stage currently marked RUNNING
+        payload = {
+            "status": "RUNNING",
+            "current_stage": 1,
+            "active_job_ids": ["job-1", "job-2"],
+            "import_list": [{"importName": "USFed_Census"}]
+        }
+
+        # Call endpoint
+        response = client.post("/aggregation/poll", json=payload)
+
+        # Assertions
+        self.assertEqual(response.status_code, 200)
+        state = response.json()
+        self.assertEqual(state["status"], "RUNNING")
+        self.assertEqual(state["current_stage"], 2)  # Transitioned to 2!
+        self.assertEqual(state["active_job_ids"], ["job-stage2-1"])
+
+    @patch('routes.aggregation._get_orchestrator')
+    def test_aggregation_legacy_run(self, mock_get_orchestrator):
+        # Stub orchestrator: Stages 1 and 2 exist and each submits one job
+        orchestrator = MagicMock()
+        mock_get_orchestrator.return_value = orchestrator
+        orchestrator.has_stage.side_effect = lambda stage, imports: stage in [1, 2]
+        orchestrator.execute_stage.side_effect = lambda stage, imports: [f"job-stage{stage}-1"]
+
+        # Call legacy endpoint
+        payload = {
+            "importList": [{"importName": "USFed_Census"}]
+        }
+        response = client.post("/aggregation/run", json=payload)
+
+        # Assertions (should return all jobs from all stages in parallel)
+        self.assertEqual(response.status_code, 200)
+        data = response.json()
+        self.assertEqual(data["status"], "SUBMITTED")
+        self.assertEqual(data["jobIds"], ["job-stage1-1", "job-stage2-1"])
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/pipeline/workflow/ingestion-helper/pyproject.toml b/pipeline/workflow/ingestion-helper/pyproject.toml
index 6301d7b2e..4ce4a251a 100644
--- a/pipeline/workflow/ingestion-helper/pyproject.toml
+++ b/pipeline/workflow/ingestion-helper/pyproject.toml
@@ -33,6 +33,8 @@ dependencies = [
     "google-cloud-bigquery",
     "redis",
     "jinja2",
+    "pyyaml>=6.0.3",
+    "jsonschema>=4.26.0",
 ]
 
 [tool.hatch.version]
diff --git a/pipeline/workflow/ingestion-helper/routes/aggregation.py b/pipeline/workflow/ingestion-helper/routes/aggregation.py
index 7433f906d..ce7434763 100644
--- a/pipeline/workflow/ingestion-helper/routes/aggregation.py
+++ b/pipeline/workflow/ingestion-helper/routes/aggregation.py
@@ -13,44 +13,61 @@
 # limitations under the License.
 
import logging
-from fastapi import APIRouter, HTTPException
-from utils.aggregation import AggregationUtils
-import config
 from typing import Any, Dict, List, Optional
+from fastapi import APIRouter, HTTPException
 from pydantic import BaseModel, Field
+
+import config
 from routes.models import BaseResponse, ResponseStatus
+from aggregation import AggregationOrchestrator
 from utils.logging import log_start
 
-class AggregationRequest(BaseModel):
+# =============================================================================
+# Pydantic Models for the New Stateless API
+# =============================================================================
+
+class StateObject(BaseModel):
+    status: str = Field(..., description="Overall status of the run: RUNNING, SUCCEEDED, FAILED")
+    current_stage: int = Field(..., description="The stage currently executing")
+    active_job_ids: List[str] = Field(default_factory=list, description="BQ job IDs running in the current stage")
+    import_list: List[Dict[str, Any]] = Field(default_factory=list, description="Original list of imports")
+    error: Optional[str] = Field(default=None, description="Detailed error message if failed")
+
+class InitiateRequest(BaseModel):
     importList: List[Dict[str, Any]] = Field(default_factory=list)
 
-class AggregationStatusRequest(BaseModel):
+# =============================================================================
+# Pydantic Models for the Legacy API (Backward Compatibility)
+# =============================================================================
+
+class LegacyAggregationRequest(BaseModel):
+    importList: List[Dict[str, Any]] = Field(default_factory=list)
+
+class LegacyAggregationStatusRequest(BaseModel):
     jobIds: List[str] = Field(default_factory=list)
 
-class AggregationResponse(BaseResponse):
+class LegacyAggregationResponse(BaseResponse):
     jobIds: List[str] = Field(default_factory=list, description="BigQuery job IDs submitted for async aggregation")
 
-class AggregationStatusResponse(BaseResponse):
+class LegacyAggregationStatusResponse(BaseResponse):
     error: Optional[str] = Field(default=None, description="Detailed error message if failed")
     failedJobs: Optional[List[str]] = Field(default_factory=list, description="List of failed BigQuery job IDs")
 
+# =============================================================================
+# Router Definition
+# =============================================================================
+
 router = APIRouter(prefix="/aggregation", tags=["aggregation"])
 
-@router.post("/run", response_model=AggregationResponse)
-@log_start
-def run_aggregation(req: AggregationRequest):
-    """Runs aggregation logic asynchronously for the specified imports, returning BigQuery job IDs."""
-    if not req.importList:
-        logging.info("Empty import list. Skipping aggregation.")
-        return AggregationResponse(status=ResponseStatus.SUBMITTED, jobIds=[])
-
+
+def _get_orchestrator() -> AggregationOrchestrator:
+    """Helper to initialize the orchestrator using global config."""
     if not all([config.SPANNER_CONNECTION_ID, config.SPANNER_PROJECT_ID, config.SPANNER_INSTANCE_ID, config.SPANNER_GRAPH_DATABASE_ID]):
         raise HTTPException(
             status_code=400,
             detail="Missing required configuration environment variables: SPANNER_CONNECTION_ID, SPANNER_PROJECT_ID, SPANNER_INSTANCE_ID, or SPANNER_GRAPH_DATABASE_ID"
         )
-
-    aggregation = AggregationUtils(
+    return AggregationOrchestrator(
         connection_id=config.SPANNER_CONNECTION_ID,
         project_id=config.SPANNER_PROJECT_ID,
         instance_id=config.SPANNER_INSTANCE_ID,
@@ -58,39 +75,194 @@ def run_aggregation(req: AggregationRequest):
         location=config.LOCATION,
         is_base_dc=config.IS_BASE_DC,
     )
+
+# -----------------------------------------------------------------------------
+# New Stateless API Endpoints (Stage-based)
+# -----------------------------------------------------------------------------
+
+# Highest stage number probed for active aggregations. Shared by the stateless
+# and legacy endpoints so that they all cover the same range of stages.
+_MAX_STAGE = 10
+
+
+@router.post("/initiate", response_model=StateObject)
+@log_start
+def initiate_aggregation(req: InitiateRequest):
+    """Initiates the aggregation run by executing the first active stage.
+
+    Returns a SUCCEEDED state right away when the import list is empty or no
+    stage up to _MAX_STAGE has aggregations for the imports; otherwise returns
+    a RUNNING state holding the job IDs of the stage that was started.
+
+    Raises:
+        HTTPException: 400 if required configuration is missing, 500 if the
+            orchestrator fails.
+    """
+    if not req.importList:
+        logging.info("Empty import list. Skipping aggregation.")
+        return StateObject(status="SUCCEEDED", current_stage=0, active_job_ids=[], import_list=[])
+
     try:
-        job_ids = aggregation.run_aggregation(req.importList)
-        return AggregationResponse(status=ResponseStatus.SUBMITTED, jobIds=job_ids)
+        orchestrator = _get_orchestrator()
+        import_names = [item.get('importName') for item in req.importList if item.get('importName')]
+
+        # Find the first stage that has active aggregations (usually Stage 1)
+        first_stage = 1
+        while first_stage <= _MAX_STAGE:
+            if orchestrator.has_stage(first_stage, import_names):
+                break
+            first_stage += 1
+        else:
+            logging.info("No stages have active aggregations for the current imports. Completing immediately.")
+            return StateObject(status="SUCCEEDED", current_stage=0, active_job_ids=[], import_list=req.importList)
+
+        logging.info(f"Initiating aggregation at Stage {first_stage}")
+        job_ids = orchestrator.execute_stage(first_stage, import_names)
+
+        return StateObject(
+            status="RUNNING",
+            current_stage=first_stage,
+            active_job_ids=job_ids,
+            import_list=req.importList
+        )
+    except HTTPException:
+        # Keep the 400 raised by _get_orchestrator() instead of masking it as a 500.
+        raise
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Aggregation failed: {str(e)}")
+        logging.error(f"Failed to initiate aggregation: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to initiate aggregation: {str(e)}")
 
-@router.post("/status", response_model=AggregationStatusResponse)
-def check_aggregation_status(req: AggregationStatusRequest):
-    """Checks the status of the submitted aggregation BigQuery jobs."""
-    if not req.jobIds:
-        logging.info("Empty jobIds. Returning status DONE.")
-        return AggregationStatusResponse(status=ResponseStatus.DONE)
+
+@router.post("/poll", response_model=StateObject)
+@log_start
+def poll_aggregation(state: StateObject):
+    """Checks progress of active jobs and transitions to the next stage if complete.
+
+    A state that is not RUNNING is returned unchanged. Any exception raised
+    while polling, including the HTTPException from _get_orchestrator(), is
+    reported as a FAILED state instead of an HTTP error response.
+    """
+    if state.status != "RUNNING":
+        return state  # Already in a terminal state
+
+    try:
+        orchestrator = _get_orchestrator()
+        import_names = [item.get('importName') for item in state.import_list if item.get('importName')]
+
+        # 1. Check status of active jobs in BigQuery
+        logging.info(f"Polling status for jobs in Stage {state.current_stage}: {state.active_job_ids}")
+        bq_status = orchestrator.check_jobs_status(state.active_job_ids)
 
-    if not all([config.SPANNER_CONNECTION_ID, config.SPANNER_PROJECT_ID, config.SPANNER_INSTANCE_ID, config.SPANNER_GRAPH_DATABASE_ID]):
-        raise HTTPException(
-            status_code=400,
-            detail="Missing required configuration environment variables."
+        # Case A: Any job failed
+        if bq_status["status"] == "FAILED":
+            logging.error(f"Stage {state.current_stage} failed with error: {bq_status.get('error')}")
+            return StateObject(
+                status="FAILED",
+                current_stage=state.current_stage,
+                active_job_ids=[],
+                import_list=state.import_list,
+                error=bq_status.get("error")
+            )
+
+        # Case B: Jobs are still running
+        if bq_status["status"] == "RUNNING":
+            logging.info(f"Stage {state.current_stage} is still executing.")
+            return state  # Return unchanged
+
+        # Case C: All jobs succeeded -> Find and execute the next active stage
+        next_stage = state.current_stage + 1
+        while next_stage <= _MAX_STAGE:
+            if orchestrator.has_stage(next_stage, import_names):
+                logging.info(f"Stage {state.current_stage} completed. Transitioning to Stage {next_stage}...")
+                new_job_ids = orchestrator.execute_stage(next_stage, import_names)
+                return StateObject(
+                    status="RUNNING",
+                    current_stage=next_stage,
+                    active_job_ids=new_job_ids,
+                    import_list=state.import_list
+                )
+            next_stage += 1
+
+        # If we exit the loop, there are no more active stages left
+        logging.info("All aggregation stages completed successfully!")
+        return StateObject(
+            status="SUCCEEDED",
+            current_stage=state.current_stage,
+            active_job_ids=[],
+            import_list=state.import_list
         )
+
+    except Exception as e:
+        logging.error(f"Error during polling: {e}")
+        return StateObject(
+            status="FAILED",
+            current_stage=state.current_stage,
+            active_job_ids=[],
+            import_list=state.import_list,
+            error=f"Orchestrator error: {str(e)}"
+        )
+
+# -----------------------------------------------------------------------------
+# Legacy API Endpoints (Backward Compatibility Mode)
+# -----------------------------------------------------------------------------
+
+@router.post("/run", response_model=LegacyAggregationResponse)
+@log_start
+def run_aggregation_legacy(req: LegacyAggregationRequest):
+    """Legacy endpoint. Runs ALL enabled aggregations in parallel (ignores stages).
+
+    Raises:
+        HTTPException: 400 if required configuration is missing, 500 if the
+            orchestrator fails.
+    """
+    if not req.importList:
+        logging.info("Empty import list. Skipping legacy aggregation.")
+        return LegacyAggregationResponse(status=ResponseStatus.SUBMITTED, jobIds=[])
+
+    try:
+        orchestrator = _get_orchestrator()
+        import_names = [item.get('importName') for item in req.importList if item.get('importName')]
+
+        # Compatibility Mode: Submit ALL enabled stages in parallel
+        # range() excludes its stop value, so add 1 to include stage _MAX_STAGE.
+        job_ids = []
+        for stage_num in range(1, _MAX_STAGE + 1):
+            if orchestrator.has_stage(stage_num, import_names):
+                job_ids.extend(orchestrator.execute_stage(stage_num, import_names))
+
+        return LegacyAggregationResponse(status=ResponseStatus.SUBMITTED, jobIds=job_ids)
+    except HTTPException:
+        # Keep the 400 raised by _get_orchestrator(), as the pre-refactor endpoint did.
+        raise
+    except Exception as e:
+        logging.error(f"Legacy aggregation failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Legacy aggregation failed: {str(e)}")
+
+
+@router.post("/status", response_model=LegacyAggregationStatusResponse)
+@log_start
+def check_aggregation_status_legacy(req: LegacyAggregationStatusRequest):
+    """Legacy endpoint. Checks the status of the submitted BigQuery jobs.
+
+    Raises:
+        HTTPException: 400 if required configuration is missing, 500 if the
+            status check fails.
+    """
+    if not req.jobIds:
+        logging.info("Empty jobIds. Returning status DONE.")
+        return LegacyAggregationStatusResponse(status=ResponseStatus.DONE)
 
-    aggregation = AggregationUtils(
-        connection_id=config.SPANNER_CONNECTION_ID,
-        project_id=config.SPANNER_PROJECT_ID,
-        instance_id=config.SPANNER_INSTANCE_ID,
-        database_id=config.SPANNER_GRAPH_DATABASE_ID,
-        location=config.LOCATION,
-        is_base_dc=config.IS_BASE_DC,
-    )
     try:
-        status_info = aggregation.check_aggregation_status(req.jobIds)
-        return AggregationStatusResponse(
+        orchestrator = _get_orchestrator()
+        status_info = orchestrator.check_jobs_status(req.jobIds)
+        return LegacyAggregationStatusResponse(
             status=ResponseStatus.from_str(status_info.get("status", "ERROR")),
             error=status_info.get("error"),
             failedJobs=status_info.get("failedJobs", [])
         )
+    except HTTPException:
+        # Keep the 400 raised by _get_orchestrator(), as the pre-refactor endpoint did.
+        raise
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Aggregation status check failed: {str(e)}")
+        logging.error(f"Legacy status check failed: {e}")
+        raise HTTPException(status_code=500, detail=f"Legacy status check failed: {str(e)}")
diff --git a/pipeline/workflow/ingestion-helper/utils/aggregation.py b/pipeline/workflow/ingestion-helper/utils/aggregation.py
deleted file mode 100644
index 432a4000f..000000000
--- a/pipeline/workflow/ingestion-helper/utils/aggregation.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright 2026 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -import logging -from typing import Any, Dict, List, Optional - -from aggregation import BigQueryExecutor -from aggregation import LinkedEdgeGenerator -from aggregation import ProvenanceSummaryGenerator -from aggregation import StatVarAggregator -from aggregation import StatVarGroupGenerator -from google.cloud import bigquery - -logging.getLogger().setLevel(logging.INFO) - - -class AggregationUtils: - """Orchestrates the overall aggregation workflow.""" - - def __init__(self, - connection_id: str, - project_id: str, - instance_id: str, - database_id: str, - location: Optional[str] = None, - is_base_dc: bool = True) -> None: - # TODO: remove sequential execution once DCP changes are made - # Use sequential execution for DCP (backward compatibility) - run_sequential = not is_base_dc - self.executor = BigQueryExecutor(connection_id=connection_id, - project_id=project_id, - instance_id=instance_id, - database_id=database_id, - location=location, - run_sequential=run_sequential) - self.linked_edge_generator = LinkedEdgeGenerator( - self.executor, is_base_dc) - self.provenance_summary_generator = ProvenanceSummaryGenerator( - self.executor, is_base_dc) - - def run_aggregation(self, import_list: List[Dict[str, Any]]) -> List[str]: - """ - Orchestrates standard per-import aggregations and global aggregations. - Returns a list of BigQuery job IDs for async polling. - """ - logging.info(f"Received request for importList: {import_list}") - - try: - import_names = [] - # 1. 
Run standard per-import aggregations - for import_item in import_list: - import_name = import_item.get('importName') - if import_name: - import_names.append(import_name) - query = "SELECT @import_name as import_name, CURRENT_TIMESTAMP() as execution_time" - job_config = bigquery.QueryJobConfig(query_parameters=[ - bigquery.ScalarQueryParameter("import_name", "STRING", - import_name), - ]) - self.executor.execute(query, job_config=job_config) - else: - logging.info( - 'Skipping aggregation logic for empty importName') - - # 2. Run global aggregations asynchronously - jobs = [] - jobs.extend(self.linked_edge_generator.run_all(import_names)) - jobs.extend(self.provenance_summary_generator.run_all(import_names)) - - job_ids = [job.job_id for job in jobs if job] - logging.info(f"Submitted async aggregation jobs: {job_ids}") - - return job_ids - except Exception as e: - logging.error(f"Aggregation failed: {e}") - raise e - - def check_aggregation_status(self, job_ids: List[str]) -> Dict[str, Any]: - """ - Checks the status of the provided BigQuery job IDs. - """ - logging.info(f"Checking status for jobs: {job_ids}") - try: - return self.executor.get_jobs_status(job_ids) - except Exception as e: - logging.error(f"Failed to check aggregation status: {e}") - raise e diff --git a/pipeline/workflow/ingestion-helper/utils/aggregation_test.py b/pipeline/workflow/ingestion-helper/utils/aggregation_test.py deleted file mode 100644 index f4afb6e3f..000000000 --- a/pipeline/workflow/ingestion-helper/utils/aggregation_test.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import unittest -from unittest.mock import MagicMock -from unittest.mock import patch - -sys.path.append(os.path.dirname(os.path.dirname(__file__))) - -from utils.aggregation import AggregationUtils - - -@patch('utils.aggregation.BigQueryExecutor') -@patch('utils.aggregation.LinkedEdgeGenerator') -@patch('utils.aggregation.ProvenanceSummaryGenerator') -@patch('utils.aggregation.StatVarGroupGenerator') -class TestAggregationUtils(unittest.TestCase): - - def test_run_aggregation(self, mock_prov_gen, mock_edge_gen, mock_executor): - # Setup mocks - mock_executor_instance = MagicMock() - mock_executor.return_value = mock_executor_instance - - mock_edge_gen_instance = MagicMock() - mock_edge_gen.return_value = mock_edge_gen_instance - mock_job1 = MagicMock() - mock_job1.job_id = "job1" - mock_edge_gen_instance.run_all.return_value = [mock_job1] - - mock_prov_gen_instance = MagicMock() - mock_prov_gen.return_value = mock_prov_gen_instance - mock_job2 = MagicMock() - mock_job2.job_id = "job2" - mock_prov_gen_instance.run_all.return_value = [mock_job2] - - utils = AggregationUtils(connection_id="conn", - project_id="proj", - instance_id="inst", - database_id="db", - is_base_dc=True) - - import_list = [{'importName': 'import1'}, {'importName': 'import2'}] - job_ids = utils.run_aggregation(import_list) - - # Verify standard import queries were executed - self.assertEqual(mock_executor_instance.execute.call_count, 2) - - # Verify generators were called - mock_edge_gen_instance.run_all.assert_called_once_with( - ["import1", "import2"]) - 
mock_prov_gen_instance.run_all.assert_called_once_with( - ["import1", "import2"]) - - self.assertEqual(job_ids, ["job1", "job2"]) - - def test_check_aggregation_status(self, mock_prov_gen, mock_edge_gen, - mock_executor): - mock_executor_instance = MagicMock() - mock_executor.return_value = mock_executor_instance - mock_executor_instance.get_jobs_status.return_value = {"status": "DONE"} - - utils = AggregationUtils(connection_id="conn", - project_id="proj", - instance_id="inst", - database_id="db") - - status = utils.check_aggregation_status(["job1", "job2"]) - mock_executor_instance.get_jobs_status.assert_called_once_with( - ["job1", "job2"]) - self.assertEqual(status, {"status": "DONE"}) - - -if __name__ == '__main__': - unittest.main() diff --git a/pipeline/workflow/ingestion-helper/uv.lock b/pipeline/workflow/ingestion-helper/uv.lock index 6fa14bcb8..79e17f455 100644 --- a/pipeline/workflow/ingestion-helper/uv.lock +++ b/pipeline/workflow/ingestion-helper/uv.lock @@ -47,6 +47,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl", hash = "sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353, upload-time = "2026-03-24T12:59:08.246Z" }, ] +[[package]] +name = "attrs" +version = "26.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9a/8e/82a0fe20a541c03148528be8cac2408564a6c9a0cc7e9171802bc1d26985/attrs-26.1.0.tar.gz", hash = "sha256:d03ceb89cb322a8fd706d4fb91940737b6642aa36998fe130a9bc96c985eff32", size = 952055, upload-time = "2026-03-19T14:22:25.026Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/64/b4/17d4b0b2a2dc85a6df63d1157e028ed19f90d4cd97c36717afef2bc2f395/attrs-26.1.0-py3-none-any.whl", hash = "sha256:c647aa4a12dfbad9333ca4e71fe62ddc36f4e63b2d260a37a8b83d2f043ac309", size = 67548, upload-time = "2026-03-19T14:22:23.645Z" }, +] + 
[[package]] name = "certifi" version = "2026.5.20" @@ -272,7 +281,9 @@ dependencies = [ { name = "google-cloud-spanner" }, { name = "google-cloud-storage" }, { name = "jinja2" }, + { name = "jsonschema" }, { name = "pydantic" }, + { name = "pyyaml" }, { name = "redis" }, { name = "uvicorn", extra = ["standard"] }, ] @@ -293,7 +304,9 @@ requires-dist = [ { name = "google-cloud-spanner" }, { name = "google-cloud-storage" }, { name = "jinja2" }, + { name = "jsonschema", specifier = ">=4.26.0" }, { name = "pydantic", specifier = ">=2.0.0" }, + { name = "pyyaml", specifier = ">=6.0.3" }, { name = "redis" }, { name = "uvicorn", extras = ["standard"], specifier = ">=0.28.0" }, ] @@ -722,6 +735,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, ] +[[package]] +name = "jsonschema" +version = "4.26.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "jsonschema-specifications" }, + { name = "referencing" }, + { name = "rpds-py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/fc/e067678238fa451312d4c62bf6e6cf5ec56375422aee02f9cb5f909b3047/jsonschema-4.26.0.tar.gz", hash = "sha256:0c26707e2efad8aa1bfc5b7ce170f3fccc2e4918ff85989ba9ffa9facb2be326", size = 366583, upload-time = "2026-01-07T13:41:07.246Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/90/f63fb5873511e014207a475e2bb4e8b2e570d655b00ac19a9a0ca0a385ee/jsonschema-4.26.0-py3-none-any.whl", hash = "sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce", size = 90630, upload-time = "2026-01-07T13:41:05.306Z" }, +] + +[[package]] +name = "jsonschema-specifications" +version = "2025.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name 
= "referencing" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/19/74/a633ee74eb36c44aa6d1095e7cc5569bebf04342ee146178e2d36600708b/jsonschema_specifications-2025.9.1.tar.gz", hash = "sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d", size = 32855, upload-time = "2025-09-08T01:34:59.186Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, +] + [[package]] name = "markupsafe" version = "3.0.3" @@ -1196,6 +1236,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/27/e3/b519734372d305bd547534a9f32e4ce9f98552af753dce72cf3483a0ff0b/redis-8.0.0-py3-none-any.whl", hash = "sha256:c938c18338585009f0bc310f4c7e4e4b4d37639356c4ac072cedf3af570c8dc7", size = 499870, upload-time = "2026-05-28T12:45:11.697Z" }, ] +[[package]] +name = "referencing" +version = "0.37.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "rpds-py" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", size = 78036, upload-time = "2025-10-13T15:30:48.871Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/58/ca301544e1fa93ed4f80d724bf5b194f6e4b945841c5bfd555878eea9fcb/referencing-0.37.0-py3-none-any.whl", hash = "sha256:381329a9f99628c9069361716891d34ad94af76e461dcb0335825aecc7692231", size = 26766, upload-time = "2025-10-13T15:30:47.625Z" }, +] + [[package]] name = "requests" version = "2.34.2" @@ -1211,6 +1265,116 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/a0/f4/c67b0b3f1b9245e8d266f0f112c500d50e5b4e83cb6f3b71b6528104182a/requests-2.34.2-py3-none-any.whl", hash = "sha256:2a0d60c172f83ac6ab31e4554906c0f3b3588d37b5cb939b1c061f4907e278e0", size = 73075, upload-time = "2026-05-14T19:25:26.443Z" }, ] +[[package]] +name = "rpds-py" +version = "2026.5.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2e/43/25a8dcd3feedd735039a8f0b5b7e3b118232b5eae288c4fd9ab200d41094/rpds_py-2026.5.1.tar.gz", hash = "sha256:07b24fea40541e28570e5b795a4a38fbdcd12550c06bd0748005ecc8116ca256", size = 64459, upload-time = "2026-05-28T12:02:13.232Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d4/e7/a78582dc57caa592dcc7d4fb69b61390561e908eb3d2f5df5928a8e354c0/rpds_py-2026.5.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:3abe24a66e57adcfa645d718063a5fa5103ecc71ddbf26d78af8f9368018ff1d", size = 353040, upload-time = "2026-05-28T11:59:12.531Z" }, + { url = "https://files.pythonhosted.org/packages/a3/43/35e3f136343aef451e545ce8c38d36c2f93c0ed88703db8b64ba2b205c68/rpds_py-2026.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:58b1d94308ddf0b1982f61f2eb54bf92997c9ece8a8093ef014250f4a517906c", size = 345775, upload-time = "2026-05-28T11:59:13.827Z" }, + { url = "https://files.pythonhosted.org/packages/20/e1/0f2160c5982d3157734d5cb3ed63d8b2d583a73c9864f77b666449f32cf8/rpds_py-2026.5.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fa92420128dadce7f54bd73ba1825a273e9268fe9e35dbf7e6362890efa4e08", size = 376329, upload-time = "2026-05-28T11:59:15.271Z" }, + { url = "https://files.pythonhosted.org/packages/d0/11/ee0ba42aff83bf4effdbc576673c6be64c5e173978c3f6d537e94482f77d/rpds_py-2026.5.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ca653c6546386227cd9800d1bef6a348099acf8db4250341da6d90f663d6dfcb", size = 383539, upload-time = "2026-05-28T11:59:16.665Z" }, + { 
url = "https://files.pythonhosted.org/packages/11/df/d94aa6a499d4ac40afe2d7620f2c597fd3c0f182e854ad7cf3f596a81cb6/rpds_py-2026.5.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:66c93681c4729e4e3ecba31b8179fae083ff3118841672835140338b4b9867c1", size = 494674, upload-time = "2026-05-28T11:59:17.991Z" }, + { url = "https://files.pythonhosted.org/packages/1f/75/33d30f43bb2f458de11979486a591b1bf6e5651765ed1704c6197c2dc773/rpds_py-2026.5.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:40ff257542e04796880e011e15cd4dc21c2599975df2aaa8f2c8495ca574e1a5", size = 389268, upload-time = "2026-05-28T11:59:19.434Z" }, + { url = "https://files.pythonhosted.org/packages/f4/1e/2c9096fc19d5fd084b0184ca2b651e659aa0a37e6fdbecf6ece47f147fe1/rpds_py-2026.5.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6825cc329b290e93c5f6a9be2393118a763f6ccf6abd83704e0c102ca583644", size = 376280, upload-time = "2026-05-28T11:59:21Z" }, + { url = "https://files.pythonhosted.org/packages/b9/e5/61ec9f8be8211ea7f48448195549e4aaf02004083475493b0e137702ecb2/rpds_py-2026.5.1-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:de42116e69cb53b911cc34aee5ab98f36c597b822545045d49e938818b99e5e4", size = 387233, upload-time = "2026-05-28T11:59:22.454Z" }, + { url = "https://files.pythonhosted.org/packages/0d/ca/bcec1005c4f4a234f92a29078631fee49206c7265ccae966f18fd332e80e/rpds_py-2026.5.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c0f920015df2a504bebaba6d4c31ccf3fcf942f92655c086da30b671aad19aa6", size = 405009, upload-time = "2026-05-28T11:59:23.845Z" }, + { url = "https://files.pythonhosted.org/packages/72/e6/4d5718c5cf26c522dc7c9999e238da1e77380b81d0c5d1df11e271ddfeb1/rpds_py-2026.5.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0408a24e44feb919423dc6d9da677cb5cddb894d2ca9e763967d156d9c60fab4", size = 553113, upload-time = "2026-05-28T11:59:25.184Z" }, + { url = 
"https://files.pythonhosted.org/packages/d4/25/2ee807bdb3e1f0b7eddf7782acd5665a8b5205a331a7d7244a52c4812fd9/rpds_py-2026.5.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:cea68bcd53467561ae2f96a6bdad1544299ba97b5b0ddcd5ac3d376e5c781c24", size = 618838, upload-time = "2026-05-28T11:59:26.749Z" }, + { url = "https://files.pythonhosted.org/packages/6a/c1/7d4c26f167f8c41501cc073d30ee22082b16ce358cf5b00ec97cbc7804ea/rpds_py-2026.5.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4be8b1d2a705cc37d08256004e1d07de143fa0075c8e85a3df020b776f62b732", size = 582436, upload-time = "2026-05-28T11:59:28.11Z" }, + { url = "https://files.pythonhosted.org/packages/04/1d/9d12b0a337bab46f4769f8857f4007e3b2d639e14f9a44a0efe157696e64/rpds_py-2026.5.1-cp312-cp312-win32.whl", hash = "sha256:6736718bd4fc49cbcb538ba30516fdbef161522acefb739657d48b97bd864fed", size = 212734, upload-time = "2026-05-28T11:59:29.689Z" }, + { url = "https://files.pythonhosted.org/packages/c5/93/e4116f2de7f56bc7406a76033dc501811ddeb22b7f056b92d632871ebb0c/rpds_py-2026.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:0a7d1eec967df0e9b22614a5e177622e0c89611d03727fa0cb48e45028907870", size = 229045, upload-time = "2026-05-28T11:59:31.033Z" }, + { url = "https://files.pythonhosted.org/packages/cb/53/6c3419d85eb2ec5938a37627c585b42d76a63bb731d6e42ed4b079ebf486/rpds_py-2026.5.1-cp312-cp312-win_arm64.whl", hash = "sha256:1841d067089e117142d79b98aa0df2f08b52f2ecc1819dd2700636c0db74a473", size = 223967, upload-time = "2026-05-28T11:59:32.318Z" }, + { url = "https://files.pythonhosted.org/packages/6c/32/14c961ad295f490eb0849ada8b79683e93a59b9de3afdd983eaf55fa6867/rpds_py-2026.5.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:efef4ac29c6ff495531eb17ee705b62841ecaa291b7c7077e848ea03e237164d", size = 352787, upload-time = "2026-05-28T11:59:33.655Z" }, + { url = 
"https://files.pythonhosted.org/packages/ca/bb/d1b85117967c11191441a7274ae616c65d93901d082c588f89a50a8da5ae/rpds_py-2026.5.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c39f5b67a8a2e67179ada2a954227d670fe65fa9098457f698f56ddf248709b3", size = 345179, upload-time = "2026-05-28T11:59:35Z" }, + { url = "https://files.pythonhosted.org/packages/7c/46/d84105f062e626a1b233f863907288a4708c2d833b8b4c6fb2764bc080c0/rpds_py-2026.5.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5c30f3f04eef4fbd362226a6f31d7c8895ca4fbb6e0b790f6890a98d8da8559", size = 376173, upload-time = "2026-05-28T11:59:36.43Z" }, + { url = "https://files.pythonhosted.org/packages/e2/ae/469d7959ce5b1201e1de135dc735b86db3b35dd0d1734f6a44246d5f061c/rpds_py-2026.5.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:277f6c82f0580848796c7ecc8a7173aa3bfb928e4ff831261c2f60a81dc270db", size = 383162, upload-time = "2026-05-28T11:59:37.995Z" }, + { url = "https://files.pythonhosted.org/packages/dc/a2/57853d31a1116a561aa072794602ad3f6341e18d70a8523f1bd5b9fc1e5a/rpds_py-2026.5.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:63c2c4c213f1a4e3f3de28ecab029dbdee976324e729c0d7a55211be72576b02", size = 495093, upload-time = "2026-05-28T11:59:39.453Z" }, + { url = "https://files.pythonhosted.org/packages/99/63/3a8eabcad9314b7daf5c65f451d2c33d989235cd8a5762186cf2c3f5a4f8/rpds_py-2026.5.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3350ec808fb538fe71a1f94dfaa0e29c598dfad805ce49f0caec5ae3183c652b", size = 389829, upload-time = "2026-05-28T11:59:40.896Z" }, + { url = "https://files.pythonhosted.org/packages/4b/25/05678d97fc25e2622df14dc530fb82023174ecfff6733991ed0d78f167bd/rpds_py-2026.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1b964e3ab599e718dc46c018d104b1ebc007cbc6567d827c94a687fca56d77e", size = 374786, upload-time = "2026-05-28T11:59:42.626Z" }, + { url = 
"https://files.pythonhosted.org/packages/88/d1/8c90b6431e80a3b91b284a5c7c8c0c4f9c006444d90477a740d6e0f9c694/rpds_py-2026.5.1-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:19cb09fab7b7fc96b2a6e28f2e34b72a3705ff27b37edb77455316e5d3f3dc9b", size = 386920, upload-time = "2026-05-28T11:59:44.124Z" }, + { url = "https://files.pythonhosted.org/packages/ff/99/4638f672ab356682d633ee0da9255f5b67ce6efd0b85eb94ad3e255e65a5/rpds_py-2026.5.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:abe76bcdba31e576cb83eeb8797aa0d882b738fef6dc65d0601fc753806a5b46", size = 405059, upload-time = "2026-05-28T11:59:47.177Z" }, + { url = "https://files.pythonhosted.org/packages/66/3f/3546524b6eb4cc2e1f363a3d638fa52f6c24faae3500c25fb488b02f1740/rpds_py-2026.5.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8bff7073db3899158fff55ebf57b113a67030af26f80a18978f9f0aa60250ddf", size = 553030, upload-time = "2026-05-28T11:59:48.603Z" }, + { url = "https://files.pythonhosted.org/packages/c6/c3/7b3388c796fcf471bd17194242d4dc1a7608567c0fa422bcc1c5e79f9c1e/rpds_py-2026.5.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8ba264fa49be666cd9cc56bf34ec7002fb3d27a4aee5bcb4d43d0d18feb1bb6f", size = 618975, upload-time = "2026-05-28T11:59:50.314Z" }, + { url = "https://files.pythonhosted.org/packages/61/1e/a3cb07f2795075d1d88efddae2f541359fde5f08c81ee114c29c2949c90a/rpds_py-2026.5.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4860b603ddda0475a8885499b3729e90229d480105b42651962a5397d995fa89", size = 581178, upload-time = "2026-05-28T11:59:51.673Z" }, + { url = "https://files.pythonhosted.org/packages/a1/74/e758c03a5ef46f04c37f2651a2893db846d569ba8a7bca469d4b58939bcd/rpds_py-2026.5.1-cp313-cp313-win32.whl", hash = "sha256:7944270ae71383f6e2657dd7d5ce4eeb4ac2d0059a6738f0510583d462ab4842", size = 212481, upload-time = "2026-05-28T11:59:53.148Z" }, + { url = 
"https://files.pythonhosted.org/packages/70/ec/a2aca432db9c7359b40fa393eeeaa0d166c2f70175be956e75fa24197c44/rpds_py-2026.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:88647f43a73c4e01be19b04ceef0c8d3a1958153604d13c773becd8016f2a0cf", size = 228519, upload-time = "2026-05-28T11:59:54.505Z" }, + { url = "https://files.pythonhosted.org/packages/29/60/a73bfdd45b096574556acf303bbd9fa9eed36ca8a818b514e2a5d5fe2b9d/rpds_py-2026.5.1-cp313-cp313-win_arm64.whl", hash = "sha256:453895624ecf7db7063b1004e44037522bbaef9ff6a945e59bc71662d7a03abd", size = 223446, upload-time = "2026-05-28T11:59:56.081Z" }, + { url = "https://files.pythonhosted.org/packages/18/e2/408105fd611823f00882aea810f3989a30d26b1bab8b6beb20f98c724e0e/rpds_py-2026.5.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:b4e4bc98639ec915f512fde3aa7a95e0041d95d9c3cc86eea841fa63cb1e8600", size = 355287, upload-time = "2026-05-28T11:59:57.448Z" }, + { url = "https://files.pythonhosted.org/packages/8d/58/5c4a43436843c90d0f6d19f82c200c80e3843ca9fa07b237623327f6d384/rpds_py-2026.5.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:cacedb7a6e167680acba45ad5716e89067d225dc80da0d7040cae8c81d4572fa", size = 347033, upload-time = "2026-05-28T11:59:58.881Z" }, + { url = "https://files.pythonhosted.org/packages/fb/c2/1a71acdacaf4e259b10278fb87b039ded3cf80041bcd89dd8a3ea702ded6/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68700371c5d7ae1412862ddfa719090925c93ecf351c566d66f09d04b136ea00", size = 376891, upload-time = "2026-05-28T12:00:00.516Z" }, + { url = "https://files.pythonhosted.org/packages/c2/c8/535f3d9b65addd8e28aa87b83c6e526799c3717a88273db8ea795beeef7a/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:296c799becfa849c779c8725494fe9ed94959ed886787df4364b058465bad7f0", size = 385646, upload-time = "2026-05-28T12:00:02.394Z" }, + { url = 
"https://files.pythonhosted.org/packages/1c/91/dc033f313345c354ade914dbe73cdb90b615a4409ea02430d5356794f3d8/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3858b908218ee108d0bbfb2095ccc237648053c9bf98affad7cb079acaf1d97", size = 498830, upload-time = "2026-05-28T12:00:04.189Z" }, + { url = "https://files.pythonhosted.org/packages/27/fc/90fcbea459dbb8ddc18a2e0fd1de9412b48bc84ffff2db771cf714bacfd6/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4fb8d2e7cb2f850b169806d61d1b991738acec96500a75c30f49caf064ce7cef", size = 392830, upload-time = "2026-05-28T12:00:05.797Z" }, + { url = "https://files.pythonhosted.org/packages/b2/1d/46cd11a228c9750684a798d98f878be6f614aa762438da7378f035e79e35/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27b74c10ed6a8f190f4287f53bcfea348b92a84a9c9f70d30183d1e6172d580d", size = 379613, upload-time = "2026-05-28T12:00:07.433Z" }, + { url = "https://files.pythonhosted.org/packages/24/4a/d9b0c6af3a1de03eb93741bbe8be2bdce84d8fda8224f3005451d86df389/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:b9a6528956191c48c52294a592dbd4a8386d7048bdb25c0efcb6b966466c6d83", size = 388183, upload-time = "2026-05-28T12:00:09.227Z" }, + { url = "https://files.pythonhosted.org/packages/c5/b4/db7aaabdda6d020afc87d981bcc2f57a434c7dec60ecfc2ab3dd50b20351/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:af03e34e860047bc7a352b842856fcf78798fbb81132cc98bd2f907ab4eb9cd2", size = 408578, upload-time = "2026-05-28T12:00:10.779Z" }, + { url = "https://files.pythonhosted.org/packages/08/d6/070f6a41cbb343e2ac4171859bf3f3623e0ab002f72619d6d505313ec2de/rpds_py-2026.5.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fea6e836d10abbe191d557d33bd58bd5987725fe63aa1eefe557d230209855bd", size = 553573, upload-time = "2026-05-28T12:00:12.443Z" }, + { url = 
"https://files.pythonhosted.org/packages/75/ab/1a71ea3589c4345dac0a0518f0e6a031cb42689277851b683c46d27463a5/rpds_py-2026.5.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:fc0c0f878ea770a0a8a462456c5ad36fc9fe6358e6b76fdadc7f17575e0b8bf1", size = 620861, upload-time = "2026-05-28T12:00:14.09Z" }, + { url = "https://files.pythonhosted.org/packages/8a/22/9bf80a56069c0c443fcfefac639a86a744550a2898817a6dfd3e26654924/rpds_py-2026.5.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e0b360f316d966b048b085857630b3cc51f3db2f07b06f440eac8f695374d1e3", size = 585633, upload-time = "2026-05-28T12:00:15.66Z" }, + { url = "https://files.pythonhosted.org/packages/da/68/3b2c0a75c9e04125696f84ebdbbf304acf5a40b58ba4481cdb98a922c3ba/rpds_py-2026.5.1-cp313-cp313t-win32.whl", hash = "sha256:a2999883eedf72fdfb7520b92c7d4ec2572a71ff40239377aa604cc529eecafc", size = 210074, upload-time = "2026-05-28T12:00:17.291Z" }, + { url = "https://files.pythonhosted.org/packages/e7/8b/609157d5a25d37d4f29f92840ba531f416907c34ae5c5739dd21fc2bef98/rpds_py-2026.5.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e07be2a9d7122bd6e82dea89814ef8dc893feb1aae97fec1630f3263bbb30e55", size = 228635, upload-time = "2026-05-28T12:00:18.73Z" }, + { url = "https://files.pythonhosted.org/packages/d4/6f/19c1918a4b590d8de87e712e4abe4b3875771eff60216fb6153cf6665c68/rpds_py-2026.5.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:1f2c391c3059798093b65df23aca2cac150460ae9c630d99dec83d703d9485b9", size = 349756, upload-time = "2026-05-28T12:00:20.217Z" }, + { url = "https://files.pythonhosted.org/packages/e5/60/a06fe7da34eca79dacbf958a2ba0c6eea85bc2b29de20080bf40f72f66fa/rpds_py-2026.5.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:413b424f7c4ee65ab5e5be91f5731be0f8b41a1ee2b12dfe810d716312e95a78", size = 343831, upload-time = "2026-05-28T12:00:21.711Z" }, + { url = 
"https://files.pythonhosted.org/packages/bf/ec/b2333b97b90e2a6ef6ca8ad386ee284968e74bcfe113b3f1a8d9036429a9/rpds_py-2026.5.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c595a1d9255dce0599e13130d1440ab2506654f2b50294226ee06402f8fef63", size = 375127, upload-time = "2026-05-28T12:00:23.326Z" }, + { url = "https://files.pythonhosted.org/packages/14/7f/e00aae54067f2b488c4637961d5f58204d470795fc791085fa3f15060d2e/rpds_py-2026.5.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1c27c5f6102eac8c03e7595a00827a53b271ba40a53b59ff8709170e0855ea4a", size = 379034, upload-time = "2026-05-28T12:00:24.89Z" }, + { url = "https://files.pythonhosted.org/packages/be/cc/423999bbb8ae8dc93c77fc1d5e984ade5eb89d237d3bb884ccfa72ae2890/rpds_py-2026.5.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6c7fcf61d44cacecaf3aea542b0e053db77972a4573e7ceda16fb2b399161195", size = 490823, upload-time = "2026-05-28T12:00:26.676Z" }, + { url = "https://files.pythonhosted.org/packages/0f/aa/c671bf660f12e68d3c52ff86c7066ed1372df5a0f4f2ff584e419b8207e7/rpds_py-2026.5.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2c817a189d4ee14290420e5ff051e4dd6baa13f3edf84685071dee07a6d538ee", size = 388144, upload-time = "2026-05-28T12:00:28.577Z" }, + { url = "https://files.pythonhosted.org/packages/19/c8/d63bb75b68afe77b229e3021c6031bcaf01da5db5b0e69d0d10f9ba679a7/rpds_py-2026.5.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21846aac0ed2e0589f38c12dc44e77bb64e494b771eadbcf169cba00566ba7ba", size = 371959, upload-time = "2026-05-28T12:00:30.304Z" }, + { url = "https://files.pythonhosted.org/packages/82/35/c51122014d8274ff37dc606d60049c3db7d83da02b5b282511e5a906a9a6/rpds_py-2026.5.1-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:b317c87a13f769a4e787819bd508aaa5d69aa09b0880de9af6d3a8a54571cdec", size = 383558, upload-time = "2026-05-28T12:00:31.764Z" }, + { 
url = "https://files.pythonhosted.org/packages/e3/f9/2790cb99c136a5363acdeacf5c27c56f3de0d4118a1f48fca83404c99c89/rpds_py-2026.5.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ce87129d9f2c14fa6c4a8601fb80eb4488c80d38a20cd13758ef11123e14995d", size = 402789, upload-time = "2026-05-28T12:00:33.247Z" }, + { url = "https://files.pythonhosted.org/packages/e5/1b/e4fb584f8c75d35c38150ff6a332cda949e6f97acba1f4fd123b14ab56fe/rpds_py-2026.5.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9cdddb6c1207d284d94fd1530adf57fbd797fe7c4b8704ba85f49414f2557e7d", size = 551405, upload-time = "2026-05-28T12:00:34.819Z" }, + { url = "https://files.pythonhosted.org/packages/d8/f7/a6731b4216cb3793ea1af5391da240f5683dacc0d13e034fe5fc3503f240/rpds_py-2026.5.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:4e237e139f94d3c036fd28eb9f564c99055476ff4ff05cd42be55ce349b5aa02", size = 616975, upload-time = "2026-05-28T12:00:36.268Z" }, + { url = "https://files.pythonhosted.org/packages/2c/ea/2e051a81d95d8e63f4b35a1c463a87e8766bc3d083c067c5dfb6bf220747/rpds_py-2026.5.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ed0954b524873214369184a9c82b0eaa45a3fbb9a798cd95b17e0d98499e7ea0", size = 578701, upload-time = "2026-05-28T12:00:37.82Z" }, + { url = "https://files.pythonhosted.org/packages/65/56/b5f6fdb2083e32bca8a8993d89e70db114b4756c9e2c38421328126689d2/rpds_py-2026.5.1-cp314-cp314-win32.whl", hash = "sha256:2d88621d6a7d4dfa633d21abe90f280bb205274e16b1d1e61c6ad4640b2453b7", size = 209806, upload-time = "2026-05-28T12:00:39.492Z" }, + { url = "https://files.pythonhosted.org/packages/fb/80/65a5aa96c155e611d1ed844e4e1f57f3e36b021f396d9f8585d756e6b90d/rpds_py-2026.5.1-cp314-cp314-win_amd64.whl", hash = "sha256:cef8ac28d26f4dda3533060c20fbf80a325458fa9fd23ea72a73cdfa8e978838", size = 225985, upload-time = "2026-05-28T12:00:40.94Z" }, + { url = 
"https://files.pythonhosted.org/packages/27/7c/ad185212e87b05f196daef92bc5f3caf07298eb47c295b5585c3dd3093ac/rpds_py-2026.5.1-cp314-cp314-win_arm64.whl", hash = "sha256:eaaea962c68cdc68d4a533ba985ab8e9484277910bbfaa2ab3ef7732667bfed8", size = 221219, upload-time = "2026-05-28T12:00:43.15Z" }, + { url = "https://files.pythonhosted.org/packages/23/58/e14ae18759020334646b031e708ab4158d653a938822bfb7b95ef2e93aa3/rpds_py-2026.5.1-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:21942f52dbbd5f8758bf021213d28bd45c39e873e65e2407faf5f1846f5761ad", size = 352148, upload-time = "2026-05-28T12:00:44.638Z" }, + { url = "https://files.pythonhosted.org/packages/31/9b/5f4a1e2f960bca3ac5d052b139dd31eed97b259f9d909173821760d542e8/rpds_py-2026.5.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f414556f6e3958300ff941e40c9f97e3dc9774ddd1b3434c475d73dd354bbed3", size = 345196, upload-time = "2026-05-28T12:00:46.14Z" }, + { url = "https://files.pythonhosted.org/packages/1a/71/1d9574d6a2fa20ab60eaa55c7467f5aa20cbc770f341a05f09c0876f59e2/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef1013a8625c74043210190b246f5b1551e09757c1f356c6e4160ef96c5bc081", size = 374981, upload-time = "2026-05-28T12:00:47.531Z" }, + { url = "https://files.pythonhosted.org/packages/0c/9a/37e99f4915a80aa71670263c1267f7ae0af95f53a3f61e6c3bdc016d4515/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cc68e231a77a5f0d774ae278a1f8e55c0456501820847c1e4efb3829f3441df6", size = 379961, upload-time = "2026-05-28T12:00:49.216Z" }, + { url = "https://files.pythonhosted.org/packages/a8/ff/6e73f74b89d2e0715e0fc86b7dde893f9a61ae2f9b256ff3bdfe41ac4e94/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9baffb505aff33acc69b422a19f77806680f3c8632227d79f48de8a810d1c2c5", size = 495965, upload-time = "2026-05-28T12:00:51.111Z" }, + { url = 
"https://files.pythonhosted.org/packages/ea/e0/425faba25f59d74d4638b267f7c7a80e8649d2ef4db10a19b0c4a71e6e6f/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8d2f912928d426e8cfa396f7f3f8d29a59e6689c86dcca3c420730c1096322b", size = 389526, upload-time = "2026-05-28T12:00:52.77Z" }, + { url = "https://files.pythonhosted.org/packages/c6/76/7a41960e3fddae47fab43a28684d5da981401dffd88253de0944148654cb/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90f628283be835db980c941767d41c9a27b5239e54ba0a9c1335247e82406964", size = 376190, upload-time = "2026-05-28T12:00:54.215Z" }, + { url = "https://files.pythonhosted.org/packages/27/60/5f38dc70824fc6951b51d35377e577a3a3a4c81a6769cc5a2de25ebe0ad1/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:1ebb2f0ab7e16132995a72de805170e0203df0c3dd22e1ef1cd1fdd90bd7a131", size = 383921, upload-time = "2026-05-28T12:00:55.673Z" }, + { url = "https://files.pythonhosted.org/packages/60/1a/d60a38caa1505f4b9483c3fbbde12c94e1079154f4f401a6da96f7e77621/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f3df3d16ded76f1f8c9cdebd0e1ea55fdf4c23b812de189814da7cf229c22a81", size = 404766, upload-time = "2026-05-28T12:00:57.518Z" }, + { url = "https://files.pythonhosted.org/packages/87/ff/602fd3f174d6425f0bce05ad0dfbec0e96b38d0f7d08a79af5aa20083885/rpds_py-2026.5.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:9af8905b8f854990e40d5206aa5ac58d9b0fe0b7f351ff2bb086c20f6c8c6a47", size = 551343, upload-time = "2026-05-28T12:00:58.978Z" }, + { url = "https://files.pythonhosted.org/packages/b8/c1/1be13327acdbead3eca1fde03b6a34dbb011f1e864e217f0d32cc1779a7f/rpds_py-2026.5.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:036a36a87fb1cd3b214d11c4b3c4f7d2ddad933625dca1c900b56a057c07740a", size = 618502, upload-time = "2026-05-28T12:01:00.656Z" }, + { url = 
"https://files.pythonhosted.org/packages/f3/d7/afb49b49d7f2be8b7ba1a9f0977fa5168003437b93086726f066544e8351/rpds_py-2026.5.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:62ae3853454fe9ef283a03c96c2d835d39e84b14643a9d62c82ef0fb87d702ca", size = 581916, upload-time = "2026-05-28T12:01:02.22Z" }, + { url = "https://files.pythonhosted.org/packages/25/d1/dbef8c1f8a10f07beb62b5f054e20099fd9924b3ec001b8f0b6ac7813a85/rpds_py-2026.5.1-cp314-cp314t-win32.whl", hash = "sha256:6c3d771a46ec18b12af06ce36243a9a80b07a5d0515236332d90863ca8bb326a", size = 207855, upload-time = "2026-05-28T12:01:03.821Z" }, + { url = "https://files.pythonhosted.org/packages/2a/72/bfa4e61ab8e7dc1c8adf397e05e6cbdd4239357bd72b248d3de662f23915/rpds_py-2026.5.1-cp314-cp314t-win_amd64.whl", hash = "sha256:c93c629be4636cf54337bd5f06c104d55e42ced54d681f6fe21ae510a65116f6", size = 225422, upload-time = "2026-05-28T12:01:05.194Z" }, + { url = "https://files.pythonhosted.org/packages/27/3a/7b5da92b640f67b6717ccafc83cdd06bfa7ff2395c3685c68922bb54d703/rpds_py-2026.5.1-cp315-cp315-macosx_10_12_x86_64.whl", hash = "sha256:3574b55c604b8f75dacb007136508bbc0db406e626301778096a133327e7f2fb", size = 349576, upload-time = "2026-05-28T12:01:06.722Z" }, + { url = "https://files.pythonhosted.org/packages/d7/8a/2aafd7ad355a1bd48ca76e2262b74b15e6432b5a1efe150efd4d779cd55d/rpds_py-2026.5.1-cp315-cp315-macosx_11_0_arm64.whl", hash = "sha256:94068eb3ae6d43f5a786b7db96a406a34e6d5c24489feef32fd6e8946ea7b291", size = 343640, upload-time = "2026-05-28T12:01:08.441Z" }, + { url = "https://files.pythonhosted.org/packages/f7/7d/6c9523c1abbe840a1b7fba3c516d48e1d3487cc80fea4366c4071cf56784/rpds_py-2026.5.1-cp315-cp315-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a5b10e8ce894825f380a8f1b6444cf73c294dfea62afbb2d13e3a9e630cec1", size = 375322, upload-time = "2026-05-28T12:01:09.934Z" }, + { url = 
"https://files.pythonhosted.org/packages/5a/5d/0b7b03fb1dc509321f01de3149784ab773e34c8573022029af8076afcb9c/rpds_py-2026.5.1-cp315-cp315-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fc09f82e63d4bcd58149572f857a431bae851dc747e313c3b5bdf7abb907fda8", size = 379066, upload-time = "2026-05-28T12:01:11.48Z" }, + { url = "https://files.pythonhosted.org/packages/d7/e2/8ef6012999ebf1cb1c22f876d9ce5e63d960fd4631d2af3202d3f480aa25/rpds_py-2026.5.1-cp315-cp315-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e10464d17df3b582745c25cec695cb9558bca2cb6ddb631aee1787fc72c767b2", size = 494586, upload-time = "2026-05-28T12:01:13.051Z" }, + { url = "https://files.pythonhosted.org/packages/80/af/1eeb029bec67582c226b7809172207cd005073af4ebd906e65ff494f4983/rpds_py-2026.5.1-cp315-cp315-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ba05adbf15d994c38ec0b7ab32e858e5110c21e9009a00a86545fd220f84e038", size = 388415, upload-time = "2026-05-28T12:01:14.631Z" }, + { url = "https://files.pythonhosted.org/packages/18/23/ffbe10711c4d766c1cab0557d6906c074f795814863c67b351355d29354a/rpds_py-2026.5.1-cp315-cp315-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77c004fdc7b891967106f78ddfd7b076bfe6813c6139c6fff6aed3bcaa960b26", size = 372427, upload-time = "2026-05-28T12:01:16.153Z" }, + { url = "https://files.pythonhosted.org/packages/bd/3a/30ba4a6ad457e5b070c18d742a33fb77d8d922b565cc881f8a5313d63bfe/rpds_py-2026.5.1-cp315-cp315-manylinux_2_31_riscv64.whl", hash = "sha256:83bcf894486c9d78dd290d3c0124ff6dd8875d3025e2090a8ec49fcc37c55fdd", size = 383615, upload-time = "2026-05-28T12:01:17.809Z" }, + { url = "https://files.pythonhosted.org/packages/d3/69/62e242b53ce39c0814bd24e1a6e6eba6c92be716277745f317f9540a2e7b/rpds_py-2026.5.1-cp315-cp315-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c3df104083952a0e0c6f10de33e440eabe98fb6317d23e1a58c68f6df08d01b9", size = 402786, upload-time = "2026-05-28T12:01:19.419Z" }, + { url = 
"https://files.pythonhosted.org/packages/38/c1/a770b9c186928a1ed0f7e6d7ae50e7f3950ed23e3f9e366dbc8e38cb55de/rpds_py-2026.5.1-cp315-cp315-musllinux_1_2_aarch64.whl", hash = "sha256:980450826cf22e133c57e0835070bdd0dd3f73b9b708c3ce223def2cb9469e14", size = 551583, upload-time = "2026-05-28T12:01:21.013Z" }, + { url = "https://files.pythonhosted.org/packages/21/7c/68e8579b95375b70d2a963103c42e705856cdb98569258bd807f4423891c/rpds_py-2026.5.1-cp315-cp315-musllinux_1_2_i686.whl", hash = "sha256:205dde846f24332ab0c1188699a043b8d165b79bb84529ce272c45048ff6be01", size = 616941, upload-time = "2026-05-28T12:01:22.548Z" }, + { url = "https://files.pythonhosted.org/packages/70/a1/a6135aed5730ff03ab957182259987ac11e55fb392a28dc6f0592048a280/rpds_py-2026.5.1-cp315-cp315-musllinux_1_2_x86_64.whl", hash = "sha256:3966b82dd563176396df030f3dd52a6e54cb69b718e95e78bd555ed3d1e0185d", size = 578349, upload-time = "2026-05-28T12:01:24.118Z" }, + { url = "https://files.pythonhosted.org/packages/09/6e/f24201a76a84e6c49d0bdfdfcb735210e21701e9b21c5bfc0ba497dd62f6/rpds_py-2026.5.1-cp315-cp315-win32.whl", hash = "sha256:7818f8d0a415be74d2be3590b0a1c1f463a642f4d0217e7d10602dceef5b79aa", size = 209922, upload-time = "2026-05-28T12:01:25.522Z" }, + { url = "https://files.pythonhosted.org/packages/9e/e4/966bc240bb0485fc265278f6de44d05834bf0b3618886e0b22e33d54c49a/rpds_py-2026.5.1-cp315-cp315-win_amd64.whl", hash = "sha256:b3cc20c0d800af78fd0fac68086e28c1856cec51ea528bb81ea851aa40d39325", size = 226003, upload-time = "2026-05-28T12:01:27.062Z" }, + { url = "https://files.pythonhosted.org/packages/5c/5c/a15a59269cd5e74472734516c73795c15eccfc841b3d4b0228c3f53f19d0/rpds_py-2026.5.1-cp315-cp315-win_arm64.whl", hash = "sha256:3609e9939a8a76cd904cf98a3f1f13b5dc7e150adeaee89e0ea09652ea213e16", size = 221245, upload-time = "2026-05-28T12:01:28.51Z" }, + { url = 
"https://files.pythonhosted.org/packages/e0/22/135ce03804e179a71ceb13be095deda4a279bc88f7a6b8fa161c5ad44e12/rpds_py-2026.5.1-cp315-cp315t-macosx_10_12_x86_64.whl", hash = "sha256:5d333a7127d4b307601ac37792bee01bb95c867cbfacf21b6375b804d6bbd723", size = 352015, upload-time = "2026-05-28T12:01:30.214Z" }, + { url = "https://files.pythonhosted.org/packages/3b/5f/f1f6d2652eb9d848f6eb369d8db83a2da6249bb49ad2c2a48f45d54538d3/rpds_py-2026.5.1-cp315-cp315t-macosx_11_0_arm64.whl", hash = "sha256:b5f077b44a4f7808520f66dae234988d867deb9aed9be5da057ce9ba831b2a41", size = 345016, upload-time = "2026-05-28T12:01:31.656Z" }, + { url = "https://files.pythonhosted.org/packages/88/66/b74182775691ea2290c99e52ac8d5db844e56fbec90ce421f107658c8314/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55d8f9b7b78c9538fc9e04e82ec0e888ff0c3cffcfad152c77e57cd09351a98a", size = 374775, upload-time = "2026-05-28T12:01:33.136Z" }, + { url = "https://files.pythonhosted.org/packages/ff/8f/15e5a61d9f0a43902d36561d4f07cae6ae9f4716be825159fd72717f33af/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e3a8ae58895ac107ed934a6bf51e5846f95c53b9b940c2c6d310838fd5846358", size = 380270, upload-time = "2026-05-28T12:01:34.574Z" }, + { url = "https://files.pythonhosted.org/packages/02/c3/f859b12763a80540cdf2af0f15b19904cf756a71d7bdd3f82ff3e5b1bbf9/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0957cf3c2b8632ec7aaebffebea8005b353cc2a237b6e2ae3c2cac0820704cfb", size = 495285, upload-time = "2026-05-28T12:01:36.127Z" }, + { url = "https://files.pythonhosted.org/packages/1c/c7/ff27c2ac8411d30b03b1829fd88cae8dad1a4d0da48dd25e57c4038042e6/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c396c1304de421050b3681ea70f371874b54d41b0151e96109758144c231e30b", size = 389581, upload-time = "2026-05-28T12:01:37.635Z" }, + { url = 
"https://files.pythonhosted.org/packages/6e/67/fe92ee32a6cc05c77228a2f8b1762e7124f386ec20ff83d0757b762d58d0/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aad1bff7f666b9598e573815affd666aac6a13a585dde336f843e33350c7fadc", size = 376041, upload-time = "2026-05-28T12:01:39.307Z" }, + { url = "https://files.pythonhosted.org/packages/f8/91/b4d6685c27aba55bd82f25b278be8237038117d05f9659a6213ad3408130/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_31_riscv64.whl", hash = "sha256:656a042550878f12d45752452d47094b7cfe5ad1e9d7b87b5a22ad3ae5ff8015", size = 383946, upload-time = "2026-05-28T12:01:41.043Z" }, + { url = "https://files.pythonhosted.org/packages/bd/79/2c1d832a53c8e0f8e98fc970ec257b950fecd4f62be2ab7182b500a0cbc8/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73c4bd4f70294737b5206a3e8e30ccadbf8a60301831c8ea23eec5dbeea1ecfa", size = 405526, upload-time = "2026-05-28T12:01:43.032Z" }, + { url = "https://files.pythonhosted.org/packages/78/c4/c98117b03c6a8581ab2c2dfccfe9a5ad82bd8128a3c28b46a6ad2d97c393/rpds_py-2026.5.1-cp315-cp315t-musllinux_1_2_aarch64.whl", hash = "sha256:43bca78665423cabae77146f2fe7ce55272b6c8d55d82cca83effd42c7e13972", size = 551165, upload-time = "2026-05-28T12:01:44.648Z" }, + { url = "https://files.pythonhosted.org/packages/3b/c1/bc479ca069200af730881b1bd525e3114b2b391a351509fcb1b772f28086/rpds_py-2026.5.1-cp315-cp315t-musllinux_1_2_i686.whl", hash = "sha256:42d0f20e85e549c870749d0e247f0c10d318a45b7e9676d575d2dcb04a1b2e66", size = 618778, upload-time = "2026-05-28T12:01:46.337Z" }, + { url = "https://files.pythonhosted.org/packages/77/65/38ab2f90df44c2febfb63cc10ced40763d9b4bc94d173e734528663fe7f5/rpds_py-2026.5.1-cp315-cp315t-musllinux_1_2_x86_64.whl", hash = "sha256:b1be5c35683684d5331b93600c210e8367c254683d8a6df6bd21bd2da3a334fb", size = 581839, upload-time = "2026-05-28T12:01:48.109Z" }, + { url = 
"https://files.pythonhosted.org/packages/15/2d/ce1f605fe036aadd460e5822e578c6c7ec3a860936cca37d6e0f299daa77/rpds_py-2026.5.1-cp315-cp315t-win32.whl", hash = "sha256:75808f6c38ce7749bb68cc2770161aae5045e6c6f6781a9782e74b93304399df", size = 207866, upload-time = "2026-05-28T12:01:49.648Z" }, + { url = "https://files.pythonhosted.org/packages/79/cb/966040123eb102371559746908ef2c9471f4d43e17ec9a645a2258dab64b/rpds_py-2026.5.1-cp315-cp315t-win_amd64.whl", hash = "sha256:90bd6630002a1c7f09e7843dd79f0d24f3d2897cc25a753480917865d14f15b3", size = 225441, upload-time = "2026-05-28T12:01:51.408Z" }, +] + [[package]] name = "six" version = "1.17.0" diff --git a/pipeline/workflow/spanner-ingestion-workflow.yaml b/pipeline/workflow/spanner-ingestion-workflow.yaml index d0a878067..4bc2d5c6e 100644 --- a/pipeline/workflow/spanner-ingestion-workflow.yaml +++ b/pipeline/workflow/spanner-ingestion-workflow.yaml @@ -121,38 +121,52 @@ main: run_aggregation_job: params: [import_list, helper_url] steps: - - run_aggregation: + # 1. Start the aggregation and get the initial state + - initiate_aggregation: call: http.post args: - url: ${helper_url + "/aggregation/run"} + url: ${helper_url + "/aggregation/initiate"} timeout: 300 auth: type: OIDC body: importList: ${import_list} - result: aggregation_response - - check_aggregation_status_loop: + result: initiate_response + # 2. Store the state in a workflow variable + - assign_state: + assign: + - state: ${initiate_response.body} + # 3. State Check loop + - check_status_loop: + switch: + # Exit successfully if done + - condition: ${state.status == "SUCCEEDED"} + return: "OK" + # Raise error if failed + - condition: ${state.status == "FAILED"} + raise: ${state.error} + next: poll_and_wait + # 4. 
Sleep and Poll + - poll_and_wait: steps: - - wait_for_aggregation: + - wait_step: call: sys.sleep args: seconds: 300 - - check_aggregation_status: + # Pass the state back to the server, get the new state + - poll_server: call: http.post args: - url: ${helper_url + "/aggregation/status"} + url: ${helper_url + "/aggregation/poll"} auth: type: OIDC - body: - jobIds: ${aggregation_response.body.jobIds} - result: aggregation_status_response - - evaluate_aggregation_status: - switch: - - condition: ${aggregation_status_response.body.status == "DONE"} - return: 'OK' - - condition: ${aggregation_status_response.body.status == "FAILED"} - raise: ${aggregation_status_response.body.error} - next: check_aggregation_status_loop + body: ${state} + result: poll_response + - update_state: + assign: + - state: ${poll_response.body} + next: check_status_loop + # This sub-workflow launches a Dataflow job and waits for it to complete. run_dataflow_job: From d2e3128e3b6bca95090c076be3d6bc66d4823749 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 14:18:09 +0530 Subject: [PATCH 02/33] fix(aggregation): handle PENDING state in polling and short-circuit empty job list - Fixed a bug where a PENDING BigQuery job could cause premature transition to the next stage. - Changed the transition check to strictly require a DONE status. - Added short-circuiting for empty active job lists. - Added a new integration test 'test_aggregation_poll_still_running' to verify PENDING handling. 
--- .../workflow/ingestion-helper/app_test.py | 28 +++++++++++++++++++ .../ingestion-helper/routes/aggregation.py | 13 +++++---- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/app_test.py b/pipeline/workflow/ingestion-helper/app_test.py index d19fdfb7e..12862c5dc 100644 --- a/pipeline/workflow/ingestion-helper/app_test.py +++ b/pipeline/workflow/ingestion-helper/app_test.py @@ -315,6 +315,34 @@ def test_aggregation_poll_transition(self, mock_aggregation_utils): self.assertEqual(state["current_stage"], 2) # Transitioned to 2! self.assertEqual(state["active_job_ids"], ["job-stage2-1"]) + @patch('routes.aggregation._get_orchestrator') + def test_aggregation_poll_still_running(self, mock_aggregation_utils): + # Setup mock orchestrator to simulate jobs still in PENDING state + mock_instance = MagicMock() + mock_aggregation_utils.return_value = mock_instance + + # Mock BQ reporting Stage 1 jobs are PENDING (still executing) + mock_instance.check_jobs_status.return_value = {"status": "PENDING"} + + # Input state + payload = { + "status": "RUNNING", + "current_stage": 1, + "active_job_ids": ["job-1", "job-2"], + "import_list": [{"importName": "USFed_Census"}] + } + + # Call endpoint + response = client.post("/aggregation/poll", json=payload) + + # Assertions + self.assertEqual(response.status_code, 200) + state = response.json() + # Verify state is returned unchanged + self.assertEqual(state["status"], "RUNNING") + self.assertEqual(state["current_stage"], 1) + self.assertEqual(state["active_job_ids"], ["job-1", "job-2"]) + @patch('routes.aggregation._get_orchestrator') def test_aggregation_legacy_run(self, mock_aggregation_utils): # Setup mock orchestrator diff --git a/pipeline/workflow/ingestion-helper/routes/aggregation.py b/pipeline/workflow/ingestion-helper/routes/aggregation.py index ce7434763..5f1f59e90 100644 --- a/pipeline/workflow/ingestion-helper/routes/aggregation.py +++ 
b/pipeline/workflow/ingestion-helper/routes/aggregation.py @@ -128,8 +128,11 @@ def poll_aggregation(state: StateObject): import_names = [item.get('importName') for item in state.import_list if item.get('importName')] # 1. Check status of active jobs in BigQuery - logging.info(f"Polling status for jobs in Stage {state.current_stage}: {state.active_job_ids}") - bq_status = orchestrator.check_jobs_status(state.active_job_ids) + if not state.active_job_ids: + bq_status = {"status": "DONE"} + else: + logging.info(f"Polling status for jobs in Stage {state.current_stage}: {state.active_job_ids}") + bq_status = orchestrator.check_jobs_status(state.active_job_ids) # Case A: Any job failed if bq_status["status"] == "FAILED": @@ -142,9 +145,9 @@ def poll_aggregation(state: StateObject): error=bq_status.get("error") ) - # Case B: Jobs are still running - if bq_status["status"] == "RUNNING": - logging.info(f"Stage {state.current_stage} is still executing.") + # Case B: Jobs are still executing (explicitly check for DONE to transition) + if bq_status["status"] != "DONE": + logging.info(f"Stage {state.current_stage} is still executing (status: {bq_status['status']}).") return state # Return unchanged # Case C: All jobs succeeded -> Find and execute the next active stage From 43f3c56fe2987b2a4c786b13d746a86e7a14bbf2 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 14:20:29 +0530 Subject: [PATCH 03/33] feat(aggregation): calculate max_stage dynamically from configuration - Replaced all hardcoded upper limits of stage '10' in API routes and legacy wrappers. - Dynamically calculate the maximum stage from the loaded aggregations config. - Updated integration tests in app_test.py to mock the aggregations list for correct stage limit evaluation. 
--- pipeline/workflow/ingestion-helper/app_test.py | 2 ++ pipeline/workflow/ingestion-helper/routes/aggregation.py | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/app_test.py b/pipeline/workflow/ingestion-helper/app_test.py index 12862c5dc..f82003676 100644 --- a/pipeline/workflow/ingestion-helper/app_test.py +++ b/pipeline/workflow/ingestion-helper/app_test.py @@ -290,6 +290,7 @@ def test_aggregation_poll_transition(self, mock_aggregation_utils): # Setup mock orchestrator to simulate Stage 1 completion and Stage 2 execution mock_instance = MagicMock() mock_aggregation_utils.return_value = mock_instance + mock_instance.aggregations = [{"stage": 1}, {"stage": 2}] # Mock BQ reporting Stage 1 jobs are DONE mock_instance.check_jobs_status.return_value = {"status": "DONE"} @@ -348,6 +349,7 @@ def test_aggregation_legacy_run(self, mock_aggregation_utils): # Setup mock orchestrator mock_instance = MagicMock() mock_aggregation_utils.return_value = mock_instance + mock_instance.aggregations = [{"stage": 1}, {"stage": 2}] mock_instance.has_stage.side_effect = lambda stage, imports: stage in [1, 2] mock_instance.execute_stage.side_effect = lambda stage, imports: [f"job-stage{stage}-1"] diff --git a/pipeline/workflow/ingestion-helper/routes/aggregation.py b/pipeline/workflow/ingestion-helper/routes/aggregation.py index 5f1f59e90..f28204df9 100644 --- a/pipeline/workflow/ingestion-helper/routes/aggregation.py +++ b/pipeline/workflow/ingestion-helper/routes/aggregation.py @@ -94,7 +94,8 @@ def initiate_aggregation(req: InitiateRequest): # Find the first stage that has active aggregations (usually Stage 1) first_stage = 1 - while first_stage <= 10: # Arbitrary upper limit for safety + max_stage = max((cfg.get("stage", 1) for cfg in orchestrator.aggregations), default=1) + while first_stage <= max_stage: if orchestrator.has_stage(first_stage, import_names): break first_stage += 1 @@ -152,7 +153,8 @@ def 
poll_aggregation(state: StateObject): # Case C: All jobs succeeded -> Find and execute the next active stage next_stage = state.current_stage + 1 - while next_stage <= 10: # Arbitrary upper limit + max_stage = max((cfg.get("stage", 1) for cfg in orchestrator.aggregations), default=1) + while next_stage <= max_stage: if orchestrator.has_stage(next_stage, import_names): logging.info(f"Stage {state.current_stage} completed. Transitioning to Stage {next_stage}...") new_job_ids = orchestrator.execute_stage(next_stage, import_names) @@ -201,7 +203,8 @@ def run_aggregation_legacy(req: LegacyAggregationRequest): # Compatibility Mode: Submit ALL enabled stages in parallel job_ids = [] - for stage_num in range(1, 10): + max_stage = max((cfg.get("stage", 1) for cfg in orchestrator.aggregations), default=1) + for stage_num in range(1, max_stage + 1): if orchestrator.has_stage(stage_num, import_names): job_ids.extend(orchestrator.execute_stage(stage_num, import_names)) From ce20588fa506801a1bc0591f9120ee4c13136b1b Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 14:22:37 +0530 Subject: [PATCH 04/33] style(logging): follow logging best practices and remove module-level side effects - Removed module-level logging.basicConfig call from validator.py. - Added logging.basicConfig inside validator's CLI main() function to ensure logging is configured only during standalone script execution. - Removed global logging.getLogger().setLevel(logging.INFO) call from orchestrator.py to prevent altering root logger levels on module import. 
--- pipeline/workflow/ingestion-helper/aggregation/orchestrator.py | 2 -- pipeline/workflow/ingestion-helper/aggregation/validator.py | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py index db823a8b4..a3fe88efc 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py @@ -24,8 +24,6 @@ from .stat_var_group_generator import StatVarGroupGenerator from .validator import validate_config -logging.getLogger().setLevel(logging.INFO) - class AggregationOrchestrator: """Orchestrates the overall aggregation workflow.""" diff --git a/pipeline/workflow/ingestion-helper/aggregation/validator.py b/pipeline/workflow/ingestion-helper/aggregation/validator.py index e1ac6d2e1..a2ac70a2d 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/validator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/validator.py @@ -23,8 +23,6 @@ import yaml import jsonschema -logging.basicConfig(level=logging.INFO) - # ANSI escape codes for colored terminal output GREEN = "\033[92m" RED = "\033[91m" @@ -83,6 +81,7 @@ def validate_config(config_file_path: str, schema_file_path: str) -> List[Dict[s def main(): """CLI entry point for standalone configuration validation.""" + logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser(description="Validate Data Commons aggregation configuration files against the JSON Schema.") # Resolve default paths relative to this script's directory (aggregation/) From 4ab64686cfc596412c8fa0401cc3a355f59810c4 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 14:45:50 +0530 Subject: [PATCH 05/33] fix(aggregation): fail-fast on unsupported step types and fix validator schema bypass - Added an else block in orchestrator's execute_stage() that raises ValueError for unsupported/unimplemented step 
types (like 'entity'), preventing silent failures. - Removed the early-return validation bypass in validator.py, ensuring that jsonschema strictly validates missing 'aggregations' keys. - Added unit test 'test_execute_stage_unsupported_type' in orchestrator_test.py to verify the ValueError fail-fast routing. - Added unit tests 'test_validate_config_missing_aggregations_key' and 'test_validate_config_empty_file' in validator_test.py to cover the validation fixes. --- .../aggregation/orchestrator.py | 2 ++ .../aggregation/orchestrator_test.py | 23 +++++++++++++++++-- .../ingestion-helper/aggregation/validator.py | 5 ++-- .../aggregation/validator_test.py | 20 ++++++++++++++++ 4 files changed, 45 insertions(+), 5 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py index a3fe88efc..0e7e0a133 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py @@ -135,6 +135,8 @@ def execute_stage(self, stage_num: int, active_imports: List[str]) -> List[str]: elif step_type == "stat_var_groups": step_jobs.extend(self.stat_var_group_generator.run_all(applicable_imports)) + else: + raise ValueError(f"Unsupported or unimplemented aggregation step type: {step_type}") # Collect BQ jobs for job in step_jobs: diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py index 4301944c5..64a4eefe3 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py @@ -53,8 +53,6 @@ """ - - @patch('aggregation.orchestrator.BigQueryExecutor') @patch('aggregation.orchestrator.PlaceAggregationGenerator') @patch('aggregation.orchestrator.StatVarAggregator') @@ -172,6 +170,27 @@ def test_execute_stage_2_with_disabled_and_filtering(self, 
mock_file_open, skip_all_sources_present_check=True ) + @patch('builtins.open') + def test_execute_stage_unsupported_type(self, mock_file_open, *mocks): + """Tests that an unsupported aggregation step type raises ValueError.""" + # Use 'entity' which is valid in schema but unimplemented in orchestrator + unimplemented_config = """ + aggregations: + - type: entity + entity_types: ["MortalityEvent"] + location_props: ["location"] + imports: ["*"] + stage: 1 + """ + mock_file_open.side_effect = self._get_mock_open(unimplemented_config) + + utils = AggregationOrchestrator(connection_id="conn", project_id="proj", instance_id="inst", database_id="db") + + # Running Stage 1 should raise ValueError due to unimplemented "entity" type + with self.assertRaises(ValueError) as ctx: + utils.execute_stage(stage_num=1, active_imports=["USFed_Census"]) + self.assertIn("Unsupported or unimplemented aggregation step type: entity", str(ctx.exception)) + if __name__ == '__main__': unittest.main() diff --git a/pipeline/workflow/ingestion-helper/aggregation/validator.py b/pipeline/workflow/ingestion-helper/aggregation/validator.py index a2ac70a2d..c26427e8f 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/validator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/validator.py @@ -57,9 +57,8 @@ def validate_config(config_file_path: str, schema_file_path: str) -> List[Dict[s logging.error(f"Failed to parse YAML file {config_file_path}: {e}") raise e - if not config or "aggregations" not in config: - logging.warning("Aggregation config is empty or missing 'aggregations' key.") - return [] + if config is None: + config = {} # 2. 
Load JSON Schema try: diff --git a/pipeline/workflow/ingestion-helper/aggregation/validator_test.py b/pipeline/workflow/ingestion-helper/aggregation/validator_test.py index fd38a35cc..34978f58d 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/validator_test.py +++ b/pipeline/workflow/ingestion-helper/aggregation/validator_test.py @@ -284,6 +284,26 @@ def test_validate_config_missing_schema_file(self): with self.assertRaises(FileNotFoundError): validate_config("aggregation.yaml", "non_existent_schema.json") + @patch('builtins.open') + def test_validate_config_missing_aggregations_key(self, mock_file_open): + """Verifies that missing the required 'aggregations' root key raises ValidationError.""" + missing_aggregations_yaml = """ + some_other_key: [] + """ + mock_file_open.side_effect = self._get_mock_open(missing_aggregations_yaml) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config("aggregation.yaml", self.schema_path) + self.assertIn("'aggregations' is a required property", ctx.exception.message) + + @patch('builtins.open') + def test_validate_config_empty_file(self, mock_file_open): + """Verifies that a completely empty configuration file raises ValidationError.""" + empty_yaml = "" + mock_file_open.side_effect = self._get_mock_open(empty_yaml) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config("aggregation.yaml", self.schema_path) + self.assertIn("'aggregations' is a required property", ctx.exception.message) + if __name__ == '__main__': unittest.main() From 1c011963b2c95b8c9e9f88bf7cccb957af286758 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 14:52:03 +0530 Subject: [PATCH 06/33] perf(aggregation): optimize stage execution and resolve sparse stage loop performance issue - Added a new get_active_stages() helper in orchestrator.py that returns a sorted list of unique active and enabled stage numbers. 
- Replaced all sequential while and range loops in routes/aggregation.py with direct list-comprehension jumps using get_active_stages(). - This completely resolves the performance spike and timeout risk when a very large stage number (e.g. 100 million) is configured. - Added unit test 'test_get_active_stages' in orchestrator_test.py. - Updated and significantly simplified integration test mocks in app_test.py to mock get_active_stages() instead of aggregations/has_stage. --- .../aggregation/orchestrator.py | 20 ++++++++- .../aggregation/orchestrator_test.py | 21 +++++++++ .../workflow/ingestion-helper/app_test.py | 10 ++--- .../ingestion-helper/routes/aggregation.py | 45 +++++++++---------- 4 files changed, 64 insertions(+), 32 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py index 0e7e0a133..de6b7e0da 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py @@ -165,9 +165,27 @@ def has_stage(self, stage_num: int, active_imports: List[str]) -> bool: # Check if it applies to any active imports if self._get_applicable_imports(config, active_imports): return True - + return False + def get_active_stages(self, active_imports: List[str]) -> List[int]: + """Returns a sorted list of unique, active, and enabled stage numbers. + + Args: + active_imports: The list of active import names. + + Returns: + A sorted list of unique active stage numbers. + """ + stages = set() + for config in self.aggregations: + if config.get("disabled", False): + continue + # Check if it applies to any active imports + if self._get_applicable_imports(config, active_imports): + stages.add(config.get("stage", 1)) + return sorted(list(stages)) + def check_jobs_status(self, job_ids: List[str]) -> Dict[str, Any]: """Checks the status of the specified BigQuery job IDs. 
diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py index 64a4eefe3..2f9720995 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py @@ -98,6 +98,27 @@ def test_has_stage(self, mock_file_open, *mocks): # Stage 3: Does not exist in config self.assertFalse(utils.has_stage(3, ["USFed_Census"])) + @patch('builtins.open') + def test_get_active_stages(self, mock_file_open, *mocks): + """Tests that get_active_stages correctly extracts, filters, and sorts active stages.""" + mock_file_open.side_effect = self._get_mock_open(VALID_CONFIG_YAML) + + utils = AggregationOrchestrator(connection_id="conn", project_id="proj", instance_id="inst", database_id="db") + + # 1. For active import "USFed_Census": + # Stage 1 (linked_edges, place) and Stage 2 (stat_var) have active steps. + # The place rollup in Stage 2 is disabled, but the stat_var step is enabled and active. + # Therefore, active stages should be [1, 2]. + stages = utils.get_active_stages(active_imports=["USFed_Census"]) + self.assertEqual(stages, [1, 2]) + + # 2. For active import "OtherImport": + # Stage 1 (linked_edges) matches via wildcard. + # Stage 2 (place rollup is disabled, stat_var does not match "OtherImport"). + # Therefore, only Stage 1 is active. Active stages should be [1]. 
+ stages = utils.get_active_stages(active_imports=["OtherImport"]) + self.assertEqual(stages, [1]) + @patch('builtins.open') def test_execute_stage_1(self, mock_file_open, mock_svg_gen, mock_prov_gen, mock_edge_gen, diff --git a/pipeline/workflow/ingestion-helper/app_test.py b/pipeline/workflow/ingestion-helper/app_test.py index f82003676..65da0ca73 100644 --- a/pipeline/workflow/ingestion-helper/app_test.py +++ b/pipeline/workflow/ingestion-helper/app_test.py @@ -268,7 +268,7 @@ def test_aggregation_initiate_success(self, mock_aggregation_utils): # Setup mock orchestrator mock_instance = MagicMock() mock_aggregation_utils.return_value = mock_instance - mock_instance.has_stage.side_effect = lambda stage, imports: stage == 1 + mock_instance.get_active_stages.return_value = [1] mock_instance.execute_stage.return_value = ["job-1", "job-2"] # Call endpoint @@ -290,12 +290,10 @@ def test_aggregation_poll_transition(self, mock_aggregation_utils): # Setup mock orchestrator to simulate Stage 1 completion and Stage 2 execution mock_instance = MagicMock() mock_aggregation_utils.return_value = mock_instance - mock_instance.aggregations = [{"stage": 1}, {"stage": 2}] + mock_instance.get_active_stages.return_value = [1, 2] # Mock BQ reporting Stage 1 jobs are DONE mock_instance.check_jobs_status.return_value = {"status": "DONE"} - # Mock Stage 2 existence and execution - mock_instance.has_stage.side_effect = lambda stage, imports: stage == 2 mock_instance.execute_stage.return_value = ["job-stage2-1"] # Input state (Stage 1 completed) @@ -321,6 +319,7 @@ def test_aggregation_poll_still_running(self, mock_aggregation_utils): # Setup mock orchestrator to simulate jobs still in PENDING state mock_instance = MagicMock() mock_aggregation_utils.return_value = mock_instance + mock_instance.get_active_stages.return_value = [1] # Mock BQ reporting Stage 1 jobs are PENDING (still executing) mock_instance.check_jobs_status.return_value = {"status": "PENDING"} @@ -349,8 +348,7 @@ def 
test_aggregation_legacy_run(self, mock_aggregation_utils): # Setup mock orchestrator mock_instance = MagicMock() mock_aggregation_utils.return_value = mock_instance - mock_instance.aggregations = [{"stage": 1}, {"stage": 2}] - mock_instance.has_stage.side_effect = lambda stage, imports: stage in [1, 2] + mock_instance.get_active_stages.return_value = [1, 2] mock_instance.execute_stage.side_effect = lambda stage, imports: [f"job-stage{stage}-1"] # Call legacy endpoint diff --git a/pipeline/workflow/ingestion-helper/routes/aggregation.py b/pipeline/workflow/ingestion-helper/routes/aggregation.py index f28204df9..5860e6235 100644 --- a/pipeline/workflow/ingestion-helper/routes/aggregation.py +++ b/pipeline/workflow/ingestion-helper/routes/aggregation.py @@ -92,17 +92,13 @@ def initiate_aggregation(req: InitiateRequest): orchestrator = _get_orchestrator() import_names = [item.get('importName') for item in req.importList if item.get('importName')] - # Find the first stage that has active aggregations (usually Stage 1) - first_stage = 1 - max_stage = max((cfg.get("stage", 1) for cfg in orchestrator.aggregations), default=1) - while first_stage <= max_stage: - if orchestrator.has_stage(first_stage, import_names): - break - first_stage += 1 - else: + active_stages = orchestrator.get_active_stages(import_names) + if not active_stages: logging.info("No stages have active aggregations for the current imports. 
Completing immediately.") return StateObject(status="SUCCEEDED", current_stage=0, active_job_ids=[], import_list=req.importList) + first_stage = active_stages[0] + logging.info(f"Initiating aggregation at Stage {first_stage}") job_ids = orchestrator.execute_stage(first_stage, import_names) @@ -152,19 +148,19 @@ def poll_aggregation(state: StateObject): return state # Return unchanged # Case C: All jobs succeeded -> Find and execute the next active stage - next_stage = state.current_stage + 1 - max_stage = max((cfg.get("stage", 1) for cfg in orchestrator.aggregations), default=1) - while next_stage <= max_stage: - if orchestrator.has_stage(next_stage, import_names): - logging.info(f"Stage {state.current_stage} completed. Transitioning to Stage {next_stage}...") - new_job_ids = orchestrator.execute_stage(next_stage, import_names) - return StateObject( - status="RUNNING", - current_stage=next_stage, - active_job_ids=new_job_ids, - import_list=state.import_list - ) - next_stage += 1 + active_stages = orchestrator.get_active_stages(import_names) + next_stages = [s for s in active_stages if s > state.current_stage] + + if next_stages: + next_stage = next_stages[0] + logging.info(f"Stage {state.current_stage} completed. 
Transitioning to Stage {next_stage}...") + new_job_ids = orchestrator.execute_stage(next_stage, import_names) + return StateObject( + status="RUNNING", + current_stage=next_stage, + active_job_ids=new_job_ids, + import_list=state.import_list + ) # If we exit the loop, there are no more active stages left logging.info("All aggregation stages completed successfully!") @@ -203,10 +199,9 @@ def run_aggregation_legacy(req: LegacyAggregationRequest): # Compatibility Mode: Submit ALL enabled stages in parallel job_ids = [] - max_stage = max((cfg.get("stage", 1) for cfg in orchestrator.aggregations), default=1) - for stage_num in range(1, max_stage + 1): - if orchestrator.has_stage(stage_num, import_names): - job_ids.extend(orchestrator.execute_stage(stage_num, import_names)) + active_stages = orchestrator.get_active_stages(import_names) + for stage_num in active_stages: + job_ids.extend(orchestrator.execute_stage(stage_num, import_names)) return LegacyAggregationResponse(status=ResponseStatus.SUBMITTED, jobIds=job_ids) except Exception as e: From 92ae624f0e430ae112e220c834733ded7cc6994e Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 15:00:25 +0530 Subject: [PATCH 07/33] refactor(aggregation): rename StateObject to AggregationWorkflowState and add docstring - Renamed the generic StateObject class to AggregationWorkflowState to better reflect its purpose in representing the state of the multi-stage aggregation workflow. - Added a comprehensive, professional docstring to AggregationWorkflowState explaining its role in the stateless polling loop coordinated by Google Cloud Workflows. - Updated all type annotations, route definitions, and return statements inside routes/aggregation.py. - Verified that no other files in the workspace referenced the old class name directly, and that all 30 tests continue to pass 100%. 
--- .../ingestion-helper/routes/aggregation.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/routes/aggregation.py b/pipeline/workflow/ingestion-helper/routes/aggregation.py index 5860e6235..60c254e86 100644 --- a/pipeline/workflow/ingestion-helper/routes/aggregation.py +++ b/pipeline/workflow/ingestion-helper/routes/aggregation.py @@ -26,7 +26,13 @@ # Pydantic Models for the New Stateless API # ============================================================================= -class StateObject(BaseModel): +class AggregationWorkflowState(BaseModel): + """Represents the execution state of a multi-stage aggregation pipeline run. + + This state object is passed back and forth between the client (Google Cloud + Workflows) and the helper service endpoints to durably maintain the progress + of a stateless, sequential aggregation run across multiple stages. + """ status: str = Field(..., description="Overall status of the run: RUNNING, SUCCEEDED, FAILED") current_stage: int = Field(..., description="The stage currently executing") active_job_ids: List[str] = Field(default_factory=list, description="BQ job IDs running in the current stage") @@ -80,13 +86,13 @@ def _get_orchestrator() -> AggregationOrchestrator: # New Stateless API Endpoints (Stage-based) # ----------------------------------------------------------------------------- -@router.post("/initiate", response_model=StateObject) +@router.post("/initiate", response_model=AggregationWorkflowState) @log_start def initiate_aggregation(req: InitiateRequest): """Initiates the aggregation run by executing Stage 1 and returning the initial state.""" if not req.importList: logging.info("Empty import list. 
Skipping aggregation.") - return StateObject(status="SUCCEEDED", current_stage=0, active_job_ids=[], import_list=[]) + return AggregationWorkflowState(status="SUCCEEDED", current_stage=0, active_job_ids=[], import_list=[]) try: orchestrator = _get_orchestrator() @@ -95,14 +101,14 @@ def initiate_aggregation(req: InitiateRequest): active_stages = orchestrator.get_active_stages(import_names) if not active_stages: logging.info("No stages have active aggregations for the current imports. Completing immediately.") - return StateObject(status="SUCCEEDED", current_stage=0, active_job_ids=[], import_list=req.importList) + return AggregationWorkflowState(status="SUCCEEDED", current_stage=0, active_job_ids=[], import_list=req.importList) first_stage = active_stages[0] logging.info(f"Initiating aggregation at Stage {first_stage}") job_ids = orchestrator.execute_stage(first_stage, import_names) - return StateObject( + return AggregationWorkflowState( status="RUNNING", current_stage=first_stage, active_job_ids=job_ids, @@ -113,9 +119,9 @@ def initiate_aggregation(req: InitiateRequest): raise HTTPException(status_code=500, detail=f"Failed to initiate aggregation: {str(e)}") -@router.post("/poll", response_model=StateObject) +@router.post("/poll", response_model=AggregationWorkflowState) @log_start -def poll_aggregation(state: StateObject): +def poll_aggregation(state: AggregationWorkflowState): """Checks progress of active jobs and transitions to the next stage if complete.""" if state.status != "RUNNING": return state # Already in a terminal state @@ -134,7 +140,7 @@ def poll_aggregation(state: StateObject): # Case A: Any job failed if bq_status["status"] == "FAILED": logging.error(f"Stage {state.current_stage} failed with error: {bq_status.get('error')}") - return StateObject( + return AggregationWorkflowState( status="FAILED", current_stage=state.current_stage, active_job_ids=[], @@ -155,7 +161,7 @@ def poll_aggregation(state: StateObject): next_stage = next_stages[0] 
logging.info(f"Stage {state.current_stage} completed. Transitioning to Stage {next_stage}...") new_job_ids = orchestrator.execute_stage(next_stage, import_names) - return StateObject( + return AggregationWorkflowState( status="RUNNING", current_stage=next_stage, active_job_ids=new_job_ids, @@ -164,7 +170,7 @@ def poll_aggregation(state: StateObject): # If we exit the loop, there are no more active stages left logging.info("All aggregation stages completed successfully!") - return StateObject( + return AggregationWorkflowState( status="SUCCEEDED", current_stage=state.current_stage, active_job_ids=[], @@ -173,7 +179,7 @@ def poll_aggregation(state: StateObject): except Exception as e: logging.error(f"Error during polling: {e}") - return StateObject( + return AggregationWorkflowState( status="FAILED", current_stage=state.current_stage, active_job_ids=[], From 0aa172ad0f079eef6a2e908557b6af269c7b53a2 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 15:13:18 +0530 Subject: [PATCH 08/33] refactor(aggregation): revert legacy prefixes to original names with temporary TODOs - Reverted all 'Legacy' prefix names on compatibility Pydantic models (e.g. AggregationRequest, AggregationResponse) back to their original names, ensuring perfect backward compatibility for client-side code generators. - Reverted compatibility route method names back to their original names (run_aggregation and get_aggregation_status). - Added deprecated=True to the FastAPI route decorators for /run and /status to natively flag them in the OpenAPI/Swagger documentation UI. - Added clear TODO comments and docstrings advising that these are temporary compatibility components to be removed once all consumers migrate. - Renamed the test case in app_test.py to test_aggregation_run to match the method name. 
--- .../workflow/ingestion-helper/app_test.py | 2 +- .../ingestion-helper/routes/aggregation.py | 56 +++++++++++-------- 2 files changed, 35 insertions(+), 23 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/app_test.py b/pipeline/workflow/ingestion-helper/app_test.py index 65da0ca73..7a1a464cc 100644 --- a/pipeline/workflow/ingestion-helper/app_test.py +++ b/pipeline/workflow/ingestion-helper/app_test.py @@ -344,7 +344,7 @@ def test_aggregation_poll_still_running(self, mock_aggregation_utils): self.assertEqual(state["active_job_ids"], ["job-1", "job-2"]) @patch('routes.aggregation._get_orchestrator') - def test_aggregation_legacy_run(self, mock_aggregation_utils): + def test_aggregation_run(self, mock_aggregation_utils): # Setup mock orchestrator mock_instance = MagicMock() mock_aggregation_utils.return_value = mock_instance diff --git a/pipeline/workflow/ingestion-helper/routes/aggregation.py b/pipeline/workflow/ingestion-helper/routes/aggregation.py index 60c254e86..d08283124 100644 --- a/pipeline/workflow/ingestion-helper/routes/aggregation.py +++ b/pipeline/workflow/ingestion-helper/routes/aggregation.py @@ -43,19 +43,24 @@ class InitiateRequest(BaseModel): importList: List[Dict[str, Any]] = Field(default_factory=list) # ============================================================================= -# Pydantic Models for the Legacy API (Backward Compatibility) +# Pydantic Models for Backward Compatibility (Temporary) +# TODO: Remove these models once all consumers migrate to /initiate and /poll # ============================================================================= -class LegacyAggregationRequest(BaseModel): +class AggregationRequest(BaseModel): + """Temporary request model for compatibility run endpoint.""" importList: List[Dict[str, Any]] = Field(default_factory=list) -class LegacyAggregationStatusRequest(BaseModel): +class AggregationStatusRequest(BaseModel): + """Temporary request model for compatibility status endpoint.""" jobIds: 
List[str] = Field(default_factory=list) -class LegacyAggregationResponse(BaseResponse): +class AggregationResponse(BaseResponse): + """Temporary response model for compatibility run endpoint.""" jobIds: List[str] = Field(default_factory=list, description="BigQuery job IDs submitted for async aggregation") -class LegacyAggregationStatusResponse(BaseResponse): +class AggregationStatusResponse(BaseResponse): + """Temporary response model for compatibility status endpoint.""" error: Optional[str] = Field(default=None, description="Detailed error message if failed") failedJobs: Optional[List[str]] = Field(default_factory=list, description="List of failed BigQuery job IDs") @@ -188,16 +193,20 @@ def poll_aggregation(state: AggregationWorkflowState): ) # ----------------------------------------------------------------------------- -# Legacy API Endpoints (Backward Compatibility Mode) +# API Endpoints for Backward Compatibility (Temporary) +# TODO: Remove these endpoints once all consumers migrate to /initiate and /poll # ----------------------------------------------------------------------------- -@router.post("/run", response_model=LegacyAggregationResponse) +@router.post("/run", response_model=AggregationResponse, deprecated=True) @log_start -def run_aggregation_legacy(req: LegacyAggregationRequest): - """Legacy endpoint. Runs ALL enabled aggregations in parallel (ignores stages).""" +def run_aggregation(req: AggregationRequest): + """Temporary endpoint. Runs ALL enabled aggregations in parallel (ignores stages). + + Please migrate to /initiate and /poll endpoints. + """ if not req.importList: - logging.info("Empty import list. Skipping legacy aggregation.") - return LegacyAggregationResponse(status=ResponseStatus.SUBMITTED, jobIds=[]) + logging.info("Empty import list. 
Skipping temporary aggregation.") + return AggregationResponse(status=ResponseStatus.SUBMITTED, jobIds=[]) try: orchestrator = _get_orchestrator() @@ -209,28 +218,31 @@ def run_aggregation_legacy(req: LegacyAggregationRequest): for stage_num in active_stages: job_ids.extend(orchestrator.execute_stage(stage_num, import_names)) - return LegacyAggregationResponse(status=ResponseStatus.SUBMITTED, jobIds=job_ids) + return AggregationResponse(status=ResponseStatus.SUBMITTED, jobIds=job_ids) except Exception as e: - logging.error(f"Legacy aggregation failed: {e}") - raise HTTPException(status_code=500, detail=f"Legacy aggregation failed: {str(e)}") + logging.error(f"Temporary aggregation failed: {e}") + raise HTTPException(status_code=500, detail=f"Temporary aggregation failed: {str(e)}") -@router.post("/status", response_model=LegacyAggregationStatusResponse) +@router.post("/status", response_model=AggregationStatusResponse, deprecated=True) @log_start -def check_aggregation_status_legacy(req: LegacyAggregationStatusRequest): - """Legacy endpoint. Checks the status of the submitted BigQuery jobs.""" +def get_aggregation_status(req: AggregationStatusRequest): + """Temporary endpoint. Checks the status of the submitted BigQuery jobs. + + Please migrate to /initiate and /poll endpoints. + """ if not req.jobIds: logging.info("Empty jobIds. 
Returning status DONE.") - return LegacyAggregationStatusResponse(status=ResponseStatus.DONE) + return AggregationStatusResponse(status=ResponseStatus.DONE) try: orchestrator = _get_orchestrator() status_info = orchestrator.check_jobs_status(req.jobIds) - return LegacyAggregationStatusResponse( + return AggregationStatusResponse( status=ResponseStatus.from_str(status_info.get("status", "ERROR")), error=status_info.get("error"), - failedJobs=status_info.get("failedJobs", []) + failedJobs=status_info.get("failed_jobs", []) ) except Exception as e: - logging.error(f"Legacy status check failed: {e}") - raise HTTPException(status_code=500, detail=f"Legacy status check failed: {str(e)}") + logging.error(f"Temporary check status failed: {e}") + raise HTTPException(status_code=500, detail=f"Temporary check status failed: {str(e)}") From 8562747a528af15c0b024ea9754fcffb3c1093a2 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 15:15:48 +0530 Subject: [PATCH 09/33] style(aggregation): remove redundant decorative comment headers - Deleted redundant ASCII box section dividers and decorative headers (e.g. Pydantic Models for the New Stateless API, Router Definition) from routes/aggregation.py. - Simplified backward compatibility section markers to clean, single-line TODO comments. - This removes visual noise and aligns the file with clean pythonic commenting best practices. 
--- .../ingestion-helper/routes/aggregation.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/routes/aggregation.py b/pipeline/workflow/ingestion-helper/routes/aggregation.py index d08283124..0f9d5478b 100644 --- a/pipeline/workflow/ingestion-helper/routes/aggregation.py +++ b/pipeline/workflow/ingestion-helper/routes/aggregation.py @@ -22,9 +22,7 @@ from aggregation import AggregationOrchestrator from utils.logging import log_start -# ============================================================================= -# Pydantic Models for the New Stateless API -# ============================================================================= + class AggregationWorkflowState(BaseModel): """Represents the execution state of a multi-stage aggregation pipeline run. @@ -42,10 +40,7 @@ class AggregationWorkflowState(BaseModel): class InitiateRequest(BaseModel): importList: List[Dict[str, Any]] = Field(default_factory=list) -# ============================================================================= -# Pydantic Models for Backward Compatibility (Temporary) # TODO: Remove these models once all consumers migrate to /initiate and /poll -# ============================================================================= class AggregationRequest(BaseModel): """Temporary request model for compatibility run endpoint.""" @@ -64,9 +59,7 @@ class AggregationStatusResponse(BaseResponse): error: Optional[str] = Field(default=None, description="Detailed error message if failed") failedJobs: Optional[List[str]] = Field(default_factory=list, description="List of failed BigQuery job IDs") -# ============================================================================= -# Router Definition -# ============================================================================= + router = APIRouter(prefix="/aggregation", tags=["aggregation"]) @@ -87,9 +80,7 @@ def _get_orchestrator() -> AggregationOrchestrator: 
is_base_dc=config.IS_BASE_DC, ) -# ----------------------------------------------------------------------------- -# New Stateless API Endpoints (Stage-based) -# ----------------------------------------------------------------------------- + @router.post("/initiate", response_model=AggregationWorkflowState) @log_start @@ -192,10 +183,7 @@ def poll_aggregation(state: AggregationWorkflowState): error=f"Orchestrator error: {str(e)}" ) -# ----------------------------------------------------------------------------- -# API Endpoints for Backward Compatibility (Temporary) # TODO: Remove these endpoints once all consumers migrate to /initiate and /poll -# ----------------------------------------------------------------------------- @router.post("/run", response_model=AggregationResponse, deprecated=True) @log_start From f02540bf1c96a1c7b5cae47027f38cff271db5a6 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 15:23:24 +0530 Subject: [PATCH 10/33] refactor(aggregation): localize TODO comments for temporary models and endpoints - Replaced the global section-level TODO comments in routes/aggregation.py with localized, specific TODO comments. - Placed an explicit, actionable TODO comment directly above each of the four compatibility models (AggregationRequest, AggregationStatusRequest, etc.). - Placed an explicit TODO comment directly above the /run and /status route decorators. - This ensures technical debt is highly visible, actionable, and tied directly to the specific components slated for removal after consumer migration. 
--- .../workflow/ingestion-helper/routes/aggregation.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/routes/aggregation.py b/pipeline/workflow/ingestion-helper/routes/aggregation.py index 0f9d5478b..f15e8a4e1 100644 --- a/pipeline/workflow/ingestion-helper/routes/aggregation.py +++ b/pipeline/workflow/ingestion-helper/routes/aggregation.py @@ -40,20 +40,25 @@ class AggregationWorkflowState(BaseModel): class InitiateRequest(BaseModel): importList: List[Dict[str, Any]] = Field(default_factory=list) -# TODO: Remove these models once all consumers migrate to /initiate and /poll - +# TODO: Remove AggregationRequest once all consumers migrate to /initiate and /poll class AggregationRequest(BaseModel): """Temporary request model for compatibility run endpoint.""" importList: List[Dict[str, Any]] = Field(default_factory=list) + +# TODO: Remove AggregationStatusRequest once all consumers migrate to /initiate and /poll class AggregationStatusRequest(BaseModel): """Temporary request model for compatibility status endpoint.""" jobIds: List[str] = Field(default_factory=list) + +# TODO: Remove AggregationResponse once all consumers migrate to /initiate and /poll class AggregationResponse(BaseResponse): """Temporary response model for compatibility run endpoint.""" jobIds: List[str] = Field(default_factory=list, description="BigQuery job IDs submitted for async aggregation") + +# TODO: Remove AggregationStatusResponse once all consumers migrate to /initiate and /poll class AggregationStatusResponse(BaseResponse): """Temporary response model for compatibility status endpoint.""" error: Optional[str] = Field(default=None, description="Detailed error message if failed") @@ -183,8 +188,7 @@ def poll_aggregation(state: AggregationWorkflowState): error=f"Orchestrator error: {str(e)}" ) -# TODO: Remove these endpoints once all consumers migrate to /initiate and /poll - +# TODO: Remove the /run endpoint once all 
consumers migrate to /initiate and /poll @router.post("/run", response_model=AggregationResponse, deprecated=True) @log_start def run_aggregation(req: AggregationRequest): @@ -212,6 +216,7 @@ def run_aggregation(req: AggregationRequest): raise HTTPException(status_code=500, detail=f"Temporary aggregation failed: {str(e)}") +# TODO: Remove the /status endpoint once all consumers migrate to /initiate and /poll @router.post("/status", response_model=AggregationStatusResponse, deprecated=True) @log_start def get_aggregation_status(req: AggregationStatusRequest): From 41494ef47dd684f159f20f05dc7fbc6ff16a4208 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 15:26:28 +0530 Subject: [PATCH 11/33] refactor(aggregation): fix PEP 8 formatting and clean up unused imports - Fixed a PEP 8 E701 violation in orchestrator.py by splitting a single-line 'if' statement into a standard multi-line block. - Resolved a Ruff F401 unused import warning in app_test.py by removing 'import os'. - Verified that all 30 tests continue to pass 100% after the style cleanup. 
--- pipeline/workflow/ingestion-helper/aggregation/orchestrator.py | 3 ++- pipeline/workflow/ingestion-helper/app_test.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py index de6b7e0da..de9dcc597 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py @@ -115,7 +115,8 @@ def execute_stage(self, stage_num: int, active_imports: List[str]) -> List[str]: destination_type=config["destination_type"], allow_multiple_to_places=config.get("allow_multiple_to_places", False) ) - if job: step_jobs.append(job) + if job: + step_jobs.append(job) elif step_type == "stat_var": sv_jobs = self.stat_var_aggregator.aggregate_stat_vars( diff --git a/pipeline/workflow/ingestion-helper/app_test.py b/pipeline/workflow/ingestion-helper/app_test.py index 7a1a464cc..acf0b7796 100644 --- a/pipeline/workflow/ingestion-helper/app_test.py +++ b/pipeline/workflow/ingestion-helper/app_test.py @@ -15,7 +15,6 @@ import unittest from unittest.mock import MagicMock, patch from datetime import datetime -import os from fastapi.testclient import TestClient from app import app From a753fd7a922354c3c4ca285db51cb9fe4da182a5 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 16:17:51 +0530 Subject: [PATCH 12/33] refactor(aggregation): improve orchestrator design, logging, and layout --- .../aggregation/orchestrator.py | 117 ++++++++++++------ 1 file changed, 77 insertions(+), 40 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py index de9dcc597..cfb931125 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py @@ -57,12 +57,7 @@ def __init__(self, location=location, 
run_sequential=False) - # Initialize all generators - self.place_generator = PlaceAggregationGenerator(self.executor, is_base_dc) - self.stat_var_aggregator = StatVarAggregator(self.executor, is_base_dc) - self.linked_edge_generator = LinkedEdgeGenerator(self.executor, is_base_dc) - self.provenance_summary_generator = ProvenanceSummaryGenerator(self.executor, is_base_dc) - self.stat_var_group_generator = StatVarGroupGenerator(self.executor, is_base_dc) + self.is_base_dc = is_base_dc # Resolve paths for default config and schema curr_dir = os.path.dirname(os.path.abspath(__file__)) @@ -73,8 +68,6 @@ def __init__(self, # Load and validate configuration self.aggregations = validate_config(config_file_path, schema_file_path) - - def execute_stage(self, stage_num: int, active_imports: List[str]) -> List[str]: """Executes all enabled aggregations in the specified stage in parallel. @@ -85,7 +78,8 @@ def execute_stage(self, stage_num: int, active_imports: List[str]) -> List[str]: Returns: A list of BigQuery job IDs submitted for this stage. """ - logging.info(f"Executing Aggregation Stage {stage_num} for active imports: {active_imports}") + logging.info(f"=== Starting Aggregation Orchestration for Stage {stage_num} ===") + logging.info(f"Active imports in this run: {active_imports}") jobs = [] for config in self.aggregations: @@ -100,42 +94,23 @@ def execute_stage(self, stage_num: int, active_imports: List[str]) -> List[str]: # 3. Filter by active imports applicable_imports = self._get_applicable_imports(config, active_imports) if not applicable_imports: - logging.info(f"Skipping step '{config['type']}' in Stage {stage_num}: no matching active imports.") continue - # 4. Route to correct generator + # 4. 
Route to correct generator helper step_type = config["type"] - logging.info(f"Submitting step '{step_type}' in Stage {stage_num} for imports: {applicable_imports}") + logging.info(f"Triggering step '{step_type}' in Stage {stage_num}...") step_jobs = [] if step_type == "place": - job = self.place_generator.aggregate_places( - import_names=applicable_imports, - source_type=config["source_type"], - destination_type=config["destination_type"], - allow_multiple_to_places=config.get("allow_multiple_to_places", False) - ) - if job: - step_jobs.append(job) - + step_jobs = self._trigger_place(config, applicable_imports) elif step_type == "stat_var": - sv_jobs = self.stat_var_aggregator.aggregate_stat_vars( - ancestor_sv=config["ancestor_sv_id"], - source_svs=config["source_sv_ids"], - import_names=applicable_imports, - output_import_name=config.get("output_import_name"), - skip_all_sources_present_check=config.get("skip_all_sources_present_check", False) - ) - step_jobs.extend(sv_jobs) - + step_jobs = self._trigger_stat_var(config, applicable_imports) elif step_type == "linked_edges": - step_jobs.extend(self.linked_edge_generator.run_all(applicable_imports)) - + step_jobs = self._trigger_linked_edges(config, applicable_imports) elif step_type == "provenance_summary": - step_jobs.extend(self.provenance_summary_generator.run_all(applicable_imports)) - + step_jobs = self._trigger_provenance_summary(config, applicable_imports) elif step_type == "stat_var_groups": - step_jobs.extend(self.stat_var_group_generator.run_all(applicable_imports)) + step_jobs = self._trigger_stat_var_groups(config, applicable_imports) else: raise ValueError(f"Unsupported or unimplemented aggregation step type: {step_type}") @@ -144,7 +119,7 @@ def execute_stage(self, stage_num: int, active_imports: List[str]) -> List[str]: if job and job.job_id: jobs.append(job.job_id) - logging.info(f"Submitted {len(jobs)} jobs in Stage {stage_num}: {jobs}") + logging.info(f"=== Stage {stage_num} initiated 
successfully. Submitted {len(jobs)} BigQuery jobs: {jobs} ===") return jobs def has_stage(self, stage_num: int, active_imports: List[str]) -> bool: @@ -180,12 +155,24 @@ def get_active_stages(self, active_imports: List[str]) -> List[int]: """ stages = set() for config in self.aggregations: + step_type = config.get("type") + stage_num = config.get("stage", 1) + if config.get("disabled", False): + logging.info(f"[Config Scan] Skipping step '{step_type}' in Stage {stage_num} because it is disabled.") continue - # Check if it applies to any active imports - if self._get_applicable_imports(config, active_imports): - stages.add(config.get("stage", 1)) - return sorted(list(stages)) + + applicable_imports = self._get_applicable_imports(config, active_imports) + if not applicable_imports: + logging.info(f"[Config Scan] Skipping step '{step_type}' in Stage {stage_num} because it does not apply to active imports: {active_imports}.") + continue + + logging.info(f"[Config Scan] Step '{step_type}' in Stage {stage_num} is ACTIVE for imports: {applicable_imports}.") + stages.add(stage_num) + + sorted_stages = sorted(list(stages)) + logging.info(f"[Config Scan] Active stages resolved: {sorted_stages}") + return sorted_stages def check_jobs_status(self, job_ids: List[str]) -> Dict[str, Any]: """Checks the status of the specified BigQuery job IDs. 
@@ -198,6 +185,56 @@ def check_jobs_status(self, job_ids: List[str]) -> Dict[str, Any]: logging.error(f"Failed to check jobs status: {e}") raise e + def _trigger_place(self, config: Dict[str, Any], applicable_imports: List[str]) -> List[Any]: + """Triggers place-level rollup aggregations.""" + source_type = config["source_type"] + destination_type = config["destination_type"] + logging.info( + f" -> Place Rollup: {source_type} -> {destination_type} for imports {applicable_imports}" + ) + generator = PlaceAggregationGenerator(self.executor, self.is_base_dc) + job = generator.aggregate_places( + import_names=applicable_imports, + source_type=source_type, + destination_type=destination_type, + allow_multiple_to_places=config.get("allow_multiple_to_places", False) + ) + return [job] if job else [] + + def _trigger_stat_var(self, config: Dict[str, Any], applicable_imports: List[str]) -> List[Any]: + """Triggers statistical variable aggregations.""" + ancestor_sv = config["ancestor_sv_id"] + source_svs = config["source_sv_ids"] + logging.info( + f" -> Stat Var Aggregation: ancestor '{ancestor_sv}' (sources: {source_svs}) for imports {applicable_imports}" + ) + generator = StatVarAggregator(self.executor, self.is_base_dc) + return generator.aggregate_stat_vars( + ancestor_sv=ancestor_sv, + source_svs=source_svs, + import_names=applicable_imports, + output_import_name=config.get("output_import_name"), + skip_all_sources_present_check=config.get("skip_all_sources_present_check", False) + ) + + def _trigger_linked_edges(self, config: Dict[str, Any], applicable_imports: List[str]) -> List[Any]: + """Triggers linked edge aggregations.""" + logging.info(f" -> Linked Edges Aggregation for imports {applicable_imports}") + generator = LinkedEdgeGenerator(self.executor, self.is_base_dc) + return generator.run_all(applicable_imports) + + def _trigger_provenance_summary(self, config: Dict[str, Any], applicable_imports: List[str]) -> List[Any]: + """Triggers provenance summary 
aggregations.""" + logging.info(f" -> Provenance Summary Aggregation for imports {applicable_imports}") + generator = ProvenanceSummaryGenerator(self.executor, self.is_base_dc) + return generator.run_all(applicable_imports) + + def _trigger_stat_var_groups(self, config: Dict[str, Any], applicable_imports: List[str]) -> List[Any]: + """Triggers statistical variable group aggregations.""" + logging.info(f" -> Stat Var Groups Aggregation for imports {applicable_imports}") + generator = StatVarGroupGenerator(self.executor, self.is_base_dc) + return generator.run_all(applicable_imports) + def _get_applicable_imports(self, config: Dict[str, Any], active_imports: List[str]) -> List[str]: """Determines which active imports apply to this aggregation config.""" configured_imports = config["imports"] From dec90244b5768803c622175dc22d6b5fdb61871e Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 23:07:16 +0530 Subject: [PATCH 13/33] docs(aggregation): add module user guide README --- .../ingestion-helper/aggregation/README.md | 147 ++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 pipeline/workflow/ingestion-helper/aggregation/README.md diff --git a/pipeline/workflow/ingestion-helper/aggregation/README.md b/pipeline/workflow/ingestion-helper/aggregation/README.md new file mode 100644 index 000000000..ec61e5a40 --- /dev/null +++ b/pipeline/workflow/ingestion-helper/aggregation/README.md @@ -0,0 +1,147 @@ +# Aggregations + +This module orchestrates the execution of Data Commons aggregations through BigQuery Federation. The aggregations include place rollups, statistical variable aggregations, linked edges, and metadata summaries. + +## Core Concepts + +* **Sequential Stages**: Aggregations are executed sequentially by their `stage` number (e.g., Stage 1 steps are guaranteed to complete before Stage 2 steps begin). This is useful when later steps depend on the output of earlier ones. 
+* **Parallel Execution**: All aggregation steps configured in the same stage are executed in parallel to maximize performance. + +--- + +## Configuration Guide (`aggregation.yaml`) + +The entire aggregation pipeline is configured via `aggregation.yaml`. This file defines which aggregations run, what their dependencies are, and in what order they execute. + +### Common Configuration Fields +Every step in the configuration supports these common fields: +* `type` (string, Required): The type of aggregation step to run. +* `stage` (integer, Optional, default: 1): The sequential stage number. Steps in lower stages are guaranteed to finish before higher stages start. +* `imports` (list of strings, Required): The list of import names this step applies to. Use `["*"]` (wildcard) to apply the step to **all** imports in the current run. +* `disabled` (boolean, Optional, default: false): Set to `true` to temporarily disable a step without deleting it. + +--- + +### Supported Aggregation Types + +#### 1. Place (`place`) +Aggregates and rolls up statistical data from a smaller place type (source) to a larger place type (destination). +* **Fields**: + * `source_type` (string, Required): The source place type (e.g., `County`). + * `destination_type` (string, Required): The destination place type (e.g., `State`). + * `allow_multiple_to_places` (boolean, Optional, default: false): Allows mapping to multiple parent places if true. +* **Example**: + ```yaml + - type: place + stage: 1 + imports: ["USFed_Census"] + source_type: County + destination_type: State + ``` + +#### 2. Statistical Variable Aggregation (`stat_var`) +Aggregates raw statistical variables into a summarized ancestor variable (e.g., summing up individual age group counts to get a total population count). +* **Fields**: + * `ancestor_sv_id` (string, Required): The ID of the parent/summary statistical variable (e.g., `Count_Person`). 
+ * `source_sv_ids` (list of strings, Required): The list of individual statistical variables to sum up. + * `output_import_name` (string, Optional): Custom import name to write output under. + * `skip_all_sources_present_check` (boolean, Optional, default: false): If true, aggregates even if some source variables are missing. +* **Example**: + ```yaml + - type: stat_var + stage: 2 + imports: ["USFed_Census"] + ancestor_sv_id: Count_Person + source_sv_ids: + - Count_Person_Male + - Count_Person_Female + ``` + +#### 3. Linked Edges (`linked_edges`) +Constructs and aggregates structural graph links (edges) between nodes in the Data Commons graph. +* **Example**: + ```yaml + - type: linked_edges + stage: 1 + imports: ["*"] # Runs for all imports + ``` + +#### 4. Provenance Summary (`provenance_summary`) +Generates metadata and provenance summaries for all aggregated statistical observations, establishing data lineage. +* **Example**: + ```yaml + - type: provenance_summary + stage: 3 + imports: ["USFed_Census"] + ``` + +#### 5. Statistical Variable Groups (`stat_var_groups`) +Aggregates and structures statistical variables into hierarchical groups for display in the Data Commons UI. +* **Example**: + ```yaml + - type: stat_var_groups + stage: 3 + imports: ["*"] + ``` + +--- + +### Example `aggregation.yaml` + +This example demonstrates a typical multi-stage aggregation workflow. 
+ +```yaml +# aggregation.yaml +aggregations: + # Stage 1: Parallel Place Rollups and Linked Edges + - type: linked_edges + stage: 1 + imports: ["*"] + + - type: place + stage: 1 + imports: ["USFed_Census"] + source_type: County + destination_type: State + + # Stage 2: Parallel Stat Var Aggregations (Depends on Stage 1 completing) + - type: stat_var + stage: 2 + imports: ["USFed_Census"] + ancestor_sv_id: Count_Person + source_sv_ids: + - Count_Person_Male + - Count_Person_Female + + # Stage 3: Metadata and UI Summaries (Depends on Stage 2 completing) + - type: provenance_summary + stage: 3 + imports: ["USFed_Census"] + + - type: stat_var_groups + stage: 3 + imports: ["*"] +``` + +--- + +## Local Configuration Validation + +The orchestrator strictly validates the `aggregation.yaml` file on startup against a strict JSON Schema (`schema.json`). If there is any syntax error, type mismatch, or missing required field, the service will fail to start. + +### Running the Validator Locally +You can validate your `aggregation.yaml` file locally using the built-in CLI tool before committing or deploying changes. + +1. **Navigate to the ingestion-helper root**: + ```bash + cd pipeline/workflow/ingestion-helper + ``` +2. **Run the validator**: + ```bash + python3 -m aggregation.validator --config ../aggregation.yaml + + # sample output... + # Validating 'aggregation.yaml' against 'schema.json'... + # [SUCCESS] Configuration is valid! + # Parsed 5 aggregation steps successfully. 
+ ``` From fb4be20f420cb26505eb6bfd12155d29a341b59c Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 23:32:51 +0530 Subject: [PATCH 14/33] test(aggregation): refactor validator and orchestrator test suites --- .../aggregation/orchestrator_test.py | 247 ++++---- .../aggregation/validator_test.py | 567 +++++++++--------- 2 files changed, 416 insertions(+), 398 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py index 2f9720995..435d5a658 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py @@ -12,121 +12,122 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Unit tests for the AggregationOrchestrator class using real temporary files.""" + import json import os import sys +import tempfile +import textwrap import unittest -from unittest.mock import MagicMock, patch, mock_open +from unittest.mock import MagicMock, patch sys.path.append(os.path.dirname(os.path.dirname(__file__))) from aggregation import AggregationOrchestrator - -# Sample valid YAML config for testing -VALID_CONFIG_YAML = """ -aggregations: - - type: linked_edges - imports: ["*"] - stage: 1 - - - type: place - source_type: County - destination_type: State - allow_multiple_to_places: false - imports: ["USFed_Census"] - stage: 1 - - - type: place - source_type: State - destination_type: Country - imports: ["*"] - stage: 2 - disabled: true - - - type: stat_var - ancestor_sv_id: Count_Person - source_sv_ids: ["Count_Person_Male", "Count_Person_Female"] - skip_all_sources_present_check: true - imports: ["USFed_Census"] - stage: 2 -""" +VALID_CONFIG_YAML = textwrap.dedent("""\ + aggregations: + - type: linked_edges + imports: ["*"] + stage: 1 + + - type: place + source_type: County + destination_type: State + 
allow_multiple_to_places: false + imports: ["USFed_Census"] + stage: 1 + + - type: place + source_type: State + destination_type: Country + imports: ["*"] + stage: 2 + disabled: true + + - type: stat_var + ancestor_sv_id: Count_Person + source_sv_ids: ["Count_Person_Male", "Count_Person_Female"] + skip_all_sources_present_check: true + imports: ["USFed_Census"] + stage: 2 +""") @patch('aggregation.orchestrator.BigQueryExecutor') -@patch('aggregation.orchestrator.PlaceAggregationGenerator') -@patch('aggregation.orchestrator.StatVarAggregator') -@patch('aggregation.orchestrator.LinkedEdgeGenerator') -@patch('aggregation.orchestrator.ProvenanceSummaryGenerator') -@patch('aggregation.orchestrator.StatVarGroupGenerator') -class TestAggregationOrchestrator(unittest.TestCase): +class TestOrchestratorScanning(unittest.TestCase): + """Tests the stage scanning and active stage resolution methods.""" def setUp(self): - # Load the actual schema for validation tests (now inside aggregation/) - schema_path = os.path.join(os.path.dirname(__file__), "schema.json") - with open(schema_path, "r") as f: - self.schema_json = json.load(f) + self.tmpdir = tempfile.TemporaryDirectory() + config_path = os.path.join(self.tmpdir.name, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(VALID_CONFIG_YAML) + + self.orchestrator = AggregationOrchestrator( + connection_id="conn", + project_id="proj", + instance_id="inst", + database_id="db", + config_file_path=config_path + ) - def _get_mock_open(self, yaml_content): - """Helper to mock open() calls for both the config YAML and the schema JSON.""" - def side_effect(path, *args, **kwargs): - if "schema.json" in path: - return mock_open(read_data=json.dumps(self.schema_json))().__enter__() - elif "aggregation.yaml" in path: - return mock_open(read_data=yaml_content)().__enter__() - raise FileNotFoundError(f"Mock open not configured for: {path}") - return side_effect + def tearDown(self): + self.tmpdir.cleanup() + def 
test_has_stage(self, mock_executor): + """Tests the has_stage method for active, disabled, and non-matching stages.""" + self.assertTrue(self.orchestrator.has_stage(1, ["AnyImport"])) + self.assertTrue(self.orchestrator.has_stage(1, ["USFed_Census"])) + self.assertFalse(self.orchestrator.has_stage(2, ["OtherImport"])) + self.assertTrue(self.orchestrator.has_stage(2, ["USFed_Census"])) - @patch('builtins.open') - def test_has_stage(self, mock_file_open, *mocks): - """Tests the has_stage method for active, disabled, and non-matching stages.""" - mock_file_open.side_effect = self._get_mock_open(VALID_CONFIG_YAML) - utils = AggregationOrchestrator(connection_id="conn", project_id="proj", instance_id="inst", database_id="db") + self.assertFalse(self.orchestrator.has_stage(3, ["USFed_Census"])) - # Stage 1: Has linked_edges (wildcard) and place (USFed_Census) - self.assertTrue(utils.has_stage(1, ["AnyImport"])) - self.assertTrue(utils.has_stage(1, ["USFed_Census"])) + def test_get_active_stages(self, mock_executor): + """Tests that get_active_stages correctly extracts, filters, and sorts active stages.""" + stages = self.orchestrator.get_active_stages(active_imports=["USFed_Census"]) + self.assertEqual(stages, [1, 2]) - # Stage 2: Has place (disabled: true) and stat_var (USFed_Census) - # If active import is "OtherImport", Stage 2 has no active aggregations (stat_var doesn't match, place is disabled) - self.assertFalse(utils.has_stage(2, ["OtherImport"])) - # If active import is "USFed_Census", Stage 2 has stat_var active - self.assertTrue(utils.has_stage(2, ["USFed_Census"])) + stages = self.orchestrator.get_active_stages(active_imports=["OtherImport"]) + self.assertEqual(stages, [1]) - # Stage 3: Does not exist in config - self.assertFalse(utils.has_stage(3, ["USFed_Census"])) - @patch('builtins.open') - def test_get_active_stages(self, mock_file_open, *mocks): - """Tests that get_active_stages correctly extracts, filters, and sorts active stages.""" - 
mock_file_open.side_effect = self._get_mock_open(VALID_CONFIG_YAML) +@patch('aggregation.orchestrator.BigQueryExecutor') +@patch('aggregation.orchestrator.PlaceAggregationGenerator') +@patch('aggregation.orchestrator.StatVarAggregator') +@patch('aggregation.orchestrator.LinkedEdgeGenerator') +@patch('aggregation.orchestrator.ProvenanceSummaryGenerator') +@patch('aggregation.orchestrator.StatVarGroupGenerator') +class TestOrchestratorExecution(unittest.TestCase): + """Tests stage execution, verifying parallel job submission and routing. - utils = AggregationOrchestrator(connection_id="conn", project_id="proj", instance_id="inst", database_id="db") + These tests execute stages, so they mock the executor and all five generators + to verify correct parameters are passed and jobs are collected. + """ - # 1. For active import "USFed_Census": - # Stage 1 (linked_edges, place) and Stage 2 (stat_var) have active steps. - # The place rollup in Stage 2 is disabled, but the stat_var step is enabled and active. - # Therefore, active stages should be [1, 2]. - stages = utils.get_active_stages(active_imports=["USFed_Census"]) - self.assertEqual(stages, [1, 2]) + def setUp(self): + self.tmpdir = tempfile.TemporaryDirectory() + config_path = os.path.join(self.tmpdir.name, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(VALID_CONFIG_YAML) + + self.orchestrator = AggregationOrchestrator( + connection_id="conn", + project_id="proj", + instance_id="inst", + database_id="db", + config_file_path=config_path + ) - # 2. For active import "OtherImport": - # Stage 1 (linked_edges) matches via wildcard. - # Stage 2 (place rollup is disabled, stat_var does not match "OtherImport"). - # Therefore, only Stage 1 is active. Active stages should be [1]. 
- stages = utils.get_active_stages(active_imports=["OtherImport"]) - self.assertEqual(stages, [1]) + def tearDown(self): + self.tmpdir.cleanup() - @patch('builtins.open') - def test_execute_stage_1(self, mock_file_open, - mock_svg_gen, mock_prov_gen, mock_edge_gen, + def test_execute_stage_1(self, mock_svg_gen, mock_prov_gen, mock_edge_gen, mock_sv_agg, mock_place_gen, mock_executor): """Tests executing Stage 1, verifying parallel job submission and wildcard resolution.""" - mock_file_open.side_effect = self._get_mock_open(VALID_CONFIG_YAML) - - # Setup generator mocks to return mock jobs mock_job1 = MagicMock() mock_job1.job_id = "job-edge-1" mock_edge_gen.return_value.run_all.return_value = [mock_job1] @@ -135,20 +136,14 @@ def test_execute_stage_1(self, mock_file_open, mock_job2.job_id = "job-place-1" mock_place_gen.return_value.aggregate_places.return_value = mock_job2 - utils = AggregationOrchestrator(connection_id="conn", project_id="proj", instance_id="inst", database_id="db") + job_ids = self.orchestrator.execute_stage(stage_num=1, active_imports=["USFed_Census"]) - # Execute Stage 1 for active imports: ["USFed_Census"] - job_ids = utils.execute_stage(stage_num=1, active_imports=["USFed_Census"]) - - # Assertions self.assertEqual(len(job_ids), 2) self.assertIn("job-edge-1", job_ids) self.assertIn("job-place-1", job_ids) - # Verify linked_edges ran for ALL active imports (wildcard '*') mock_edge_gen.return_value.run_all.assert_called_once_with(["USFed_Census"]) - # Verify place rollup ran for matching import "USFed_Census" mock_place_gen.return_value.aggregate_places.assert_called_once_with( import_names=["USFed_Census"], source_type="County", @@ -156,33 +151,22 @@ def test_execute_stage_1(self, mock_file_open, allow_multiple_to_places=False ) - @patch('builtins.open') - def test_execute_stage_2_with_disabled_and_filtering(self, mock_file_open, - mock_svg_gen, mock_prov_gen, mock_edge_gen, + def test_execute_stage_2_with_disabled_and_filtering(self, 
mock_svg_gen, mock_prov_gen, mock_edge_gen, mock_sv_agg, mock_place_gen, mock_executor): """Tests Stage 2, verifying that disabled steps are skipped and non-matching imports are filtered.""" - mock_file_open.side_effect = self._get_mock_open(VALID_CONFIG_YAML) - - # Setup mock for Stage 2 stat_var job mock_job_sv = MagicMock() mock_job_sv.job_id = "job-sv-1" mock_sv_agg.return_value.aggregate_stat_vars.return_value = [mock_job_sv] - utils = AggregationOrchestrator(connection_id="conn", project_id="proj", instance_id="inst", database_id="db") - - # 1. Run for an import that does NOT match stat_var ("OtherImport") - # Since the place rollup in Stage 2 is disabled, no jobs should run at all. - job_ids = utils.execute_stage(stage_num=2, active_imports=["OtherImport"]) + job_ids = self.orchestrator.execute_stage(stage_num=2, active_imports=["OtherImport"]) self.assertEqual(len(job_ids), 0) mock_place_gen.return_value.aggregate_places.assert_not_called() mock_sv_agg.return_value.aggregate_stat_vars.assert_not_called() - # 2. Run for matching import "USFed_Census" - # The disabled place rollup should still be skipped, but the stat_var aggregation should execute. 
- job_ids = utils.execute_stage(stage_num=2, active_imports=["USFed_Census"]) + job_ids = self.orchestrator.execute_stage(stage_num=2, active_imports=["USFed_Census"]) self.assertEqual(job_ids, ["job-sv-1"]) - mock_place_gen.return_value.aggregate_places.assert_not_called() # Still skipped because disabled: true + mock_place_gen.return_value.aggregate_places.assert_not_called() mock_sv_agg.return_value.aggregate_stat_vars.assert_called_once_with( ancestor_sv="Count_Person", source_svs=["Count_Person_Male", "Count_Person_Female"], @@ -191,26 +175,33 @@ def test_execute_stage_2_with_disabled_and_filtering(self, mock_file_open, skip_all_sources_present_check=True ) - @patch('builtins.open') - def test_execute_stage_unsupported_type(self, mock_file_open, *mocks): + def test_execute_stage_unsupported_type(self, *mocks): """Tests that an unsupported aggregation step type raises ValueError.""" - # Use 'entity' which is valid in schema but unimplemented in orchestrator - unimplemented_config = """ - aggregations: - - type: entity - entity_types: ["MortalityEvent"] - location_props: ["location"] - imports: ["*"] - stage: 1 - """ - mock_file_open.side_effect = self._get_mock_open(unimplemented_config) - - utils = AggregationOrchestrator(connection_id="conn", project_id="proj", instance_id="inst", database_id="db") - - # Running Stage 1 should raise ValueError due to unimplemented "entity" type - with self.assertRaises(ValueError) as ctx: - utils.execute_stage(stage_num=1, active_imports=["USFed_Census"]) - self.assertIn("Unsupported or unimplemented aggregation step type: entity", str(ctx.exception)) + unimplemented_config = textwrap.dedent("""\ + aggregations: + - type: entity + entity_types: ["MortalityEvent"] + location_props: ["location"] + imports: ["*"] + stage: 1 + """) + + with tempfile.TemporaryDirectory() as local_tmpdir: + local_config_path = os.path.join(local_tmpdir, "aggregation.yaml") + with open(local_config_path, "w") as f: + f.write(unimplemented_config) + 
+ local_orchestrator = AggregationOrchestrator( + connection_id="conn", + project_id="proj", + instance_id="inst", + database_id="db", + config_file_path=local_config_path + ) + + with self.assertRaises(ValueError) as ctx: + local_orchestrator.execute_stage(stage_num=1, active_imports=["USFed_Census"]) + self.assertIn("Unsupported or unimplemented aggregation step type: entity", str(ctx.exception)) if __name__ == '__main__': diff --git a/pipeline/workflow/ingestion-helper/aggregation/validator_test.py b/pipeline/workflow/ingestion-helper/aggregation/validator_test.py index 34978f58d..44cd56d82 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/validator_test.py +++ b/pipeline/workflow/ingestion-helper/aggregation/validator_test.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Unit tests for the aggregation configuration validator.""" +"""Unit tests for the aggregation configuration validator using real temporary files.""" -import json import os import sys +import tempfile +import textwrap import unittest -from unittest.mock import mock_open, patch import jsonschema import yaml @@ -26,283 +26,310 @@ from aggregation import validate_config -# ============================================================================= -# Mock YAML Configurations for Testing -# ============================================================================= - -# 1. 
A perfectly valid config containing all possible types and fields -VALID_ALL_TYPES_YAML = """ -aggregations: - - type: linked_edges - imports: ["*"] - stage: 1 - disabled: false - - - type: place - source_type: County - destination_type: State - allow_multiple_to_places: true - imports: ["ImportA", "ImportB"] - stage: 2 - - - type: stat_var - ancestor_sv_id: Count_Person - source_sv_ids: ["Count_Person_Male", "Count_Person_Female"] - skip_all_sources_present_check: true - output_import_name: "Aggregated_Pop" - imports: ["ImportC"] - stage: 3 - - - type: entity - entity_types: ["MortalityEvent"] - location_props: ["location"] - date_prop: "date" - agg_date_formats: ["%Y"] - imports: ["ImportD"] - - - type: provenance_summary - imports: ["*"] - - - type: stat_var_groups - imports: ["*"] -""" - -# 2. Invalid: Missing required type field -INVALID_MISSING_TYPE_YAML = """ -aggregations: - - imports: ["*"] -""" - -# 3. Invalid: Missing required imports field -INVALID_MISSING_IMPORTS_YAML = """ -aggregations: - - type: linked_edges -""" - -# 4. Invalid: imports is a string instead of an array -INVALID_IMPORTS_TYPE_YAML = """ -aggregations: - - type: linked_edges - imports: "*" -""" - -# 5. Invalid: stage is a string instead of an integer -INVALID_STAGE_TYPE_YAML = """ -aggregations: - - type: linked_edges - imports: ["*"] - stage: "first" -""" - -# 6. Invalid: stage is 0 (minimum is 1) -INVALID_STAGE_VALUE_YAML = """ -aggregations: - - type: linked_edges - imports: ["*"] - stage: 0 -""" - -# 7. Invalid: empty imports list (minItems: 1) -INVALID_EMPTY_IMPORTS_YAML = """ -aggregations: - - type: linked_edges - imports: [] -""" - -# 8. Invalid place rollup: missing required source_type -INVALID_PLACE_MISSING_FIELD_YAML = """ -aggregations: - - type: place - destination_type: State - imports: ["*"] -""" - -# 9. 
Invalid stat var: missing required source_sv_ids -INVALID_STAT_VAR_MISSING_FIELD_YAML = """ -aggregations: - - type: stat_var - ancestor_sv_id: Count_Person - imports: ["*"] -""" - -# 10. Invalid stat var: empty source_sv_ids list -INVALID_STAT_VAR_EMPTY_SVS_YAML = """ -aggregations: - - type: stat_var - ancestor_sv_id: Count_Person - source_sv_ids: [] - imports: ["*"] -""" - -# 11. Invalid entity: missing required location_props -INVALID_ENTITY_MISSING_FIELD_YAML = """ -aggregations: - - type: entity - entity_types: ["Event"] - imports: ["*"] -""" - -# 12. Malformed YAML (Indentation error) -MALFORMED_YAML = """ -aggregations: - - type: linked_edges - imports: - - "*" -""" - - -class TestConfigValidator(unittest.TestCase): + +class TestValidatorSuccess(unittest.TestCase): + """Verifies successful validation paths for valid configurations.""" def setUp(self): # Load the actual schema from the workspace to ensure tests remain realistic self.schema_path = os.path.join(os.path.dirname(__file__), "schema.json") - with open(self.schema_path, "r") as f: - self.schema_json = json.load(f) - - def _get_mock_open(self, yaml_content): - """Helper to mock open() for both the schema JSON and the target YAML.""" - def side_effect(path, *args, **kwargs): - if "schema.json" in path: - return mock_open(read_data=json.dumps(self.schema_json))().__enter__() - else: - return mock_open(read_data=yaml_content)().__enter__() - return side_effect - - # ============================================================================= - # Success Test Cases - # ============================================================================= - - @patch('builtins.open') - def test_validate_config_success_all_types(self, mock_file_open): - """Verifies that a comprehensive, valid config with all types passes validation.""" - mock_file_open.side_effect = self._get_mock_open(VALID_ALL_TYPES_YAML) - - aggregations = validate_config("aggregation.yaml", self.schema_path) - - 
self.assertEqual(len(aggregations), 6) - self.assertEqual(aggregations[0]["type"], "linked_edges") - self.assertEqual(aggregations[1]["source_type"], "County") - self.assertEqual(aggregations[2]["ancestor_sv_id"], "Count_Person") - self.assertEqual(aggregations[3]["entity_types"], ["MortalityEvent"]) - - # ============================================================================= - # Schema Constraint Test Cases - # ============================================================================= - - @patch('builtins.open') - def test_validate_config_missing_type(self, mock_file_open): - mock_file_open.side_effect = self._get_mock_open(INVALID_MISSING_TYPE_YAML) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config("aggregation.yaml", self.schema_path) - self.assertIn("'type' is a required property", ctx.exception.message) - - @patch('builtins.open') - def test_validate_config_missing_imports(self, mock_file_open): - mock_file_open.side_effect = self._get_mock_open(INVALID_MISSING_IMPORTS_YAML) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config("aggregation.yaml", self.schema_path) - self.assertIn("'imports' is a required property", ctx.exception.message) - - @patch('builtins.open') - def test_validate_config_invalid_imports_type(self, mock_file_open): - mock_file_open.side_effect = self._get_mock_open(INVALID_IMPORTS_TYPE_YAML) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config("aggregation.yaml", self.schema_path) - self.assertIn("is not of type 'array'", ctx.exception.message) - - @patch('builtins.open') - def test_validate_config_invalid_stage_type(self, mock_file_open): - mock_file_open.side_effect = self._get_mock_open(INVALID_STAGE_TYPE_YAML) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config("aggregation.yaml", self.schema_path) - self.assertIn("is not of type 'integer'", ctx.exception.message) - - 
@patch('builtins.open') - def test_validate_config_invalid_stage_value(self, mock_file_open): - mock_file_open.side_effect = self._get_mock_open(INVALID_STAGE_VALUE_YAML) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config("aggregation.yaml", self.schema_path) - self.assertIn("is less than the minimum of 1", ctx.exception.message) - - @patch('builtins.open') - def test_validate_config_empty_imports_list(self, mock_file_open): - mock_file_open.side_effect = self._get_mock_open(INVALID_EMPTY_IMPORTS_YAML) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config("aggregation.yaml", self.schema_path) - self.assertIn("should be non-empty", ctx.exception.message) - - # ============================================================================= - # Conditional Dependency Test Cases (OneOf/Dependencies) - # ============================================================================= - - @patch('builtins.open') - def test_validate_config_place_missing_field(self, mock_file_open): - mock_file_open.side_effect = self._get_mock_open(INVALID_PLACE_MISSING_FIELD_YAML) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config("aggregation.yaml", self.schema_path) - # Validation fails because place rollup requires source_type - self.assertIn("is not valid under any of the given schemas", ctx.exception.message) - - @patch('builtins.open') - def test_validate_config_stat_var_missing_field(self, mock_file_open): - mock_file_open.side_effect = self._get_mock_open(INVALID_STAT_VAR_MISSING_FIELD_YAML) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config("aggregation.yaml", self.schema_path) - self.assertIn("is not valid under any of the given schemas", ctx.exception.message) - - @patch('builtins.open') - def test_validate_config_stat_var_empty_source_svs(self, mock_file_open): - mock_file_open.side_effect = 
self._get_mock_open(INVALID_STAT_VAR_EMPTY_SVS_YAML) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config("aggregation.yaml", self.schema_path) - self.assertIn("should be non-empty", ctx.exception.message) - - @patch('builtins.open') - def test_validate_config_entity_missing_field(self, mock_file_open): - mock_file_open.side_effect = self._get_mock_open(INVALID_ENTITY_MISSING_FIELD_YAML) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config("aggregation.yaml", self.schema_path) - self.assertIn("is not valid under any of the given schemas", ctx.exception.message) - - # ============================================================================= - # File System & Syntax Error Test Cases - # ============================================================================= - - @patch('builtins.open') - def test_validate_config_yaml_syntax_error(self, mock_file_open): - mock_file_open.side_effect = self._get_mock_open(MALFORMED_YAML) - with self.assertRaises(yaml.YAMLError): - validate_config("aggregation.yaml", self.schema_path) - def test_validate_config_missing_config_file(self): - with self.assertRaises(FileNotFoundError): - validate_config("non_existent_config.yaml", "schema.json") + def test_validate_config_success_all_types(self): + """Verifies that a comprehensive, valid config with all types passes validation.""" + valid_all_types_yaml = textwrap.dedent("""\ + aggregations: + - type: linked_edges + imports: ["*"] + stage: 1 + disabled: false + + - type: place + source_type: County + destination_type: State + allow_multiple_to_places: true + imports: ["ImportA", "ImportB"] + stage: 2 + + - type: stat_var + ancestor_sv_id: Count_Person + source_sv_ids: ["Count_Person_Male", "Count_Person_Female"] + skip_all_sources_present_check: true + output_import_name: "Aggregated_Pop" + imports: ["ImportC"] + stage: 3 + + - type: entity + entity_types: ["MortalityEvent"] + location_props: ["location"] + 
date_prop: "date" + agg_date_formats: ["%Y"] + imports: ["ImportD"] + + - type: provenance_summary + imports: ["*"] + + - type: stat_var_groups + imports: ["*"] + """) + + with tempfile.TemporaryDirectory() as tmpdir: + config_path = os.path.join(tmpdir, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(valid_all_types_yaml) + + aggregations = validate_config(config_path, self.schema_path) + + self.assertEqual(len(aggregations), 6) + self.assertEqual(aggregations[0]["type"], "linked_edges") + self.assertEqual(aggregations[1]["source_type"], "County") + self.assertEqual(aggregations[2]["ancestor_sv_id"], "Count_Person") + self.assertEqual(aggregations[3]["entity_types"], ["MortalityEvent"]) + + +class TestValidatorSchemaConstraints(unittest.TestCase): + """Verifies core schema constraint failures (types, required fields, values).""" - def test_validate_config_missing_schema_file(self): - # We patch os.path.exists to simulate config existing but schema missing - with patch('os.path.exists', side_effect=lambda path: "aggregation.yaml" in path): - with self.assertRaises(FileNotFoundError): - validate_config("aggregation.yaml", "non_existent_schema.json") + def setUp(self): + self.schema_path = os.path.join(os.path.dirname(__file__), "schema.json") - @patch('builtins.open') - def test_validate_config_missing_aggregations_key(self, mock_file_open): + def test_validate_config_missing_type(self): + """Verifies that missing the required 'type' field raises ValidationError.""" + invalid_missing_type_yaml = textwrap.dedent("""\ + aggregations: + - imports: ["*"] + """) + with tempfile.TemporaryDirectory() as tmpdir: + config_path = os.path.join(tmpdir, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(invalid_missing_type_yaml) + + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(config_path, self.schema_path) + self.assertIn("'type' is a required property", ctx.exception.message) + + def 
test_validate_config_missing_imports(self): + """Verifies that missing the required 'imports' field raises ValidationError.""" + invalid_missing_imports_yaml = textwrap.dedent("""\ + aggregations: + - type: linked_edges + """) + with tempfile.TemporaryDirectory() as tmpdir: + config_path = os.path.join(tmpdir, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(invalid_missing_imports_yaml) + + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(config_path, self.schema_path) + self.assertIn("'imports' is a required property", ctx.exception.message) + + def test_validate_config_invalid_imports_type(self): + """Verifies that imports field being a string instead of an array raises ValidationError.""" + invalid_imports_type_yaml = textwrap.dedent("""\ + aggregations: + - type: linked_edges + imports: "*" + """) + with tempfile.TemporaryDirectory() as tmpdir: + config_path = os.path.join(tmpdir, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(invalid_imports_type_yaml) + + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(config_path, self.schema_path) + self.assertIn("is not of type 'array'", ctx.exception.message) + + def test_validate_config_invalid_stage_type(self): + """Verifies that stage field being a string instead of an integer raises ValidationError.""" + invalid_stage_type_yaml = textwrap.dedent("""\ + aggregations: + - type: linked_edges + imports: ["*"] + stage: "first" + """) + with tempfile.TemporaryDirectory() as tmpdir: + config_path = os.path.join(tmpdir, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(invalid_stage_type_yaml) + + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(config_path, self.schema_path) + self.assertIn("is not of type 'integer'", ctx.exception.message) + + def test_validate_config_invalid_stage_value(self): + """Verifies that a stage value of 0 (minimum is 1) raises 
ValidationError.""" + invalid_stage_value_yaml = textwrap.dedent("""\ + aggregations: + - type: linked_edges + imports: ["*"] + stage: 0 + """) + with tempfile.TemporaryDirectory() as tmpdir: + config_path = os.path.join(tmpdir, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(invalid_stage_value_yaml) + + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(config_path, self.schema_path) + self.assertIn("is less than the minimum of 1", ctx.exception.message) + + def test_validate_config_empty_imports_list(self): + """Verifies that an empty imports list raises ValidationError.""" + invalid_empty_imports_yaml = textwrap.dedent("""\ + aggregations: + - type: linked_edges + imports: [] + """) + with tempfile.TemporaryDirectory() as tmpdir: + config_path = os.path.join(tmpdir, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(invalid_empty_imports_yaml) + + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(config_path, self.schema_path) + self.assertIn("should be non-empty", ctx.exception.message) + + def test_validate_config_missing_aggregations_key(self): """Verifies that missing the required 'aggregations' root key raises ValidationError.""" - missing_aggregations_yaml = """ - some_other_key: [] - """ - mock_file_open.side_effect = self._get_mock_open(missing_aggregations_yaml) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config("aggregation.yaml", self.schema_path) - self.assertIn("'aggregations' is a required property", ctx.exception.message) - - @patch('builtins.open') - def test_validate_config_empty_file(self, mock_file_open): + missing_aggregations_yaml = textwrap.dedent("""\ + some_other_key: [] + """) + with tempfile.TemporaryDirectory() as tmpdir: + config_path = os.path.join(tmpdir, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(missing_aggregations_yaml) + + with 
self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(config_path, self.schema_path) + self.assertIn("'aggregations' is a required property", ctx.exception.message) + + def test_validate_config_empty_file(self): """Verifies that a completely empty configuration file raises ValidationError.""" empty_yaml = "" - mock_file_open.side_effect = self._get_mock_open(empty_yaml) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config("aggregation.yaml", self.schema_path) - self.assertIn("'aggregations' is a required property", ctx.exception.message) + with tempfile.TemporaryDirectory() as tmpdir: + config_path = os.path.join(tmpdir, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(empty_yaml) + + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(config_path, self.schema_path) + self.assertIn("'aggregations' is a required property", ctx.exception.message) + + +class TestValidatorConditionalDependencies(unittest.TestCase): + """Verifies type-specific conditional dependencies (OneOf / dependencies).""" + + def setUp(self): + self.schema_path = os.path.join(os.path.dirname(__file__), "schema.json") + + def test_validate_config_place_missing_field(self): + """Verifies that a place step missing the required 'source_type' raises ValidationError.""" + invalid_place_missing_field_yaml = textwrap.dedent("""\ + aggregations: + - type: place + destination_type: State + imports: ["*"] + """) + with tempfile.TemporaryDirectory() as tmpdir: + config_path = os.path.join(tmpdir, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(invalid_place_missing_field_yaml) + + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(config_path, self.schema_path) + self.assertIn("is not valid under any of the given schemas", ctx.exception.message) + + def test_validate_config_stat_var_missing_field(self): + """Verifies that a stat_var step 
missing the required 'source_sv_ids' raises ValidationError.""" + invalid_stat_var_missing_field_yaml = textwrap.dedent("""\ + aggregations: + - type: stat_var + ancestor_sv_id: Count_Person + imports: ["*"] + """) + with tempfile.TemporaryDirectory() as tmpdir: + config_path = os.path.join(tmpdir, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(invalid_stat_var_missing_field_yaml) + + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(config_path, self.schema_path) + self.assertIn("is not valid under any of the given schemas", ctx.exception.message) + + def test_validate_config_stat_var_empty_source_svs(self): + """Verifies that a stat_var step with an empty source_sv_ids array raises ValidationError.""" + invalid_stat_var_empty_svs_yaml = textwrap.dedent("""\ + aggregations: + - type: stat_var + ancestor_sv_id: Count_Person + source_sv_ids: [] + imports: ["*"] + """) + with tempfile.TemporaryDirectory() as tmpdir: + config_path = os.path.join(tmpdir, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(invalid_stat_var_empty_svs_yaml) + + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(config_path, self.schema_path) + self.assertIn("should be non-empty", ctx.exception.message) + + def test_validate_config_entity_missing_field(self): + """Verifies that an entity step missing the required 'location_props' raises ValidationError.""" + invalid_entity_missing_field_yaml = textwrap.dedent("""\ + aggregations: + - type: entity + entity_types: ["Event"] + imports: ["*"] + """) + with tempfile.TemporaryDirectory() as tmpdir: + config_path = os.path.join(tmpdir, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(invalid_entity_missing_field_yaml) + + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(config_path, self.schema_path) + self.assertIn("is not valid under any of the given schemas", ctx.exception.message) 
+ + +class TestValidatorErrorsAndFileSystem(unittest.TestCase): + """Verifies file-system issues and non-schema parsing errors (YAML syntax).""" + + def setUp(self): + self.schema_path = os.path.join(os.path.dirname(__file__), "schema.json") + + def test_validate_config_yaml_syntax_error(self): + """Verifies that malformed YAML syntax raises YAMLError.""" + malformed_yaml = textwrap.dedent("""\ + aggregations: + - type: linked_edges + imports: + - "*" + """) + with tempfile.TemporaryDirectory() as tmpdir: + config_path = os.path.join(tmpdir, "aggregation.yaml") + with open(config_path, "w") as f: + f.write(malformed_yaml) + + with self.assertRaises(yaml.YAMLError): + validate_config(config_path, self.schema_path) + + def test_validate_config_missing_config_file(self): + """Verifies that a missing config file path raises FileNotFoundError.""" + with self.assertRaises(FileNotFoundError): + validate_config("non_existent_config.yaml", self.schema_path) + + def test_validate_config_missing_schema_file(self): + """Verifies that a missing schema file path raises FileNotFoundError.""" + with tempfile.TemporaryDirectory() as tmpdir: + config_path = os.path.join(tmpdir, "aggregation.yaml") + # Create a real, valid config file so it exists + with open(config_path, "w") as f: + f.write("aggregations: []") + + # Pass the real config path, but a non-existent schema path + with self.assertRaises(FileNotFoundError) as ctx: + validate_config(config_path, "non_existent_schema.json") + self.assertIn("JSON Schema file not found", str(ctx.exception)) if __name__ == '__main__': From a26d9d4a10d3c71750a2cad6ca0d3bbcac02dbec Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 23:47:02 +0530 Subject: [PATCH 15/33] refactor(aggregation): polish orchestrator logging and flatten unit test suites --- .../aggregation/orchestrator.py | 4 +- .../aggregation/orchestrator_test.py | 2 +- .../aggregation/validator_test.py | 214 +++++++++--------- 3 files changed, 103 insertions(+), 
117 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py index cfb931125..4ef337aa8 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py @@ -48,8 +48,6 @@ def __init__(self, config_file_path: Optional custom path to the aggregation.yaml file. If not specified, defaults to the aggregation.yaml in the parent directory. """ - # Always run asynchronously at the executor level for stages to run in parallel - # We handle sequential blocking between stages at the workflow/router level self.executor = BigQueryExecutor(connection_id=connection_id, project_id=project_id, instance_id=instance_id, @@ -78,7 +76,7 @@ def execute_stage(self, stage_num: int, active_imports: List[str]) -> List[str]: Returns: A list of BigQuery job IDs submitted for this stage. """ - logging.info(f"=== Starting Aggregation Orchestration for Stage {stage_num} ===") + logging.info(f"Starting Aggregation Orchestration for Stage {stage_num}") logging.info(f"Active imports in this run: {active_imports}") jobs = [] diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py index 435d5a658..a0b2627f4 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""Unit tests for the AggregationOrchestrator class using real temporary files.""" +"""Unit tests for the AggregationOrchestrator class.""" import json import os diff --git a/pipeline/workflow/ingestion-helper/aggregation/validator_test.py b/pipeline/workflow/ingestion-helper/aggregation/validator_test.py index 44cd56d82..617e43a97 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/validator_test.py +++ b/pipeline/workflow/ingestion-helper/aggregation/validator_test.py @@ -31,8 +31,12 @@ class TestValidatorSuccess(unittest.TestCase): """Verifies successful validation paths for valid configurations.""" def setUp(self): - # Load the actual schema from the workspace to ensure tests remain realistic self.schema_path = os.path.join(os.path.dirname(__file__), "schema.json") + self.tmpdir = tempfile.TemporaryDirectory() + self.config_path = os.path.join(self.tmpdir.name, "aggregation.yaml") + + def tearDown(self): + self.tmpdir.cleanup() def test_validate_config_success_all_types(self): """Verifies that a comprehensive, valid config with all types passes validation.""" @@ -72,18 +76,16 @@ def test_validate_config_success_all_types(self): imports: ["*"] """) - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "aggregation.yaml") - with open(config_path, "w") as f: - f.write(valid_all_types_yaml) + with open(self.config_path, "w") as f: + f.write(valid_all_types_yaml) - aggregations = validate_config(config_path, self.schema_path) - - self.assertEqual(len(aggregations), 6) - self.assertEqual(aggregations[0]["type"], "linked_edges") - self.assertEqual(aggregations[1]["source_type"], "County") - self.assertEqual(aggregations[2]["ancestor_sv_id"], "Count_Person") - self.assertEqual(aggregations[3]["entity_types"], ["MortalityEvent"]) + aggregations = validate_config(self.config_path, self.schema_path) + + self.assertEqual(len(aggregations), 6) + self.assertEqual(aggregations[0]["type"], "linked_edges") + 
self.assertEqual(aggregations[1]["source_type"], "County") + self.assertEqual(aggregations[2]["ancestor_sv_id"], "Count_Person") + self.assertEqual(aggregations[3]["entity_types"], ["MortalityEvent"]) class TestValidatorSchemaConstraints(unittest.TestCase): @@ -91,6 +93,11 @@ class TestValidatorSchemaConstraints(unittest.TestCase): def setUp(self): self.schema_path = os.path.join(os.path.dirname(__file__), "schema.json") + self.tmpdir = tempfile.TemporaryDirectory() + self.config_path = os.path.join(self.tmpdir.name, "aggregation.yaml") + + def tearDown(self): + self.tmpdir.cleanup() def test_validate_config_missing_type(self): """Verifies that missing the required 'type' field raises ValidationError.""" @@ -98,14 +105,12 @@ def test_validate_config_missing_type(self): aggregations: - imports: ["*"] """) - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "aggregation.yaml") - with open(config_path, "w") as f: - f.write(invalid_missing_type_yaml) + with open(self.config_path, "w") as f: + f.write(invalid_missing_type_yaml) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config(config_path, self.schema_path) - self.assertIn("'type' is a required property", ctx.exception.message) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(self.config_path, self.schema_path) + self.assertIn("'type' is a required property", ctx.exception.message) def test_validate_config_missing_imports(self): """Verifies that missing the required 'imports' field raises ValidationError.""" @@ -113,14 +118,12 @@ def test_validate_config_missing_imports(self): aggregations: - type: linked_edges """) - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "aggregation.yaml") - with open(config_path, "w") as f: - f.write(invalid_missing_imports_yaml) + with open(self.config_path, "w") as f: + f.write(invalid_missing_imports_yaml) - with 
self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config(config_path, self.schema_path) - self.assertIn("'imports' is a required property", ctx.exception.message) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(self.config_path, self.schema_path) + self.assertIn("'imports' is a required property", ctx.exception.message) def test_validate_config_invalid_imports_type(self): """Verifies that imports field being a string instead of an array raises ValidationError.""" @@ -129,14 +132,12 @@ def test_validate_config_invalid_imports_type(self): - type: linked_edges imports: "*" """) - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "aggregation.yaml") - with open(config_path, "w") as f: - f.write(invalid_imports_type_yaml) + with open(self.config_path, "w") as f: + f.write(invalid_imports_type_yaml) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config(config_path, self.schema_path) - self.assertIn("is not of type 'array'", ctx.exception.message) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(self.config_path, self.schema_path) + self.assertIn("is not of type 'array'", ctx.exception.message) def test_validate_config_invalid_stage_type(self): """Verifies that stage field being a string instead of an integer raises ValidationError.""" @@ -146,14 +147,12 @@ def test_validate_config_invalid_stage_type(self): imports: ["*"] stage: "first" """) - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "aggregation.yaml") - with open(config_path, "w") as f: - f.write(invalid_stage_type_yaml) + with open(self.config_path, "w") as f: + f.write(invalid_stage_type_yaml) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config(config_path, self.schema_path) - self.assertIn("is not of type 'integer'", ctx.exception.message) + with 
self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(self.config_path, self.schema_path) + self.assertIn("is not of type 'integer'", ctx.exception.message) def test_validate_config_invalid_stage_value(self): """Verifies that a stage value of 0 (minimum is 1) raises ValidationError.""" @@ -163,14 +162,12 @@ def test_validate_config_invalid_stage_value(self): imports: ["*"] stage: 0 """) - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "aggregation.yaml") - with open(config_path, "w") as f: - f.write(invalid_stage_value_yaml) + with open(self.config_path, "w") as f: + f.write(invalid_stage_value_yaml) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config(config_path, self.schema_path) - self.assertIn("is less than the minimum of 1", ctx.exception.message) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(self.config_path, self.schema_path) + self.assertIn("is less than the minimum of 1", ctx.exception.message) def test_validate_config_empty_imports_list(self): """Verifies that an empty imports list raises ValidationError.""" @@ -179,40 +176,34 @@ def test_validate_config_empty_imports_list(self): - type: linked_edges imports: [] """) - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "aggregation.yaml") - with open(config_path, "w") as f: - f.write(invalid_empty_imports_yaml) + with open(self.config_path, "w") as f: + f.write(invalid_empty_imports_yaml) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config(config_path, self.schema_path) - self.assertIn("should be non-empty", ctx.exception.message) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(self.config_path, self.schema_path) + self.assertIn("should be non-empty", ctx.exception.message) def test_validate_config_missing_aggregations_key(self): """Verifies that 
missing the required 'aggregations' root key raises ValidationError.""" missing_aggregations_yaml = textwrap.dedent("""\ some_other_key: [] """) - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "aggregation.yaml") - with open(config_path, "w") as f: - f.write(missing_aggregations_yaml) + with open(self.config_path, "w") as f: + f.write(missing_aggregations_yaml) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config(config_path, self.schema_path) - self.assertIn("'aggregations' is a required property", ctx.exception.message) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(self.config_path, self.schema_path) + self.assertIn("'aggregations' is a required property", ctx.exception.message) def test_validate_config_empty_file(self): """Verifies that a completely empty configuration file raises ValidationError.""" empty_yaml = "" - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "aggregation.yaml") - with open(config_path, "w") as f: - f.write(empty_yaml) + with open(self.config_path, "w") as f: + f.write(empty_yaml) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config(config_path, self.schema_path) - self.assertIn("'aggregations' is a required property", ctx.exception.message) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(self.config_path, self.schema_path) + self.assertIn("'aggregations' is a required property", ctx.exception.message) class TestValidatorConditionalDependencies(unittest.TestCase): @@ -220,6 +211,11 @@ class TestValidatorConditionalDependencies(unittest.TestCase): def setUp(self): self.schema_path = os.path.join(os.path.dirname(__file__), "schema.json") + self.tmpdir = tempfile.TemporaryDirectory() + self.config_path = os.path.join(self.tmpdir.name, "aggregation.yaml") + + def tearDown(self): + self.tmpdir.cleanup() def 
test_validate_config_place_missing_field(self): """Verifies that a place step missing the required 'source_type' raises ValidationError.""" @@ -229,14 +225,12 @@ def test_validate_config_place_missing_field(self): destination_type: State imports: ["*"] """) - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "aggregation.yaml") - with open(config_path, "w") as f: - f.write(invalid_place_missing_field_yaml) + with open(self.config_path, "w") as f: + f.write(invalid_place_missing_field_yaml) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config(config_path, self.schema_path) - self.assertIn("is not valid under any of the given schemas", ctx.exception.message) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(self.config_path, self.schema_path) + self.assertIn("is not valid under any of the given schemas", ctx.exception.message) def test_validate_config_stat_var_missing_field(self): """Verifies that a stat_var step missing the required 'source_sv_ids' raises ValidationError.""" @@ -246,14 +240,12 @@ def test_validate_config_stat_var_missing_field(self): ancestor_sv_id: Count_Person imports: ["*"] """) - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "aggregation.yaml") - with open(config_path, "w") as f: - f.write(invalid_stat_var_missing_field_yaml) + with open(self.config_path, "w") as f: + f.write(invalid_stat_var_missing_field_yaml) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config(config_path, self.schema_path) - self.assertIn("is not valid under any of the given schemas", ctx.exception.message) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(self.config_path, self.schema_path) + self.assertIn("is not valid under any of the given schemas", ctx.exception.message) def test_validate_config_stat_var_empty_source_svs(self): """Verifies that a 
stat_var step with an empty source_sv_ids array raises ValidationError.""" @@ -264,14 +256,12 @@ def test_validate_config_stat_var_empty_source_svs(self): source_sv_ids: [] imports: ["*"] """) - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "aggregation.yaml") - with open(config_path, "w") as f: - f.write(invalid_stat_var_empty_svs_yaml) + with open(self.config_path, "w") as f: + f.write(invalid_stat_var_empty_svs_yaml) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config(config_path, self.schema_path) - self.assertIn("should be non-empty", ctx.exception.message) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(self.config_path, self.schema_path) + self.assertIn("should be non-empty", ctx.exception.message) def test_validate_config_entity_missing_field(self): """Verifies that an entity step missing the required 'location_props' raises ValidationError.""" @@ -281,14 +271,12 @@ def test_validate_config_entity_missing_field(self): entity_types: ["Event"] imports: ["*"] """) - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "aggregation.yaml") - with open(config_path, "w") as f: - f.write(invalid_entity_missing_field_yaml) + with open(self.config_path, "w") as f: + f.write(invalid_entity_missing_field_yaml) - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config(config_path, self.schema_path) - self.assertIn("is not valid under any of the given schemas", ctx.exception.message) + with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: + validate_config(self.config_path, self.schema_path) + self.assertIn("is not valid under any of the given schemas", ctx.exception.message) class TestValidatorErrorsAndFileSystem(unittest.TestCase): @@ -296,6 +284,11 @@ class TestValidatorErrorsAndFileSystem(unittest.TestCase): def setUp(self): self.schema_path = 
os.path.join(os.path.dirname(__file__), "schema.json") + self.tmpdir = tempfile.TemporaryDirectory() + self.config_path = os.path.join(self.tmpdir.name, "aggregation.yaml") + + def tearDown(self): + self.tmpdir.cleanup() def test_validate_config_yaml_syntax_error(self): """Verifies that malformed YAML syntax raises YAMLError.""" @@ -305,31 +298,26 @@ def test_validate_config_yaml_syntax_error(self): imports: - "*" """) - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "aggregation.yaml") - with open(config_path, "w") as f: - f.write(malformed_yaml) + with open(self.config_path, "w") as f: + f.write(malformed_yaml) - with self.assertRaises(yaml.YAMLError): - validate_config(config_path, self.schema_path) + with self.assertRaises(yaml.YAMLError): + validate_config(self.config_path, self.schema_path) def test_validate_config_missing_config_file(self): """Verifies that a missing config file path raises FileNotFoundError.""" - with self.assertRaises(FileNotFoundError): + with self.assertRaises(FileNotFoundError) as ctx: validate_config("non_existent_config.yaml", self.schema_path) + self.assertIn("Aggregation config file not found", str(ctx.exception)) def test_validate_config_missing_schema_file(self): """Verifies that a missing schema file path raises FileNotFoundError.""" - with tempfile.TemporaryDirectory() as tmpdir: - config_path = os.path.join(tmpdir, "aggregation.yaml") - # Create a real, valid config file so it exists - with open(config_path, "w") as f: - f.write("aggregations: []") - - # Pass the real config path, but a non-existent schema path - with self.assertRaises(FileNotFoundError) as ctx: - validate_config(config_path, "non_existent_schema.json") - self.assertIn("JSON Schema file not found", str(ctx.exception)) + with open(self.config_path, "w") as f: + f.write("aggregations: []") + + with self.assertRaises(FileNotFoundError) as ctx: + validate_config(self.config_path, "non_existent_schema.json") + self.assertIn("JSON 
Schema file not found", str(ctx.exception)) if __name__ == '__main__': From cfecadc362728849367238cfeeb5140cef3d0053 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Wed, 24 Jun 2026 23:51:15 +0530 Subject: [PATCH 16/33] style(aggregation): clean up and simplify aggregation.yaml comments --- pipeline/workflow/ingestion-helper/aggregation.yaml | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation.yaml b/pipeline/workflow/ingestion-helper/aggregation.yaml index b6eb01acb..b8786aa51 100644 --- a/pipeline/workflow/ingestion-helper/aggregation.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation.yaml @@ -1,15 +1,6 @@ -# ============================================================================= -# Data Commons Aggregation Configuration -# ============================================================================= -# By default, this runs the standard global post-processing steps in parallel. -# You can customize this file to add place-based rollups, statistical variable -# aggregations, or to disable specific steps. +# Data Commons Aggregation Configuration. See the README for details. aggregations: - - # --------------------------------------------------------------------------- - # Stage 1: Standard Global Steps (Run in parallel by default) - # --------------------------------------------------------------------------- # Generates linkedContainedInPlace, linkedMemberOf, etc. 
- type: linked_edges imports: ["*"] From 05ea5d9652e66e5e51860d54f604e2c986f6122f Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Thu, 2 Jul 2026 15:25:51 +0530 Subject: [PATCH 17/33] feat(aggregation): add YAML aggregation configs --- .../aggregation/configs/entity.yaml | 188 ++ .../aggregation/configs/place.yaml | 139 ++ .../aggregation/configs/statvar.yaml | 1791 +++++++++++++++++ .../configs/statvar_calculation.yaml | 324 +++ .../aggregation/configs/statvar_series.yaml | 449 +++++ .../aggregation/configs/super_enum.yaml | 25 + 6 files changed, 2916 insertions(+) create mode 100644 pipeline/workflow/ingestion-helper/aggregation/configs/entity.yaml create mode 100644 pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml create mode 100644 pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml create mode 100644 pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml create mode 100644 pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml create mode 100644 pipeline/workflow/ingestion-helper/aggregation/configs/super_enum.yaml diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/entity.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/entity.yaml new file mode 100644 index 000000000..dcf30500d --- /dev/null +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/entity.yaml @@ -0,0 +1,188 @@ +calculations: + + # Earthquakes + - type: ENTITY_AGGREGATION + output_import: EarthquakeUSGS_Agg + input_imports: + - EarthquakeUSGS + entity_aggregation: + entity_types: + - EarthquakeEvent + location_props: + - affectedPlace + date_prop: occurrenceTime + agg_date_formats: + - YYYY + - YYYY-MM + constraints: + - "magnitude: [3 - M]" + - "magnitude: [4 - M]" + - "magnitude: [5 - M]" + - "magnitude: [6 - M]" + - "magnitude: [7 - M]" + - "magnitude: [8 - M]" + - "magnitude: [9 - M]" + - "magnitude: [3 4 M]" + - "magnitude: [4 5 M]" + - "magnitude: [5 6 M]" + - "magnitude: 
[6 7 M]" + - "magnitude: [7 8 M]" + - "magnitude: [8 9 M]" + + # Fires + - type: ENTITY_AGGREGATION + output_import: FireFAMWEB_Agg + input_imports: + - FireFAMWEB + entity_aggregation: + entity_types: + - WildlandFireEvent + location_props: + - location + date_prop: discoveryDate + agg_date_formats: + - YYYY + + - type: ENTITY_AGGREGATION + output_import: FireWFIGS_Agg + input_imports: + - FireWFIGS + entity_aggregation: + entity_types: + - FireIncidentComplexEvent + - PrescribedFireEvent + - WildlandFireEvent + location_props: + - location + date_prop: discoveryDate + agg_date_formats: + - YYYY-MM + - YYYY + + - type: ENTITY_AGGREGATION + output_import: NASA_VIIRSActiveFiresEvents_Agg + # Aggregate event counts for S2-cells of level 13 + input_imports: + - NASA_VIIRSActiveFiresEvents + entity_aggregation: + entity_types: + - FireEvent + location_props: + - affectedPlace + date_prop: startDate + agg_date_formats: + - YYYY + - YYYY-MM + + # Storms + - type: ENTITY_AGGREGATION + output_import: StormNOAA_Agg + input_imports: + - StormNOAA + entity_aggregation: + location_props: + - affectedPlace + date_prop: startDate + agg_date_formats: + - YYYY + - YYYY-MM + # The following query is used to get all the types: + # + # SELECT DISTINCT object_id + # FROM `datcom-store.dc_kg_latest.Triple` + # WHERE prov_id='dc/svnv9g3' AND predicate = 'typeOf' + entity_types: + - MarineHighWindEvent + - HighWindEvent + - HeavyRainEvent + - DebrisFlowEvent + - DenseSmokeEvent + - TropicalDepressionEvent + - LandslideEvent + - DustDevilEvent + - WinterStormEvent + - MarineHailEvent + - TsunamiEvent + - BlizzardEvent + - RipCurrentEvent + - HeavySnowEvent + - ExtremeColdWindChillEvent + - MarineDenseFogEvent + - TropicalStormEvent + - StormSurgeTideEvent + - NorthernLightsEvent + - FrostFreezeEvent + - WildfireEvent + - SleetEvent + - ColdWindChillEvent + - MarineStrongWindEvent + - LightningEvent + - SneakerwaveEvent + - AstronomicalLowTideEvent + - MarineTropicalStormEvent + - 
VolcanicAshEvent + - CoastalFloodEvent + - GeoCoordinates + - HeavyWindEvent + - IceStormEvent + - StrongWindEvent + - WinterWeatherEvent + - VolcanicAshfallEvent + - ThunderstormWindEvent + - FlashFloodEvent + - HeatEvent + - DustStormEvent + - ExcessiveHeatEvent + - HailEvent + - AvalancheEvent + - MarineThunderstormWindEvent + - LakeshoreFloodEvent + - DroughtEvent + - HighSurfEvent + - HurricaneTyphoonEvent + - LakeEffectSnowEvent + - WaterspoutEvent + - MarineTropicalDepressionEvent + - SeicheEvent + - HighSnowEvent + - DenseFogEvent + - HurricaneEvent + - MarineHurricaneTyphoonEvent + - FloodEvent + - FunnelCloudEvent + - FreezingFogEvent + - TornadoEvent + - MarineLightningEvent + - StormEpisode + + # Floods + - type: ENTITY_AGGREGATION + output_import: DynamicWorld_FloodEvents_Agg + # Aggregate event counts for S2-cells of level 13 + input_imports: + - DynamicWorld_FloodEvents + entity_aggregation: + entity_types: + - FloodEvent + location_props: + - affectedPlace + date_prop: startDate + agg_date_formats: + - YYYY + - YYYY-MM + + # Heat/Cold Temperature Events counts + - type: ENTITY_AGGREGATION + output_import: TemperatureEvents_Agg + input_imports: + - TemperatureEvents + entity_aggregation: + entity_types: + - HeatTemperatureEvent + - ColdTemperatureEvent + location_props: + - affectedPlace + date_prop: startDate + agg_date_formats: + - YYYY + - YYYY-MM diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml new file mode 100644 index 000000000..6d1108f73 --- /dev/null +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml @@ -0,0 +1,139 @@ +calculations: + - type: PLACE_AGGREGATION + input_imports: CensusACS5YearSurvey + place_aggregation: + from_place_types: State + to_place_types: Country + round: 1 + output_import: CensusACS5YearSurvey_AggCountry + + - type: PLACE_AGGREGATION + input_imports: CensusSAHIE + place_aggregation: + 
from_place_types: State + to_place_types: Country + round: 1 + output_import: CensusSAHIE_AggCountry + + - type: PLACE_AGGREGATION + input_imports: CDCMortality + place_aggregation: + from_place_types: County + to_place_types: State + round: 1 + output_import: CDCMortality_AggState + + - type: PLACE_AGGREGATION + input_imports: CDCMortality_AggState + place_aggregation: + from_place_types: State + to_place_types: Country + round: 2 + output_import: CDCMortality_AggState_AggCountry + + # - type: PLACE_AGGREGATION + # input_imports: FBIGovCrime + # place_aggregation: + # from_place_types: State + # to_place_types: Country + # round: 1 + # output_import: FBIGovCrime_AggCountry + + - type: PLACE_AGGREGATION + input_imports: DEA_ARCOS + place_aggregation: + from_place_types: County + to_place_types: State + round: 1 + output_import: DEA_ARCOS_AggState + + - type: PLACE_AGGREGATION + input_imports: DEA_ARCOS_AggState + place_aggregation: + from_place_types: State + to_place_types: Country + round: 2 + output_import: DEA_ARCOS_AggState_AggCountry + + - type: PLACE_AGGREGATION + input_imports: EPA_EJSCREEN + place_aggregation: + from_place_types: CensusBlockGroup + to_place_types: CensusTract + round: 1 + output_import: EPA_EJSCREEN_AggCensusTract + + - type: PLACE_AGGREGATION + input_imports: EPA_EJSCREEN_AggCensusTract + place_aggregation: + from_place_types: CensusTract + to_place_types: County + round: 2 + output_import: EPA_EJSCREEN_AggCensusTract_AggCounty + + - type: PLACE_AGGREGATION + input_imports: DeepSolar + place_aggregation: + from_place_types: CensusBlockGroup + to_place_types: CensusTract + round: 1 + output_import: DeepSolar_AggCensusTract + + - type: PLACE_AGGREGATION + input_imports: DeepSolar_AggCensusTract + place_aggregation: + from_place_types: CensusTract + to_place_types: County + round: 2 + output_import: DeepSolar_AggCensusTract_AggCounty + + - type: PLACE_AGGREGATION + input_imports: EPA_GHGRP + place_aggregation: + from_place_types: 
EpaReportingFacility + to_place_types: County + round: 1 + output_import: EPA_GHGRP_AggCounty + + - type: PLACE_AGGREGATION + input_imports: EPA_GHGRP + place_aggregation: + from_place_types: EpaReportingFacility + to_place_types: CensusZipCodeTabulationArea + round: 1 + output_import: EPA_GHGRP_AggCensusZipCodeTabulationArea + + - type: PLACE_AGGREGATION + input_imports: EPA_GHGRP_AggCounty + place_aggregation: + from_place_types: County + to_place_types: State + round: 2 + output_import: EPA_GHGRP_AggCounty_AggState + + - type: PLACE_AGGREGATION + input_imports: RFF_USGridGeo_WeatherVariabilityForecast + place_aggregation: + from_place_types: GeoGridPlace_0.25Deg + to_place_types: County + allow_multiple_to_places: true + round: 1 + output_import: RFF_USGridGeo_WeatherVariabilityForecast_AggCounty + + - type: PLACE_AGGREGATION + input_imports: India_RBIStateDomesticProduct + place_aggregation: + from_place_types: State + to_place_types: Country + allow_multiple_to_places: true + round: 1 + output_import: India_RBIStateDomesticProduct_AggCountry + + - type: PLACE_AGGREGATION + input_imports: India_RBIStateDomesticProduct_StatVarAgg + place_aggregation: + from_place_types: State + to_place_types: Country + allow_multiple_to_places: true + round: 1 + output_import: India_RBIStateDomesticProduct_StatVarAgg_AggCountry diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml new file mode 100644 index 000000000..fe6743c5b --- /dev/null +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml @@ -0,0 +1,1791 @@ +calculations: + +# +# Health Insurance Coverage +# ------------------------- +# + - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_HealthInsurance_StatVarAgg + input_imports: + - CensusACS5YearSurvey + - CensusACS5YearSurvey_AggCountry + stat_var_aggregation: + aggregations: + # 1. 
Census Table B18135 includes insurance coverage by disability-status and + # age (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B18135). + # We use that to compute total and by disability-status. + # + # Total uninsured + # https://screenshot.googleplex.com/v4s43nzoCC9G5Qp + - ancestor_sv_id: Count_Person_NoHealthInsurance + source_sv_ids: + # No Disability + - dc/y0dvhk0sggzef + - dc/kdg05h55y45y6 + - dc/9drszqwd2nef7 + # With Disability + - dc/bew8kj6l7tv93 + - dc/96dqj47csvmy8 + - dc/qr4s77egv27q2 + # Total insured + # https://screenshot.googleplex.com/ARqaVprXtizwETy + - ancestor_sv_id: Count_Person_WithHealthInsurance + source_sv_ids: + # No Disability + - dc/32mhsxvq7qsm4 + - dc/2s2dkbb7gz038 + - dc/dc8vqzkx18x0c + # With Disability + - dc/s5efzs4x817p5 + - dc/9j39148yn79zf + - dc/p3v76jcvdx919 + # Uninsured with no disability + # The following 4 aggregations come from parts of the above two aggregations. + - ancestor_sv_id: Count_Person_NoHealthInsurance_NoDisability + source_sv_ids: + # No Disability + - dc/y0dvhk0sggzef + - dc/kdg05h55y45y6 + - dc/9drszqwd2nef7 + # Uninsured with disability + - ancestor_sv_id: Count_Person_NoHealthInsurance_WithDisability + source_sv_ids: + # With Disability + - dc/bew8kj6l7tv93 + - dc/96dqj47csvmy8 + - dc/qr4s77egv27q2 + # Insured with no disability + - ancestor_sv_id: Count_Person_WithHealthInsurance_NoDisability + source_sv_ids: + # No Disability + - dc/32mhsxvq7qsm4 + - dc/2s2dkbb7gz038 + - dc/dc8vqzkx18x0c + # Insured with disability + - ancestor_sv_id: Count_Person_WithHealthInsurance_WithDisability + source_sv_ids: + # With Disability + - dc/s5efzs4x817p5 + - dc/9j39148yn79zf + - dc/p3v76jcvdx919 + # + # 2. Census Table B27001 includes insurance coverage by age and gender + # breakdown (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B27001). We + # use that to compute gender breakdown. 
+ # + # https://screenshot.googleplex.com/aSP3i8ASJDxct2C + - ancestor_sv_id: Count_Person_Female_NoHealthInsurance + source_sv_ids: + - dc/jx2q10tbnwhf3 + - dc/g1957rtkkrnpg + - dc/kbd1j4tv1r383 + - dc/xkfbhgbnzhm8d + - dc/001qmh9sdqzeb + - dc/qhwj3qx90h4pd + - dc/jydkzj9v0yb2 + - dc/qb6th3c5tcd92 + - dc/zj152fzswgrw6 + # https://screenshot.googleplex.com/57jf9CpaC7KGd7y + - ancestor_sv_id: Count_Person_Male_NoHealthInsurance + source_sv_ids: + - dc/fwqlkjjch0dg5 + - dc/1qdvp8x0t3l11 + - dc/33z5t6xqjepp1 + - dc/260gxgmeglpb + - dc/l8j7ggjt8yx8b + - dc/4rdkjc2485w42 + - dc/nn4xz2lv98h5c + - dc/ywjqcxc244sx2 + - dc/j88b1fy7045j1 + # https://screenshot.googleplex.com/5buzauZqyhTdnkV + - ancestor_sv_id: Count_Person_Female_WithHealthInsurance + source_sv_ids: + - dc/jjhxlkp3qk2m6 + - dc/lwqlh3vtpzw51 + - dc/1h5c2xbpd0edb + - dc/q4lk959j0bgq + - dc/1hs676pfqqthd + - dc/kkbq8kzlm6tw4 + - dc/y9xxbzfr8yb06 + - dc/whh1y53fj4p69 + - dc/965mdb0c8tef9 + # https://screenshot.googleplex.com/38JDwJJGKhNLmWM + - ancestor_sv_id: Count_Person_Male_WithHealthInsurance + source_sv_ids: + - dc/ve6s1v1eyrbt7 + - dc/1n4mf17q78xtd + - dc/zyhtyhs9efej3 + - dc/y6tzpetgq89v4 + - dc/js56cehzj7wf4 + - dc/yrmj9vp8b8n1d + - dc/wp3txgscfen9c + - dc/b3n86k0h3h2w + - dc/j8se7wv2gbdfd + # dc/g/Person_HealthInsurance-NoPrivateHealthInsurance + # Level 3 + - ancestor_sv_id: Count_Person_NoPrivateHealthInsurance + source_sv_ids: + # Population: 26 - 34 Years, Female, No Private Health Insurance + # Population: 75 Years or More, Male, No Private Health Insurance + # Population: 6 Years or Less, Male, No Private Health Insurance + # Population: 55 - 64 Years, Female, No Private Health Insurance + # Population: 65 - 74 Years, Female, No Private Health Insurance + # Population: 18 - 24 Years, Female, No Private Health Insurance + # Population: 25 - 34 Years, Male, No Private Health Insurance + # Population: 45 - 54 Years, Female, No Private Health Insurance + # Population: 35 - 44 Years, Female, No 
Private Health Insurance + # Population: 19 - 25 Years, Male, No Private Health Insurance + # Population: 6 - 17 Years, Female, No Private Health Insurance + # Population: 26 - 34 Years, Male, No Private Health Insurance + # Population: 45 - 54 Years, Male, No Private Health Insurance + # Population: 6 Years or Less, Female, No Private Health Insurance + # Population: 6 - 18 Years, Male, No Private Health Insurance + # Population: 19 - 25 Years, Female, No Private Health Insurance + # Population: 25 - 34 Years, Female, No Private Health Insurance + # Population: 55 - 64 Years, Male, No Private Health Insurance + # Population: 65 - 74 Years, Male, No Private Health Insurance + # Population: 6 - 18 Years, Female, No Private Health Insurance + # Population: 18 - 24 Years, Male, No Private Health Insurance + # Population: 75 Years or More, Female, No Private Health Insurance + # Population: 35 - 44 Years, Male, No Private Health Insurance + # Population: 6 - 17 Years, Male, No Private Health Insurance + # dc/g/Person_HealthInsurance-NoPublicHealthInsurance + # Level 3 + - ancestor_sv_id: Count_Person_NoPublicHealthInsurance + source_sv_ids: + # Population: 26 - 34 Years, Female, No Public Health Insurance + # Population: 65 - 74 Years, Male, No Public Health Insurance + # Population: 19 - 25 Years, Female, No Public Health Insurance + # Population: 45 - 54 Years, Female, No Public Health Insurance + # Population: 26 - 34 Years, Male, No Public Health Insurance + # Population: 35 - 44 Years, Male, No Public Health Insurance + # Population: 19 - 25 Years, Male, No Public Health Insurance + # Population: 6 - 18 Years, Male, No Public Health Insurance + # Population: 65 - 74 Years, Female, No Public Health Insurance + # Population: 45 - 54 Years, Male, No Public Health Insurance + # Population: 75 Years or More, Male, No Public Health Insurance + # Population: 6 - 18 Years, Female, No Public Health Insurance + # Population: 35 - 44 Years, Female, No Public Health Insurance 
+ # Population: 55 - 64 Years, Male, No Public Health Insurance + # Population: 6 - 17 Years, Male, No Public Health Insurance + # Population: 18 - 24 Years, Female, No Public Health Insurance + # Population: 18 - 24 Years, Male, No Public Health Insurance + # Population: 75 Years or More, Female, No Public Health Insurance + # Population: 6 Years or Less, Female, No Public Health Insurance + # Population: 25 - 34 Years, Male, No Public Health Insurance + # Population: 6 - 17 Years, Female, No Public Health Insurance + # Population: 6 Years or Less, Male, No Public Health Insurance + # Population: 55 - 64 Years, Female, No Public Health Insurance + # Population: 25 - 34 Years, Female, No Public Health Insurance + # dc/g/Person_HealthInsurance-WithOneTypeOfHealthInsurance + # Level 1 + - ancestor_sv_id: Count_Person_WithOneTypeOfHealthInsurance + source_sv_ids: + # Population: 18 Years or Less, With One Type of Health Insurance + # Population: 18 - 34 Years, With One Type of Health Insurance + # Population: 35 - 64 Years, With One Type of Health Insurance + # Population: 19 Years or Less, With One Type of Health Insurance + # Population: 19 - 34 Years, With One Type of Health Insurance + # Population: 65 Years or More, With One Type of Health Insurance + # dc/g/Person_HealthInsurance-WithPublicCoverage + # Level 3 + - ancestor_sv_id: Count_Person_WithPublicCoverage + source_sv_ids: + # Population: 18 Years or Less, With Disability, With Public Coverage + # Population: 19 Years or Less, With Disability, With Public Coverage + # Population: 19 Years or Less, No Disability, With Public Coverage + # Population: 65 Years or More, No Disability, With Public Coverage + # Population: 19 - 64 Years, No Disability, With Public Coverage + # Population: 65 Years or More, With Disability, With Public Coverage + # Population: 18 - 64 Years, With Disability, With Public Coverage + # Population: 18 Years or Less, No Disability, With Public Coverage + # Population: 18 - 64 Years, No 
Disability, With Public Coverage + # Population: 19 - 64 Years, With Disability, With Public Coverage + + # + # Ability to speak English + # ------------------------ + # + # We aggregate abilityToSpeakEnglish over nativity (2) and languageSpokenAtHome + # (4) for a total of 8 source SVs from Census Table B16005 + # (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B16005) + # + # NOTE: Values SpeakEnglishLessThanVeryWell and SpeakEnglishVeryWell already + # have population values from a different table. + # + - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_AbilityToSpeakEnglish_StatVarAgg + input_imports: + - CensusACS5YearSurvey + - CensusACS5YearSurvey_AggCountry + stat_var_aggregation: + aggregations: + # https://screenshot.googleplex.com/3Fw42DhNnD5fPLi + - ancestor_sv_id: Count_Person_SpeakEnglishNotAtAll + source_sv_ids: + - dc/edfjy64gmxf6f + - dc/0165994v0l8fh + - dc/0ltdeskbvkn3d + - dc/ves02m300f6lc + - dc/vh52n3qgw0kt + - dc/6yzk8t4e9t6v6 + - dc/0kx32ff9c6d79 + - dc/7cxlkzf56zk26 + # https://screenshot.googleplex.com/3ExaNScXGyyEP2c + - ancestor_sv_id: Count_Person_SpeakEnglishNotWell + source_sv_ids: + - dc/f8qh4hp1830dg + - dc/3f6nmhf0zhtd4 + - dc/l1zr0zx1yje61 + - dc/zh4sly3kmwbwg + - dc/e4jl4qe9xc4n9 + - dc/6rzmvxpgqlww6 + - dc/wf1rm5zx34dtg + - dc/4xykjw3v6n4t3 + # https://screenshot.googleplex.com/AaaSNUMpQwHRvSZ + - ancestor_sv_id: Count_Person_SpeakEnglishWell + source_sv_ids: + - dc/4n1wg2c7hjem + - dc/hbqhxpwsdvyj8 + - dc/ewchtlvjlljk5 + - dc/7st63yed0l4x + - dc/vx85c3xzwg981 + - dc/92mnzpclp7tbd + - dc/yx0mdj3dnk3mb + - dc/1x48nzrne2f88 + + # + # In Armed Forces + # --------------- + # + # Census Table B23001 includes count of people in armed forces (in which case + # being in the labor force is implied) broken down by gender and age. From that we + # aggregate to total and by gender. 
+ # (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B23001) + # + - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_InArmedForces_StatVarAgg + input_imports: + - CensusACS5YearSurvey + - CensusACS5YearSurvey_AggCountry + stat_var_aggregation: + aggregations: + # Women in armed forces + # https://screenshot.googleplex.com/3ik3W3g2eqMjkSm + - ancestor_sv_id: Count_Person_Female_InArmedForces + source_sv_ids: + - dc/173smewzddlb + - dc/fjd99f6xm3mz3 + - dc/n9wqebpd8jyhc + - dc/7tej0l9s6x2rh + - dc/hqz1tck7sdrm2 + - dc/tmw7srr91bj41 + - dc/lzzl0rx7m1yf9 + - dc/8d182v9j04mg4 + - dc/lb3lb4mg82mph + - dc/g2m31qc7q1x64 + # Men in armed forces + # https://screenshot.googleplex.com/6dvhYcbmtHsjifQ + - ancestor_sv_id: Count_Person_Male_InArmedForces + source_sv_ids: + - dc/vp1gqv00d2ql3 + - dc/04nv03cmv6btf + - dc/rhgnypl16nh6c + - dc/mtqbxrg29f64h + - dc/gd0jznk5n2kb3 + - dc/4f6fv8bdstp3b + - dc/rlrwr4f7r4pg7 + - dc/x63tjfw28tzvc + - dc/fpx513jvf4xed + - dc/2zj8jthd4f563 + # Armed forces population, from combining the above two sets of SVs. 
+ - ancestor_sv_id: Count_Person_InArmedForces + source_sv_ids: + # Female + - dc/173smewzddlb + - dc/fjd99f6xm3mz3 + - dc/n9wqebpd8jyhc + - dc/7tej0l9s6x2rh + - dc/hqz1tck7sdrm2 + - dc/tmw7srr91bj41 + - dc/lzzl0rx7m1yf9 + - dc/8d182v9j04mg4 + - dc/lb3lb4mg82mph + - dc/g2m31qc7q1x64 + # Male + - dc/vp1gqv00d2ql3 + - dc/04nv03cmv6btf + - dc/rhgnypl16nh6c + - dc/mtqbxrg29f64h + - dc/gd0jznk5n2kb3 + - dc/4f6fv8bdstp3b + - dc/rlrwr4f7r4pg7 + - dc/x63tjfw28tzvc + - dc/fpx513jvf4xed + - dc/2zj8jthd4f563 + + # Education - CensusACS5YearSurvey + - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_Education_StatVarAgg + input_imports: + - CensusACS5YearSurvey + - CensusACS5YearSurvey_AggCountry + stat_var_aggregation: + aggregations: + # dc/g/Person_EducationalAttainment-5ThAnd6ThGrade + # Level 1 + - ancestor_sv_id: Count_Person_Years25Onwards_EducationalAttainment_5ThAnd6ThGrade + source_sv_ids: + # Population: 5th And 6th Grade, Female + # Population: 5th And 6th Grade, Male + # dc/g/Person_EducationalAttainment-7ThAnd8ThGrade + # Level 1 + - ancestor_sv_id: Count_Person_Years25Onwards_EducationalAttainment_7ThAnd8ThGrade + source_sv_ids: + # Population: 7th And 8th Grade, Female + # Population: 7th And 8th Grade, Male + # dc/g/Person_EducationalAttainment-9ThTo12ThGradeNoDiploma + # Level 3 + - ancestor_sv_id: Count_Person_EducationalAttainment_9ThTo12ThGradeNoDiploma + source_sv_ids: + # Population: 18 - 24 Years, 9th To 12th Grade No Diploma, Female + # Population: 18 - 24 Years, 9th To 12th Grade No Diploma, Male + # Population: 25 - 34 Years, 9th To 12th Grade No Diploma, Female + # Population: 25 - 34 Years, 9th To 12th Grade No Diploma, Male + # Population: 35 - 44 Years, 9th To 12th Grade No Diploma, Female + # Population: 35 - 44 Years, 9th To 12th Grade No Diploma, Male + # Population: 45 - 64 Years, 9th To 12th Grade No Diploma, Female + # Population: 45 - 64 Years, 9th To 12th Grade No Diploma, Male + # Population: 65 Years or More, 9th To 12th 
Grade No Diploma, Female + # Population: 65 Years or More, 9th To 12th Grade No Diploma, Male + # dc/g/Person_EducationalAttainment-LessThan9ThGrade + # Level 3 + - ancestor_sv_id: Count_Person_EducationalAttainment_LessThan9ThGrade + source_sv_ids: + # Population: 18 - 24 Years, Less Than 9th Grade, Female + # Population: 18 - 24 Years, Less Than 9th Grade, Male + # Population: 25 - 34 Years, Less Than 9th Grade, Female + # Population: 25 - 34 Years, Less Than 9th Grade, Male + # Population: 35 - 44 Years, Less Than 9th Grade, Female + # Population: 35 - 44 Years, Less Than 9th Grade, Male + # Population: 45 - 64 Years, Less Than 9th Grade, Female + # Population: 45 - 64 Years, Less Than 9th Grade, Male + # Population: 65 Years or More, Less Than 9th Grade, Female + # Population: 65 Years or More, Less Than 9th Grade, Male + # dc/g/Person_EducationalAttainment-LessThanHighSchoolDiploma + # Level 3 + - ancestor_sv_id: Count_Person_EducationalAttainment_LessThanHighSchoolDiploma + source_sv_ids: + # Population: Less Than High School Diploma, Male, Two or More Races + # Population: Less Than High School Diploma, Male, Asian Alone + # Population: Less Than High School Diploma, Female, Hispanic or Latino + # Population: Less Than High School Diploma, Male, White Alone + # Population: Less Than High School Diploma, Female, Asian Alone + # Population: Less Than High School Diploma, Male, Hispanic or Latino + # Population: Less Than High School Diploma, Male, Some Other Race Alone + # Population: Less Than High School Diploma, Female, White Alone Not Hispanic or Latino + # Population: Less Than High School Diploma, Male, Black or African American Alone + # Population: Less Than High School Diploma, Female, Black or African American Alone + # Population: Less Than High School Diploma, Male, Native Hawaiian or Other Pacific Islander Alone + # Population: Less Than High School Diploma, Male, American Indian or Alaska Native Alone + # Population: Less Than High School 
Diploma, Female, Native Hawaiian or Other Pacific Islander Alone + # Population: Less Than High School Diploma, Female, Some Other Race Alone + # Population: Less Than High School Diploma, Female, American Indian or Alaska Native Alone + # Population: Less Than High School Diploma, Female, White Alone + # Population: Less Than High School Diploma, Male, White Alone Not Hispanic or Latino + # Population: Less Than High School Diploma, Female, Two or More Races + # dc/g/Person_EducationalAttainment-NurseryTo4ThGrade + # Level 1 + - ancestor_sv_id: Count_Person_Years25Onwards_EducationalAttainment_NurseryTo4ThGrade + source_sv_ids: + # Population: Nursery To 4th Grade, Female + # Population: Nursery To 4th Grade, Male + # dc/g/Person_EducationalAttainment-SomeCollegeNoDegree + # Level 3 + - ancestor_sv_id: Count_Person_EducationalAttainment_SomeCollegeNoDegree + source_sv_ids: + # Population: 18 - 24 Years, Some College No Degree, Female + # Population: 18 - 24 Years, Some College No Degree, Male + # Population: 25 - 34 Years, Some College No Degree, Female + # Population: 25 - 34 Years, Some College No Degree, Male + # Population: 35 - 44 Years, Some College No Degree, Female + # Population: 35 - 44 Years, Some College No Degree, Male + # Population: 45 - 64 Years, Some College No Degree, Female + # Population: 45 - 64 Years, Some College No Degree, Male + # Population: 65 Years or More, Some College No Degree, Female + # Population: 65 Years or More, Some College No Degree, Male + - ancestor_sv_id: Count_Person_EducationalAttainment_1StTo12ThGrade + source_sv_ids: + - Count_Person_EducationalAttainment1StGrade + - Count_Person_EducationalAttainment2NdGrade + - Count_Person_EducationalAttainment3RdGrade + - Count_Person_EducationalAttainment4ThGrade + - Count_Person_EducationalAttainment5ThGrade + - Count_Person_EducationalAttainment6ThGrade + - Count_Person_EducationalAttainment7ThGrade + - Count_Person_EducationalAttainment8ThGrade + - 
Count_Person_EducationalAttainment9ThGrade + - Count_Person_EducationalAttainment10ThGrade + - Count_Person_EducationalAttainment11ThGrade + - Count_Person_EducationalAttainment12ThGradeNoDiploma + + # The following 9 aggregations are: SomeCollegeOrAssociatesDegree, by race. + - ancestor_sv_id: Count_Person_25OrMoreYears_SomeCollegeOrAssociatesDegree_AmericanIndianOrAlaskaNativeAlone + source_sv_ids: + # Population: Some College or Associates Degree, Female, American Indian or Alaska Native Alone + - dc/mqxdr821c7kw3 + # Population: Some College or Associates Degree, Male, American Indian or Alaska Native Alone + - dc/lyt8y97bbrkpc + - ancestor_sv_id: Count_Person_25OrMoreYears_SomeCollegeOrAssociatesDegree_AsianAlone + source_sv_ids: + # Population: Some College or Associates Degree, Female, Asian Alone + - dc/whn99h1l0xgth + # Population: Some College or Associates Degree, Male, Asian Alone + - dc/fkvnj4rlrs84f + - ancestor_sv_id: Count_Person_25OrMoreYears_SomeCollegeOrAssociatesDegree_BlackOrAfricanAmericanAlone + source_sv_ids: + # Population: Some College or Associates Degree, Female, Black or African American Alone + - dc/56md3ndhrmvm7 + # Population: Some College or Associates Degree, Male, Black or African American Alone + - dc/e7h67vn5w4g13 + - ancestor_sv_id: Count_Person_25OrMoreYears_SomeCollegeOrAssociatesDegree_HispanicOrLatino + source_sv_ids: + # Population: Some College or Associates Degree, Female, Hispanic or Latino + - dc/3w039ndqy7qv1 + # Population: Some College or Associates Degree, Male, Hispanic or Latino + - dc/7xf6mm0sg9y18 + - ancestor_sv_id: Count_Person_25OrMoreYears_SomeCollegeOrAssociatesDegree_NativeHawaiianOrOtherPacificIslanderAlone + source_sv_ids: + # Population: Some College or Associates Degree, Female, Native Hawaiian or Other Pacific Islander Alone + - dc/kpcfmb6lp3zpd + # Population: Some College or Associates Degree, Male, Native Hawaiian or Other Pacific Islander Alone + - dc/r3l3rfl1ms85f + - ancestor_sv_id: 
Count_Person_25OrMoreYears_SomeCollegeOrAssociatesDegree_SomeOtherRaceAlone + source_sv_ids: + # Population: Some College or Associates Degree, Female, Some Other Race Alone + - dc/epw58ne8mytn5 + # Population: Some College or Associates Degree, Male, Some Other Race Alone + - dc/t0mcpxqgr2lm9 + - ancestor_sv_id: Count_Person_25OrMoreYears_SomeCollegeOrAssociatesDegree_TwoOrMoreRaces + source_sv_ids: + # Population: Some College or Associates Degree, Female, Two or More Races + - dc/q98jxycvs422f + # Population: Some College or Associates Degree, Male, Two or More Races + - dc/zjlfv8d8v14f8 + - ancestor_sv_id: Count_Person_25OrMoreYears_SomeCollegeOrAssociatesDegree_WhiteAlone + source_sv_ids: + # Population: Some College or Associates Degree, Female, White Alone + - dc/9sneyc8lpk8dc + # Population: Some College or Associates Degree, Male, White Alone + - dc/d2ct8qmvcct81 + - ancestor_sv_id: Count_Person_25OrMoreYears_SomeCollegeOrAssociatesDegree_WhiteAloneNotHispanicOrLatino + source_sv_ids: + # Population: Some College or Associates Degree, Female, White Alone Not Hispanic or Latino + - dc/bqy52h7y4nq34 + # Population: Some College or Associates Degree, Male, White Alone Not Hispanic or Latino + - dc/dc9v9h3q8l8n7 + + # Education - ACSED5YrSurvey + - type: STAT_VAR_AGGREGATION + output_import: ACSED5YrSurvey_StatVarAgg + input_imports: + - ACSED5YrSurvey + stat_var_aggregation: + aggregations: + - ancestor_sv_id: Count_Parent_Occupation_Management_Business_Science_Arts + source_sv_ids: + # Count of Parent: 16 Years or More, Civilian, Public School, Employed, in Labor Force, Management, Business, Science, And Arts Occupations + - dc/zvmdlctml84sb + - ancestor_sv_id: Count_Parent_Occupation_Natural_Resources_Construction_Maintenance + source_sv_ids: + # Count of Parent: 16 Years or More, Civilian, Public School, Employed, in Labor Force, Natural Resources, Construction, And Maintenance Occupations + - dc/fsrwm5cy84lmf + - ancestor_sv_id: 
Count_Parent_Occupation_Production_Transportation_Material_Moving + source_sv_ids: + # Count of Parent: 16 Years or More, Civilian, Public School, Employed, in Labor Force, Production, Transportation, And Material Moving Occupations + - dc/jbyb43crb1t37 + - ancestor_sv_id: Count_Parent_Occupation_Sales_Office + source_sv_ids: + # Count of Parent: 16 Years or More, Civilian, Public School, Employed, in Labor Force, Sales And Office Occupations + - dc/p029jbzf00pw6 + - ancestor_sv_id: Count_Parent_Occupation_Service + source_sv_ids: + # Count of Parent: 16 Years or More, Civilian, Public School, Employed, in Labor Force, Service Occupations + - dc/bstxmnb4k1wrb + + # Demographics + - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_Demographics_StatVarAgg + input_imports: + - CensusACS5YearSurvey + - CensusACS5YearSurvey_AggCountry + stat_var_aggregation: + aggregations: + - ancestor_sv_id: Count_Person_85OrMoreYears + source_sv_ids: + - Count_Person_85OrMoreYears_Male + - Count_Person_85OrMoreYears_Female + - ancestor_sv_id: Count_Person_0To4Years + source_sv_ids: + - Count_Person_0To4Years_Male + - Count_Person_0To4Years_Female + + # Agriculture + - type: STAT_VAR_AGGREGATION + output_import: USDA_AgricultureCensus_Agriculture_StatVarAgg + input_imports: + - USDA_AgricultureCensus + stat_var_aggregation: + aggregations: + - ancestor_sv_id: Count_Person_Producer + source_sv_ids: + - Count_Person_Producer_BlackOrAfricanAmericanAlone + - Count_Person_Producer_AmericanIndianOrAlaskaNativeAlone + - Count_Person_Producer_NativeHawaiianOrOtherPacificIslanderAlone + - Count_Person_Producer_WhiteAlone + - Count_Person_Producer_TwoOrMoreRaces + - Count_Person_Producer_HispanicOrLatino + - Count_Person_Producer_AsianAlone + skip_all_sources_present_check: true + + # + # Employment + # ------------------------ + + - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_Employment_StatVarAgg + input_imports: + - CensusACS5YearSurvey + - 
CensusACS5YearSurvey_AggCountry + stat_var_aggregation: + aggregations: + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSAgricultureForestryFishingHunting + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Agriculture, Forestry, Fishing And Hunting (NAICS/11) + - dc/scgpgdxgx9tr1 + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Agriculture, Forestry, Fishing And Hunting (NAICS/11) + - dc/klz4pcgfe44rb + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSMiningQuarryingOilGasExtraction + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Mining, Quarrying, And Oil And Gas Extraction (NAICS/21) + - dc/jcq9116jwjvs3 + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Mining, Quarrying, And Oil And Gas Extraction (NAICS/21) + - dc/xkywpwshj3ttb + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSUtilities + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Utilities (NAICS/22) + - dc/ewhc3j80tx5z1 + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Utilities (NAICS/22) + - dc/hjepq5mk3yvy9 + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSConstruction + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Construction (NAICS/23) + - dc/1h1rnmvtj706b + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Construction (NAICS/23) + - dc/0llhlrmmm5ce2 + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSManufacturing + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Manufacturing (NAICS/31-33) + - dc/xsqgzwf1h8f33 + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Manufacturing 
(NAICS/31-33) + - dc/ypmrvdtd4e159 + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSWholesaleTrade + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Wholesale Trade (NAICS/42) + - dc/v2q0shf60bejd + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Wholesale Trade (NAICS/42) + - dc/3cn2vsmlfsfv1 + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSRetailTrade + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Retail Trade (NAICS/44-45) + - dc/qcfd17k1c3vm9 + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Retail Trade (NAICS/44-45) + - dc/3hh7cp878qgjb + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSTransportationWarehousing + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Transportation And Warehousing (NAICS/48-49) + - dc/cq266pvtkbll4 + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Transportation And Warehousing (NAICS/48-49) + - dc/j9f2bf9fb5y1g + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSInformation + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Information (NAICS/51) + - dc/c9yj0kkxpf8ff + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Information (NAICS/51) + - dc/20cmnywrsexnc + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSFinanceInsurance + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Finance And Insurance (NAICS/52) + - dc/e7yf34hemnde9 + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Finance And Insurance (NAICS/52) + - dc/3gcb6txyx3t88 + - ancestor_sv_id: 
Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_Count_Establishment_NAICSRealEstateRentalLeasing + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Real Estate And Rental And Leasing (NAICS/53) + - dc/l292my8mjkns + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Real Estate And Rental And Leasing (NAICS/53) + - dc/4zgbhyej0lnd + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSProfessionalScientificTechnicalServices + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Professional, Scientific, And Technical Services (NAICS/54) + - dc/lc629dk4yeg4b + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Professional, Scientific, And Technical Services (NAICS/54) + - dc/gtbj7nt7jh608 + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSManagementOfCompaniesEnterprises + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Management of Companies And Enterprises (NAICS/55) + - dc/zt2qhhtnjm4qh + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Management of Companies And Enterprises (NAICS/55) + - dc/z9qrge1jf1k3h + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSAdministrativeSupportWasteManagementRemediationServices + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Administrative And Support And Waste Management Services (NAICS/56) + - dc/ff2f4my72zx0c + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Administrative And Support And Waste Management Services (NAICS/56) + - dc/cmy92ne7lqr4d + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSEducationalServices + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Educational 
Services (NAICS/61) + - dc/1wr6sspk9nrjc + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Educational Services (NAICS/61) + - dc/xjvlbg6jjb6q8 + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSHealthCareSocialAssistance + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Health Care And Social Assistance (NAICS/62) + - dc/nmd0vjps6psb2 + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Health Care And Social Assistance (NAICS/62) + - dc/4gr6y83p0hvj + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSArtsEntertainmentRecreation + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Arts, Entertainment, And Recreation (NAICS/71) + - dc/9hrltcsyj2mcb + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Arts, Entertainment, And Recreation (NAICS/71) + - dc/0xmrxl7ztb4e7 + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSAccommodationFoodServices + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Accommodation And Food Services (NAICS/72) + - dc/rydmsxhz7pxh1 + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Accommodation And Food Services (NAICS/72) + - dc/pn22zf92qr31f + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSOtherServices + source_sv_ids: + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Female, Other Services, Except Public Administration (NAICS/81) + - dc/cyc1f26msdct4 + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Other Services, Except Public Administration (NAICS/81) + - dc/ns2e3khxs6gw7 + - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSPublicAdministration + source_sv_ids: + # Population: 16 Years or More, Civilian, 
Employed, in Labor Force, Female, Public Administration (NAICS/92) + - dc/ylj424exymm1b + # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Public Administration (NAICS/92) + - dc/589p0gc36qem5 + + # Census Table B23001 includes count of employed civilians broken down by gender + # and age. From that we aggregate over age to get count of employed civilians + # for each gender. + # (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B23001) + # +# - type: STAT_VAR_AGGREGATION + # type: STAT_VAR_AGGREGATION +# output_import: CensusACS5YearSurvey_Employment_StatVarAgg +# input_imports: +# - CensusACS5YearSurvey +# - CensusACS5YearSurvey_AggCountry +# stat_var_aggregation: +# aggregations: + # # Women Employed + # # https://screenshot.googleplex.com/AYFWvJqXa3EB2eU + # + # # The ancestor has "armedForcesStatus: dcs:Civilian", but the following sources + # # don't have it. Fix that before adding this to the aggregation. + # # dc/kz49wc5n3lhpd + # # dc/qcpg8c533syd6 + # # dc/zv26z833d7g3c +# - ancestor_sv_id: Count_Person_Female_Employed +# source_sv_ids: +# - dc/yt1fm72s1y7b7 +# - dc/hj65vwnt5csr7 +# - dc/0sd3x3bb4qet5 +# - dc/j4mzcc63n5zq5 +# - dc/56jhsezrzl049 +# - dc/ggx918j9p5tnf +# - dc/8rw47nf3ngle7 +# - dc/hplj99j7mbfsh +# - dc/ksmx7fwfkm8lb +# - dc/6706lc55kg5d +# - dc/qcpg8c533syd6 +# - dc/zv26z833d7g3c +# - dc/kz49wc5n3lhpd + # } + # + # # The ancestor has "armedForcesStatus: dcs:Civilian", but the following sources + # # don't have it. Fix that before adding this to the aggregation. 
+ # # dc/5cxs4br0jz02c + # # dc/s909dd4r22fw + # # dc/xepldf55yq6s5 + # + # # Men Employed + # # https://screenshot.googleplex.com/C24sFkxNbUpWiQJ +# - ancestor_sv_id: Count_Person_Male_Employed +# source_sv_ids: +# - dc/2s6hps4z1qced +# - dc/xvtk9180lme1h +# - dc/twdr4c500yev5 +# - dc/n0btf3nglhqqc +# - dc/7qwvkhr4tsyt8 +# - dc/pkpm179bbh822 +# - dc/gyrw68q6x77l9 +# - dc/ly0fvmlf4mtf5 +# - dc/m020zpvzch2gd +# - dc/sgm602ncby3bf +# - dc/xepldf55yq6s5 +# - dc/s909dd4r22fw +# - dc/5cxs4br0jz02c + # } + + # Crime + - type: STAT_VAR_AGGREGATION + output_import: USNationalPrisonerStatistics_Crime_StatVarAgg + input_imports: + - USNationalPrisonerStatistics + stat_var_aggregation: + aggregations: + - ancestor_sv_id: Count_Person_CorrectionalFacilityLocation_OutOfState + source_sv_ids: + - dc/qgv9d3frn35qc + - dc/91vy0sf20wlg9 + + # WithOwnChildrenUnder18. + - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_SubjectTables_S1251_StatVarAgg + input_imports: + - CensusACS5YearSurvey_SubjectTables_S1251 + stat_var_aggregation: + aggregations: + - ancestor_sv_id: Count_Person_WithOwnChildrenUnder18_Female + source_sv_ids: + - Count_Person_WithOwnChildrenUnder18_Female_FamilyHousehold_MarriedInThePast12Months_ResidesInHousehold + - Count_Person_WithOwnChildrenUnder18_Female_FamilyHousehold_DivorcedInThePast12Months_ResidesInHousehold + - ancestor_sv_id: Count_Person_WithOwnChildrenUnder18_Male + source_sv_ids: + - Count_Person_WithOwnChildrenUnder18_Male_FamilyHousehold_MarriedInThePast12Months_ResidesInHousehold + - Count_Person_WithOwnChildrenUnder18_Male_FamilyHousehold_DivorcedInThePast12Months_ResidesInHousehold + - ancestor_sv_id: Count_Person_WithOwnChildrenUnder18 + source_sv_ids: + - Count_Person_WithOwnChildrenUnder18_Female + - Count_Person_WithOwnChildrenUnder18_Male + + # Marriage + - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_SubjectTables_S1201_StatVarAgg + input_imports: + - CensusACS5YearSurvey_SubjectTables_S1201 + 
stat_var_aggregation: + aggregations: + - ancestor_sv_id: Count_Person_InLaborForce_Divorced + source_sv_ids: + - Count_Person_InLaborForce_Female_Divorced + - Count_Person_InLaborForce_Male_Divorced + - ancestor_sv_id: Count_Person_InLaborForce_NeverMarried + source_sv_ids: + - Count_Person_InLaborForce_Female_NeverMarried + - Count_Person_InLaborForce_Male_NeverMarried + - ancestor_sv_id: Count_Person_InLaborForce_MarriedAndNotSeparated + source_sv_ids: + - Count_Person_InLaborForce_Female_MarriedAndNotSeparated + - Count_Person_InLaborForce_Male_MarriedAndNotSeparated + - ancestor_sv_id: Count_Person_InLaborForce_Widowed + source_sv_ids: + - Count_Person_InLaborForce_Female_Widowed + - Count_Person_InLaborForce_Male_Widowed + + # Employment by business ownership type. + - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_SubjectTables_S2408_StatVarAgg + input_imports: + - CensusACS5YearSurvey_SubjectTables_S2408 + stat_var_aggregation: + aggregations: + - ancestor_sv_id: Count_Person_PrivatelyOwnedNotForProfitEstablishment_PaidWorker + source_sv_ids: + - Count_Person_PrivatelyOwnedNotForProfitEstablishment_Male_PaidWorker + - Count_Person_PrivatelyOwnedNotForProfitEstablishment_Female_PaidWorker + - ancestor_sv_id: Count_Person_PrivatelyOwnedForProfitEstablishment_PaidWorker + source_sv_ids: + - Count_Person_PrivatelyOwnedForProfitEstablishment_Male_PaidWorker + - Count_Person_PrivatelyOwnedForProfitEstablishment_Female_PaidWorker + + # US Citizen by Naturalization + - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_SubjectTables_S0504_StatVarAgg + input_imports: + - CensusACS5YearSurvey_SubjectTables_S0504 + stat_var_aggregation: + aggregations: + - ancestor_sv_id: Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn + source_sv_ids: + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthAfrica + - 
Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthCaribbean + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthCentralAmericaExceptMexico + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthMexico + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthEasternAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthLatinAmerica + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthNorthamerica + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthNorthernWesternEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthOceania + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthSouthCentralAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthSouthEasternAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthSouthamerica + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthSouthernEasternEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn_PlaceOfBirthWesternAsia + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn + source_sv_ids: + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthAfrica + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthCaribbean + - 
Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthCentralAmericaExceptMexico + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthMexico + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthEasternAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthLatinAmerica + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthNorthamerica + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthNorthernWesternEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthOceania + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthSouthCentralAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthSouthEasternAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthSouthamerica + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthSouthernEasternEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn_PlaceOfBirthWesternAsia + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn + source_sv_ids: + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthAfrica + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthCaribbean + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthCentralAmericaExceptMexico + - 
Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthMexico + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthEasternAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthLatinAmerica + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthNorthamerica + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthNorthernWesternEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthOceania + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthSouthCentralAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthSouthEasternAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthSouthamerica + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthSouthernEasternEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn_PlaceOfBirthWesternAsia + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn + source_sv_ids: + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthAfrica + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthCaribbean + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthCentralAmericaExceptMexico + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthMexico + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthEasternAsia + - 
Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthLatinAmerica + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthNorthamerica + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthNorthernWesternEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthOceania + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthSouthCentralAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthSouthEasternAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthSouthamerica + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthSouthernEasternEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn_PlaceOfBirthWesternAsia + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn + source_sv_ids: + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthAfrica + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthCaribbean + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthCentralAmericaExceptMexico + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthMexico + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthEasternAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthEurope + - 
Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthLatinAmerica + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthNorthamerica + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthNorthernWesternEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthOceania + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthSouthCentralAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthSouthEasternAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthSouthamerica + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthSouthernEasternEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn_PlaceOfBirthWesternAsia + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn + source_sv_ids: + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthAfrica + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthCaribbean + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthCentralAmericaExceptMexico + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthMexico + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthEasternAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthLatinAmerica + - 
Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthNorthamerica + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthNorthernWesternEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthOceania + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthSouthCentralAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthSouthEasternAsia + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthSouthamerica + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthSouthernEasternEurope + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn_PlaceOfBirthWesternAsia + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_USCitizenByNaturalization + source_sv_ids: + - Count_Person_USCitizenByNaturalization_DateOfEntry1990OrEarlier_ForeignBorn + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrEarlier_ForeignBorn + - Count_Person_USCitizenByNaturalization_DateOfEntry2000To2009_ForeignBorn + - Count_Person_USCitizenByNaturalization_DateOfEntry1990To1999_ForeignBorn + - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn + - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn + + # HousingUnit HomeValue. 
+ - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_HousingUnit_HomeValue_StatVarAgg + input_imports: + - CensusACS5YearSurvey + - CensusACS5YearSurvey_AggCountry + stat_var_aggregation: + aggregations: + # Census ACS 5 year, Count_HousingUnit_HomeValue + - ancestor_sv_id: Count_HousingUnit_HomeValueUpto49999USDollar + source_sv_ids: + - Count_HousingUnit_HomeValueUpto10000USDollar + - Count_HousingUnit_HomeValue10000To14999USDollar + - Count_HousingUnit_HomeValue15000To19999USDollar + - Count_HousingUnit_HomeValue20000To24999USDollar + - Count_HousingUnit_HomeValue25000To29999USDollar + - Count_HousingUnit_HomeValue30000To34999USDollar + - Count_HousingUnit_HomeValue35000To39999USDollar + - Count_HousingUnit_HomeValue40000To49999USDollar + skip_all_sources_present_check: true + - ancestor_sv_id: Count_HousingUnit_HomeValue50000To99999USDollar + source_sv_ids: + - Count_HousingUnit_HomeValue50000To59999USDollar + - Count_HousingUnit_HomeValue60000To69999USDollar + - Count_HousingUnit_HomeValue70000To79999USDollar + - Count_HousingUnit_HomeValue80000To89999USDollar + - Count_HousingUnit_HomeValue90000To99999USDollar + skip_all_sources_present_check: true + - ancestor_sv_id: Count_HousingUnit_HomeValue100000To199999USDollar + source_sv_ids: + - Count_HousingUnit_HomeValue100000To124999USDollar + - Count_HousingUnit_HomeValue125000To149999USDollar + - Count_HousingUnit_HomeValue150000To174999USDollar + - Count_HousingUnit_HomeValue175000To199999USDollar + skip_all_sources_present_check: true + - ancestor_sv_id: Count_HousingUnit_HomeValue200000To299999USDollar + source_sv_ids: + - Count_HousingUnit_HomeValue200000To249999USDollar + - Count_HousingUnit_HomeValue250000To299999USDollar + skip_all_sources_present_check: true + - ancestor_sv_id: Count_HousingUnit_HomeValue300000To499999USDollar + source_sv_ids: + - Count_HousingUnit_HomeValue300000To399999USDollar + - Count_HousingUnit_HomeValue400000To499999USDollar + skip_all_sources_present_check: true 
+ - ancestor_sv_id: Count_HousingUnit_HomeValue500000To999999USDollar + source_sv_ids: + - Count_HousingUnit_HomeValue500000To749999USDollar + - Count_HousingUnit_HomeValue750000To999999USDollar + skip_all_sources_present_check: true + + - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_Person_Age_StatVarAgg + input_imports: + - CensusACS5YearSurvey + - CensusACS5YearSurvey_AggCountry + stat_var_aggregation: + aggregations: + # Census ACS 5 year, Age group + - ancestor_sv_id: Count_Person_55To64Years + source_sv_ids: + - Count_Person_55To59Years + - Count_Person_60To61Years + - Count_Person_62To64Years + skip_all_sources_present_check: true + + - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_Income_StatVarAgg + input_imports: + - CensusACS5YearSurvey + - CensusACS5YearSurvey_AggCountry + stat_var_aggregation: + aggregations: + # The following 9 aggregations are: High income (100,000 USD or More), by race. + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar100000Onwards_WithIncome_AmericanIndianOrAlaskaNativeAlone + source_sv_ids: + # Population: Female, 100,000 USD or More, American Indian or Alaska Native Alone, Not Worked Full Time + - dc/bdl1bpzj3klq4 + # Population: Female, 100,000 USD or More, American Indian or Alaska Native Alone, Worked Full Time + - dc/8v29dc0442ljg + # Population: Male, 100,000 USD or More, American Indian or Alaska Native Alone, Not Worked Full Time + - dc/4130fr4llf992 + # Population: Male, 100,000 USD or More, American Indian or Alaska Native Alone, Worked Full Time + - dc/ehgx6y0qyncl2 + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar100000Onwards_WithIncome_AsianAlone + source_sv_ids: + # Population: Female, 100,000 USD or More, Asian Alone, Not Worked Full Time + - dc/5zrn0w3nwe9e2 + # Population: Female, 100,000 USD or More, Asian Alone, Worked Full Time + - dc/sk4nqnbdskvhg + # Population: Male, 100,000 USD or More, Asian Alone, Not Worked Full Time + - dc/w4xl5kcbyjmnb + # Population: 
Male, 100,000 USD or More, Asian Alone, Worked Full Time + - dc/68zc6qjdtxllf + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar100000Onwards_WithIncome_BlackOrAfricanAmericanAlone + source_sv_ids: + # Population: Female, 100,000 USD or More, Black or African American Alone, Not Worked Full Time + - dc/hndxvvnzf2yr1 + # Population: Female, 100,000 USD or More, Black or African American Alone, Worked Full Time + - dc/vf30v7c1t5lz5 + # Population: Male, 100,000 USD or More, Black or African American Alone, Not Worked Full Time + - dc/nm3bftsv2z2w2 + # Population: Male, 100,000 USD or More, Black or African American Alone, Worked Full Time + - dc/whevye7y9vg1c + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar100000Onwards_WithIncome_HispanicOrLatino + source_sv_ids: + # Population: Female, 100,000 USD or More, Hispanic or Latino, Not Worked Full Time + - dc/nwzb4v1hp2jw4 + # Population: Female, 100,000 USD or More, Hispanic or Latino, Worked Full Time + - dc/w5geg8s1lmb04 + # Population: Male, 100,000 USD or More, Hispanic or Latino, Not Worked Full Time + - dc/xdrmxlpjlcrrb + # Population: Male, 100,000 USD or More, Hispanic or Latino, Worked Full Time + - dc/espye40zdmhgg + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar100000Onwards_WithIncome_NativeHawaiianOrOtherPacificIslanderAlone + source_sv_ids: + # Population: Female, 100,000 USD or More, Native Hawaiian or Other Pacific Islander Alone, Not Worked Full Time + - dc/8bwnnflm9kwbg + # Population: Female, 100,000 USD or More, Native Hawaiian or Other Pacific Islander Alone, Worked Full Time + - dc/4r5pvjszn5x03 + # Population: Male, 100,000 USD or More, Native Hawaiian or Other Pacific Islander Alone, Not Worked Full Time + - dc/mx7edmkkrxvl9 + # Population: Male, 100,000 USD or More, Native Hawaiian or Other Pacific Islander Alone, Worked Full Time + - dc/ce7frpwyfpx85 + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar100000Onwards_WithIncome_SomeOtherRaceAlone + source_sv_ids: + # 
Population: Female, 100,000 USD or More, Some Other Race Alone, Not Worked Full Time + - dc/y70ppl4rxjhjh + # Population: Female, 100,000 USD or More, Some Other Race Alone, Worked Full Time + - dc/3zq436nrf2d66 + # Population: Male, 100,000 USD or More, Some Other Race Alone, Not Worked Full Time + - dc/pgrfnkfnswr5d + # Population: Male, 100,000 USD or More, Some Other Race Alone, Worked Full Time + - dc/cmnkc5fpvpfk9 + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar100000Onwards_WithIncome_TwoOrMoreRaces + source_sv_ids: + # Population: Female, 100,000 USD or More, Two or More Races, Not Worked Full Time + - dc/8lffdmtgqv269 + # Population: Female, 100,000 USD or More, Two or More Races, Worked Full Time + - dc/n4z306vm3zdm6 + # Population: Male, 100,000 USD or More, Two or More Races, Not Worked Full Time + - dc/yr2g2ct310d43 + # Population: Male, 100,000 USD or More, Two or More Races, Worked Full Time + - dc/em5ysxsf96xb8 + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar100000Onwards_WithIncome_WhiteAlone + source_sv_ids: + # Population: Female, 100,000 USD or More, White Alone, Not Worked Full Time + - dc/kd7qcdzs2q8p4 + # Population: Female, 100,000 USD or More, White Alone, Worked Full Time + - dc/gjkbs2qeb9m0g + # Population: Male, 100,000 USD or More, White Alone, Not Worked Full Time + - dc/dhjfv19drrr9 + # Population: Male, 100,000 USD or More, White Alone, Worked Full Time + - dc/z76nk40e6qtn8 + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar100000Onwards_WithIncome_WhiteAloneNotHispanicOrLatino + source_sv_ids: + # Population: Female, 100,000 USD or More, White Alone Not Hispanic or Latino, Not Worked Full Time + - dc/elr21hp41d4jh + # Population: Female, 100,000 USD or More, White Alone Not Hispanic or Latino, Worked Full Time + - dc/cqc4zh1dg8s6d + # Population: Male, 100,000 USD or More, White Alone Not Hispanic or Latino, Not Worked Full Time + - dc/7g2mm704kfq63 + # Population: Male, 100,000 USD or More, White Alone Not 
Hispanic or Latino, Worked Full Time + - dc/ekh1g39v9sgj4 + + # The following 9 aggregations are: Low income (10,000 - 12,499 USD), by race. + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar10000To12499_WithIncome_AmericanIndianOrAlaskaNativeAlone + source_sv_ids: + # Population: Female, 10,000 - 12,499 USD, American Indian or Alaska Native Alone, Not Worked Full Time + - dc/g1lqz36qc6x26 + # Population: Female, 10,000 - 12,499 USD, American Indian or Alaska Native Alone, Worked Full Time + - dc/nfj1c6f5xvjpf + # Population: Male, 10,000 - 12,499 USD, American Indian or Alaska Native Alone, Not Worked Full Time + - dc/1cytg38zh4kmb + # Population: Male, 10,000 - 12,499 USD, American Indian or Alaska Native Alone, Worked Full Time + - dc/kvpkg8xd66q1d + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar10000To12499_WithIncome_AsianAlone + source_sv_ids: + # Population: Female, 10,000 - 12,499 USD, Asian Alone, Not Worked Full Time + - dc/rkylr8pzrhvrb + # Population: Female, 10,000 - 12,499 USD, Asian Alone, Worked Full Time + - dc/ecz488knjvddf + # Population: Male, 10,000 - 12,499 USD, Asian Alone, Not Worked Full Time + - dc/l481yty608g4 + # Population: Male, 10,000 - 12,499 USD, Asian Alone, Worked Full Time + - dc/f6psb62vxhcgb + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar10000To12499_WithIncome_BlackOrAfricanAmericanAlone + source_sv_ids: + # Population: Female, 10,000 - 12,499 USD, Black or African American Alone, Not Worked Full Time + - dc/zvn4y702wsxe3 + # Population: Female, 10,000 - 12,499 USD, Black or African American Alone, Worked Full Time + - dc/6jgshc72wscd7 + # Population: Male, 10,000 - 12,499 USD, Black or African American Alone, Not Worked Full Time + - dc/h255gx042f7y6 + # Population: Male, 10,000 - 12,499 USD, Black or African American Alone, Worked Full Time + - dc/pd0td6p7wbgvc + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar10000To12499_WithIncome_HispanicOrLatino + source_sv_ids: + # Population: Female, 10,000 - 
12,499 USD, Hispanic or Latino, Not Worked Full Time + - dc/p5ewnb0xtrrp + # Population: Female, 10,000 - 12,499 USD, Hispanic or Latino, Worked Full Time + - dc/7ev9gs9ppxbq2 + # Population: Male, 10,000 - 12,499 USD, Hispanic or Latino, Not Worked Full Time + - dc/t5d500j7754qb + # Population: Male, 10,000 - 12,499 USD, Hispanic or Latino, Worked Full Time + - dc/x2p3338cck7vg + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar10000To12499_WithIncome_NativeHawaiianOrOtherPacificIslanderAlone + source_sv_ids: + # Population: Female, 10,000 - 12,499 USD, Native Hawaiian or Other Pacific Islander Alone, Not Worked Full Time + - dc/xbq05lkgsx8b6 + # Population: Female, 10,000 - 12,499 USD, Native Hawaiian or Other Pacific Islander Alone, Worked Full Time + - dc/nxqjjpwkkxpx4 + # Population: Male, 10,000 - 12,499 USD, Native Hawaiian or Other Pacific Islander Alone, Not Worked Full Time + - dc/bd5nldln2dxng + # Population: Male, 10,000 - 12,499 USD, Native Hawaiian or Other Pacific Islander Alone, Worked Full Time + - dc/rgme2je6kbj27 + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar10000To12499_WithIncome_SomeOtherRaceAlone + source_sv_ids: + # Population: Female, 10,000 - 12,499 USD, Some Other Race Alone, Not Worked Full Time + - dc/8ws12v06y0sqh + # Population: Female, 10,000 - 12,499 USD, Some Other Race Alone, Worked Full Time + - dc/43pxcsbgjksmg + # Population: Male, 10,000 - 12,499 USD, Some Other Race Alone, Not Worked Full Time + - dc/lj0q1zj5d8df + # Population: Male, 10,000 - 12,499 USD, Some Other Race Alone, Worked Full Time + - dc/sy61mbcgrsz73 + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar10000To12499_WithIncome_TwoOrMoreRaces + source_sv_ids: + # Population: Female, 10,000 - 12,499 USD, Two or More Races, Not Worked Full Time + - dc/see1v777j4807 + # Population: Female, 10,000 - 12,499 USD, Two or More Races, Worked Full Time + - dc/422zpgm23b2m7 + # Population: Male, 10,000 - 12,499 USD, Two or More Races, Not Worked Full Time + - 
dc/6k4k59zwbgw92 + # Population: Male, 10,000 - 12,499 USD, Two or More Races, Worked Full Time + - dc/qdrpgr3zby8k + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar10000To12499_WithIncome_WhiteAlone + source_sv_ids: + # Population: Female, 10,000 - 12,499 USD, White Alone, Not Worked Full Time + - dc/c42c6rmszjdv7 + # Population: Female, 10,000 - 12,499 USD, White Alone, Worked Full Time + - dc/bpzm3y2rxt7y9 + # Population: Male, 10,000 - 12,499 USD, White Alone, Not Worked Full Time + - dc/kv7x6r2cqyxcf + # Population: Male, 10,000 - 12,499 USD, White Alone, Worked Full Time + - dc/xzg5t6eq4pnph + - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar10000To12499_WithIncome_WhiteAloneNotHispanicOrLatino + source_sv_ids: + # Population: Female, 10,000 - 12,499 USD, White Alone Not Hispanic or Latino, Not Worked Full Time + - dc/t6b45v6bsdf1 + # Population: Female, 10,000 - 12,499 USD, White Alone Not Hispanic or Latino, Worked Full Time + - dc/j8dgk3864jpbd + # Population: Male, 10,000 - 12,499 USD, White Alone Not Hispanic or Latino, Not Worked Full Time + - dc/bv5sd8hrhs82h + # Population: Male, 10,000 - 12,499 USD, White Alone Not Hispanic or Latino, Worked Full Time + - dc/5jp07brw3g26h + + - type: STAT_VAR_AGGREGATION + output_import: IndiaNSS_HealthAilments_StatVarAgg + input_imports: + - IndiaNSS_HealthAilments + stat_var_aggregation: + aggregations: + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionBloodDisease_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionBloodDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionBloodDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionBloodDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionBloodDisease_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_5To14Years_Female_ConditionBloodDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionBloodDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Female_ConditionBloodDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionBloodDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_0To4Years_Male_ConditionBloodDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Male_ConditionBloodDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionBloodDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Male_ConditionBloodDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Male_ConditionBloodDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionBloodDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionBloodDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionBloodDisease_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionCancer_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionCancer_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionCancer_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionCancer_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionCancer_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Female_ConditionCancer_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionCancer_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_60To69Years_Female_ConditionCancer_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionCancer_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_0To4Years_Male_ConditionCancer_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Male_ConditionCancer_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionCancer_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Male_ConditionCancer_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Male_ConditionCancer_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionCancer_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionCancer_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionCancer_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Female_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Female_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_0To4Years_Male_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Male_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Male_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Male_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionCardiovascularDisease_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionEarCondition_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionEarCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionEarCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionEarCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionEarCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Female_ConditionEarCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionEarCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Female_ConditionEarCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionEarCondition_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_0To4Years_Male_ConditionEarCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Male_ConditionEarCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionEarCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Male_ConditionEarCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Male_ConditionEarCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionEarCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionEarCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionEarCondition_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Female_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Female_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_0To4Years_Male_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Male_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Male_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Male_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionEndocrineMetabolicCondition_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionEyeCondition_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionEyeCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionEyeCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionEyeCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionEyeCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Female_ConditionEyeCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionEyeCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Female_ConditionEyeCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionEyeCondition_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_0To4Years_Male_ConditionEyeCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Male_ConditionEyeCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionEyeCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Male_ConditionEyeCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Male_ConditionEyeCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionEyeCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionEyeCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionEyeCondition_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Female_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Female_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_0To4Years_Male_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Male_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Male_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Male_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionGastroIntestinalCondition_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Female_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Female_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_0To4Years_Male_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Male_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Male_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Male_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionGenitoUrinaryCondition_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionInfection_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionInfection_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionInfection_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionInfection_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionInfection_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Female_ConditionInfection_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionInfection_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Female_ConditionInfection_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionInfection_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_0To4Years_Male_ConditionInfection_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_15To29Years_Male_ConditionInfection_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionInfection_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Male_ConditionInfection_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Male_ConditionInfection_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionInfection_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionInfection_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionInfection_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionInjury_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionInjury_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionInjury_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionInjury_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionInjury_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Female_ConditionInjury_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionInjury_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Female_ConditionInjury_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionInjury_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_0To4Years_Male_ConditionInjury_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Male_ConditionInjury_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionInjury_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Male_ConditionInjury_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_5To14Years_Male_ConditionInjury_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionInjury_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionInjury_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionInjury_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Female_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Female_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_0To4Years_Male_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Male_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Male_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_5To14Years_Male_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionMusculoSkeletalCondition_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Female_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Female_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_0To4Years_Male_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Male_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Male_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_5To14Years_Male_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionNSSHealthOtherCondition_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionObstetricCondition_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionObstetricCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionObstetricCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionObstetricCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionObstetricCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Female_ConditionObstetricCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionObstetricCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Female_ConditionObstetricCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionObstetricCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_0To4Years_Male_ConditionObstetricCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Male_ConditionObstetricCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionObstetricCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Male_ConditionObstetricCondition_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_5To14Years_Male_ConditionObstetricCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionObstetricCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionObstetricCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionObstetricCondition_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Female_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Female_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_0To4Years_Male_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Male_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_45To59Years_Male_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Male_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionPsychiatricNeurologicalCondition_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Female_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Female_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_0To4Years_Male_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Male_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_45To59Years_Male_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Male_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionRespiratoryDisease_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + - ancestor_sv_id: Count_MedicalConditionIncident_ConditionSkinDisease_AsAFractionOf_Count_Person + source_sv_ids: + - Count_MedicalConditionIncident_0To4Years_Female_ConditionSkinDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Female_ConditionSkinDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Female_ConditionSkinDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Female_ConditionSkinDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_5To14Years_Female_ConditionSkinDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Female_ConditionSkinDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Female_ConditionSkinDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Female_ConditionSkinDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_0To4Years_Male_ConditionSkinDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_15To29Years_Male_ConditionSkinDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_30To44Years_Male_ConditionSkinDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_45To59Years_Male_ConditionSkinDisease_AsAFractionOf_Count_Person + - 
Count_MedicalConditionIncident_5To14Years_Male_ConditionSkinDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60OrMoreYears_Male_ConditionSkinDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_60To69Years_Male_ConditionSkinDisease_AsAFractionOf_Count_Person + - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionSkinDisease_AsAFractionOf_Count_Person + skip_all_sources_present_check: true + + - type: STAT_VAR_AGGREGATION + output_import: OECDRegionalDemography_Person_Age_StatVarAgg + input_imports: + - OECDRegionalDemography_Population + stat_var_aggregation: + aggregations: + # OECD, Age group + - ancestor_sv_id: Count_Person_Upto9Years + source_sv_ids: + - Count_Person_Upto4Years + - Count_Person_5To9Years + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_10To19Years + source_sv_ids: + - Count_Person_10To14Years + - Count_Person_15To19Years + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_20To29Years + source_sv_ids: + - Count_Person_20To24Years + - Count_Person_25To29Years + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_30To39Years + source_sv_ids: + - Count_Person_30To34Years + - Count_Person_35To39Years + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_40To49Years + source_sv_ids: + - Count_Person_40To44Years + - Count_Person_45To49Years + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_50To59Years + source_sv_ids: + - Count_Person_50To54Years + - Count_Person_55To59Years + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_60To69Years + source_sv_ids: + - Count_Person_60To64Years + - Count_Person_65To69Years + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_70OrMoreYears + source_sv_ids: + - Count_Person_70To74Years + - Count_Person_75To79Years + - Count_Person_80OrMoreYears + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_5To19Years + 
source_sv_ids: + - Count_Person_5To9Years + - Count_Person_10To14Years + - Count_Person_15To19Years + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_25To34Years + source_sv_ids: + - Count_Person_25To29Years + - Count_Person_30To34Years + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_35To44Years + source_sv_ids: + - Count_Person_35To39Years + - Count_Person_40To44Years + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_45To54Years + source_sv_ids: + - Count_Person_45To49Years + - Count_Person_50To54Years + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_55To64Years + source_sv_ids: + - Count_Person_55To59Years + - Count_Person_60To64Years + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_65To74Years + source_sv_ids: + - Count_Person_65To69Years + - Count_Person_70To74Years + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_75OrMoreYears + source_sv_ids: + - Count_Person_75To79Years + - Count_Person_80OrMoreYears + skip_all_sources_present_check: true + + - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_SubjectTables_S0801_StatVarAgg + input_imports: + - CensusACS5YearSurvey_SubjectTables_S0801 + stat_var_aggregation: + aggregations: + # Census ACS 5 year, Work Commute group + - ancestor_sv_id: Count_Person_Years16Onwards_WorkCommute_Employed + source_sv_ids: + - Count_Person_Years16Onwards_CarTruckOrVan_WorkCommute_Employed + - Count_Person_Years16Onwards_DroveAlone_WorkCommute_Employed + - Count_Person_Years16Onwards_CarTruckOrVanCarpooled_WorkCommute_Employed + - Count_Person_Years16Onwards_CarpoolOccupancy2Person_CarTruckOrVanCarpooled_WorkCommute_Employed + - Count_Person_Years16Onwards_CarpoolOccupancy3Person_CarTruckOrVanCarpooled_WorkCommute_Employed + - Count_Person_Years16Onwards_CarpoolOccupancy4OrMorePerson_CarTruckOrVanCarpooled_WorkCommute_Employed + - 
Count_Person_Years16Onwards_PublicTransportationExcludingTaxicab_WorkCommute_Employed + - Count_Person_Years16Onwards_Walked_WorkCommute_Employed + - Count_Person_Years16Onwards_Bicycle_WorkCommute_Employed + - Count_Person_Years16Onwards_TaxicabMotorcycleBicycleOrOtherMeans_WorkCommute_Employed + - Count_Person_Years16Onwards_WorkedAtHome_WorkCommute_Employed + - Count_Person_Years16Onwards_WorkedFromHome_WorkCommute_Employed + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_Years16Onwards_WorkCommute_Employed_Male + source_sv_ids: + - Count_Person_Years16Onwards_CarTruckOrVan_WorkCommute_Employed_Male + - Count_Person_Years16Onwards_DroveAlone_WorkCommute_Employed_Male + - Count_Person_Years16Onwards_CarTruckOrVanCarpooled_WorkCommute_Employed_Male + - Count_Person_Years16Onwards_CarpoolOccupancy2Person_CarTruckOrVanCarpooled_WorkCommute_Employed_Male + - Count_Person_Years16Onwards_CarpoolOccupancy3Person_CarTruckOrVanCarpooled_WorkCommute_Employed_Male + - Count_Person_Years16Onwards_CarpoolOccupancy4OrMorePerson_CarTruckOrVanCarpooled_WorkCommute_Employed_Male + - Count_Person_Years16Onwards_PublicTransportationExcludingTaxicab_WorkCommute_Employed_Male + - Count_Person_Years16Onwards_Walked_WorkCommute_Employed_Male + - Count_Person_Years16Onwards_Bicycle_WorkCommute_Employed_Male + - Count_Person_Years16Onwards_TaxicabMotorcycleBicycleOrOtherMeans_WorkCommute_Employed_Male + - Count_Person_Years16Onwards_WorkedAtHome_WorkCommute_Employed_Male + - Count_Person_Years16Onwards_WorkedFromHome_WorkCommute_Employed_Male + skip_all_sources_present_check: true + - ancestor_sv_id: Count_Person_Years16Onwards_WorkCommute_Employed_Female + source_sv_ids: + - Count_Person_Years16Onwards_CarTruckOrVan_WorkCommute_Employed_Female + - Count_Person_Years16Onwards_DroveAlone_WorkCommute_Employed_Female + - Count_Person_Years16Onwards_CarTruckOrVanCarpooled_WorkCommute_Employed_Female + - 
Count_Person_Years16Onwards_CarpoolOccupancy2Person_CarTruckOrVanCarpooled_WorkCommute_Employed_Female + - Count_Person_Years16Onwards_CarpoolOccupancy3Person_CarTruckOrVanCarpooled_WorkCommute_Employed_Female + - Count_Person_Years16Onwards_CarpoolOccupancy4OrMorePerson_CarTruckOrVanCarpooled_WorkCommute_Employed_Female + - Count_Person_Years16Onwards_PublicTransportationExcludingTaxicab_WorkCommute_Employed_Female + - Count_Person_Years16Onwards_Walked_WorkCommute_Employed_Female + - Count_Person_Years16Onwards_Bicycle_WorkCommute_Employed_Female + - Count_Person_Years16Onwards_TaxicabMotorcycleBicycleOrOtherMeans_WorkCommute_Employed_Female + - Count_Person_Years16Onwards_WorkedAtHome_WorkCommute_Employed_Female + - Count_Person_Years16Onwards_WorkedFromHome_WorkCommute_Employed_Female + skip_all_sources_present_check: true + # Worked outside of Home + - ancestor_sv_id: Count_Person_Years16Onwards_WorkCommute_Employed_WorkedOutsideOfHome + source_sv_ids: + - Count_Person_Years16Onwards_StartTimeHour0000To0459_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0500To0529_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0530To0559_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0600To0629_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0630To0659_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0700To0729_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0730To0759_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0800To0829_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0830To0859_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0900To2359_WorkCommute_Employed_WorkedOutsideOfHome + - 
Count_Person_Years16Onwards_10OrLessMinute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_10To14Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_15To19Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_20To24Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_25To29Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_30To34Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_35To44Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_45To59Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_60OrMoreMinute_WorkCommute_Employed_WorkedOutsideOfHome + - ancestor_sv_id: Count_Person_Years16Onwards_WorkCommute_Employed_Male_WorkedOutsideOfHome + source_sv_ids: + - Count_Person_Years16Onwards_StartTimeHour0000To0459_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0500To0529_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0530To0559_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0600To0629_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0630To0659_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0700To0729_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0730To0759_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0800To0829_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0830To0859_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0900To2359_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_10OrLessMinute_WorkCommute_Employed_Male_WorkedOutsideOfHome + 
- Count_Person_Years16Onwards_10To14Minute_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_15To19Minute_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_20To24Minute_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_25To29Minute_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_30To34Minute_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_35To44Minute_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_45To59Minute_WorkCommute_Employed_Male_WorkedOutsideOfHome + - Count_Person_Years16Onwards_60OrMoreMinute_WorkCommute_Employed_Male_WorkedOutsideOfHome + - ancestor_sv_id: Count_Person_Years16Onwards_WorkCommute_Employed_Female_WorkedOutsideOfHome + source_sv_ids: + - Count_Person_Years16Onwards_StartTimeHour0000To0459_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0500To0529_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0530To0559_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0600To0629_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0630To0659_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0700To0729_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0730To0759_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0800To0829_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0830To0859_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_StartTimeHour0900To2359_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_10OrLessMinute_WorkCommute_Employed_Female_WorkedOutsideOfHome + - 
Count_Person_Years16Onwards_10To14Minute_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_15To19Minute_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_20To24Minute_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_25To29Minute_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_30To34Minute_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_35To44Minute_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_45To59Minute_WorkCommute_Employed_Female_WorkedOutsideOfHome + - Count_Person_Years16Onwards_60OrMoreMinute_WorkCommute_Employed_Female_WorkedOutsideOfHome + - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_SubjectTables_S0804_StatVarAgg + input_imports: + - CensusACS5YearSurvey_SubjectTables_S0804 + stat_var_aggregation: + aggregations: + - ancestor_sv_id: Count_Person_Years16Onwards_PublicTransportationExcludingTaxicab_WorkCommute_Employed_WorkedOutsideOfHome + source_sv_ids: + - Count_Person_Years16Onwards_PublicTransportationExcludingTaxicab_10OrLessMinute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_PublicTransportationExcludingTaxicab_10To14Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_PublicTransportationExcludingTaxicab_15To19Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_PublicTransportationExcludingTaxicab_20To24Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_PublicTransportationExcludingTaxicab_25To29Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_PublicTransportationExcludingTaxicab_30To34Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_PublicTransportationExcludingTaxicab_35To44Minute_WorkCommute_Employed_WorkedOutsideOfHome + - 
Count_Person_Years16Onwards_PublicTransportationExcludingTaxicab_45To59Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_PublicTransportationExcludingTaxicab_60OrMoreMinute_WorkCommute_Employed_WorkedOutsideOfHome + - ancestor_sv_id: Count_Person_Years16Onwards_CarTruckOrVanDroveAlone_WorkCommute_Employed_WorkedOutsideOfHome + source_sv_ids: + - Count_Person_Years16Onwards_CarTruckOrVanDroveAlone_10OrLessMinute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_CarTruckOrVanDroveAlone_10To14Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_CarTruckOrVanDroveAlone_15To19Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_CarTruckOrVanDroveAlone_20To24Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_CarTruckOrVanDroveAlone_25To29Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_CarTruckOrVanDroveAlone_30To34Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_CarTruckOrVanDroveAlone_35To44Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_CarTruckOrVanDroveAlone_45To59Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_CarTruckOrVanDroveAlone_60OrMoreMinute_WorkCommute_Employed_WorkedOutsideOfHome + - ancestor_sv_id: Count_Person_Years16Onwards_CarTruckOrVanCarpooled_WorkCommute_Employed_WorkedOutsideOfHome + source_sv_ids: + - Count_Person_Years16Onwards_CarTruckOrVanCarpooled_10OrLessMinute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_CarTruckOrVanCarpooled_10To14Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_CarTruckOrVanCarpooled_15To19Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_CarTruckOrVanCarpooled_25To29Minute_WorkCommute_Employed_WorkedOutsideOfHome + - 
Count_Person_Years16Onwards_CarTruckOrVanCarpooled_20To24Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_CarTruckOrVanCarpooled_30To34Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_CarTruckOrVanCarpooled_35To44Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_CarTruckOrVanCarpooled_45To59Minute_WorkCommute_Employed_WorkedOutsideOfHome + - Count_Person_Years16Onwards_CarTruckOrVanCarpooled_60OrMoreMinute_WorkCommute_Employed_WorkedOutsideOfHome + - type: STAT_VAR_AGGREGATION + output_import: India_RBIStateDomesticProduct_StatVarAgg + input_imports: + - India_RBIStateDomesticProduct + stat_var_aggregation: + aggregations: + - ancestor_sv_id: Nominal_Amount_EconomicActivity_GrossValueAdded + source_sv_ids: + - Nominal_Amount_EconomicActivity_GrossValueAdded_Agriculture + - Nominal_Amount_EconomicActivity_GrossValueAdded_BankingAndInsuranceSector + - Nominal_Amount_EconomicActivity_GrossValueAdded_Construction + - Nominal_Amount_EconomicActivity_GrossValueAdded_Industry + - Nominal_Amount_EconomicActivity_GrossValueAdded_ManufacturingSector + - Nominal_Amount_EconomicActivity_GrossValueAdded_Services + - ancestor_sv_id: Nominal_Amount_EconomicActivity_NetValueAdded + source_sv_ids: + - Nominal_Amount_EconomicActivity_NetValueAdded_Agriculture + - Nominal_Amount_EconomicActivity_NetValueAdded_Construction + - Nominal_Amount_EconomicActivity_NetValueAdded_Industry + - Nominal_Amount_EconomicActivity_NetValueAdded_ManufacturingSector + - Nominal_Amount_EconomicActivity_NetValueAdded_Services + - ancestor_sv_id: Amount_EconomicActivity_GrossValueAdded_RealValue + source_sv_ids: + - RealValue_Amount_EconomicActivity_GrossValueAdded_Agriculture + - RealValue_Amount_EconomicActivity_GrossValueAdded_BankingAndInsuranceSector + - RealValue_Amount_EconomicActivity_GrossValueAdded_Construction + - RealValue_Amount_EconomicActivity_GrossValueAdded_Industry + - 
RealValue_Amount_EconomicActivity_GrossValueAdded_ManufacturingSector + - RealValue_Amount_EconomicActivity_GrossValueAdded_Services + - ancestor_sv_id: RealValue_Amount_EconomicActivity_NetValueAdded + source_sv_ids: + - RealValue_Amount_EconomicActivity_NetValueAdded_Agriculture + - RealValue_Amount_EconomicActivity_NetValueAdded_Construction + - RealValue_Amount_EconomicActivity_NetValueAdded_Industry + - RealValue_Amount_EconomicActivity_NetValueAdded_ManufacturingSector + - RealValue_Amount_EconomicActivity_NetValueAdded_Services diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml new file mode 100644 index 000000000..5fd320fb9 --- /dev/null +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml @@ -0,0 +1,324 @@ +calculations: + +# Energy. + - type: STAT_VAR_CALCULATION + input_imports: + - EPA_GHGRP_AggCounty + - EPA_GHGRP_AggCensusZipCodeTabulationArea + - EPA_GHGRP_AggCounty_AggState + - EIA_Electricity + - UNEnergy + output_import: Energy_StatVarCalculation + stat_var_calculation: + calculations: + - input1: + sv_regex: Annual_Emissions_GreenhouseGas_NonBiogenic + measurement_method_regex: dcAggregate/EPA_GHGRP + facet_info: + unit: MetricTonCO2e + observation_period: P1Y + input2: + sv_regex: Annual_Generation_Electricity + facet_info: + unit: GigawattHour + operation: DIVIDE + output: + sv: Annual_Emissions_GreenhouseGas_NonBiogenic_Per_Annual_Generation_Electricity + measurement_method: EPA_GHGRP_EIA_Electricity + facet_info: + unit: MetricTonCO2ePerGigawattHour + observation_period: P1Y + +# Climate: Temperature modeling diffs from actual. +# There are 30+ models. For each model, the output diffs are done for +# Aggregations: Mean, Min, Max and also for SSP2 with RCP4.5 and SSP5 with +# RCP8.5 (for each model). +# This includes diffs for P1M, P1Y, P5Y and P10Y. 
+# Note that the computations below use sv_regex and measurement_method_regex +# instead of sv and measurement_method to allow more expressive matching. Also +# note that the output uses sv_prefix and measurement_method_prefix which is not +# the case above, for example. Whenever sv and measurement_method are set in the +# output, they are preferred. Otherwise, the *_prefix is used. + - type: STAT_VAR_CALCULATION + input_imports: + - NASA_NEXGDDP_CMIP6_Subnational + - NASA_NEXGDDP_CMIP6_IpccPlaces50 + - NOAA_EPA_Observed_Historical_Weather + - NASA_NEXDCP30 + - Copernicus_ECMWF_ERA5_Monthly + - Copernicus_ECMWF_ERA5_Monthly_AggrYearsStats + - NASA_NEXDCP30_AggrYearsStats + - NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrYearsStats + - NASA_NEXGDDP_CMIP6_Subnational_AggrYearsStats + - NOAA_EPA_Observed_Historical_Weather_AggrYearsStats + output_import: Climate_StatVarCalculation + stat_var_calculation: + calculations: + - input1: + sv_regex: "^Temperature(_SSP[0-9]+)*$" + measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" + facet_info: + unit: Celsius + observation_period: P1M + input2: + sv_regex: ^Mean_Temperature$ + measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + facet_info: + unit: Celsius + observation_period: P1M + operation: SUBTRACT + output: + sv_prefix: DifferenceRelativeToObservationalData_ + measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + facet_info: + unit: Celsius + observation_period: P1M + + calculations: + - input1: + sv_regex: "^Min_Temperature(_SSP[0-9]+)*$" + measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" + facet_info: + unit: Celsius + observation_period: P1M + input2: + sv_regex: ^Min_Temperature$ + measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + facet_info: + unit: Celsius + observation_period: P1M + operation: SUBTRACT + output: + sv_prefix: DifferenceRelativeToObservationalData_ + measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + facet_info: + unit: Celsius + 
observation_period: P1M + + calculations: + - input1: + sv_regex: "^Max_Temperature(_SSP[0-9]+)*$" + measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" + facet_info: + unit: Celsius + observation_period: P1M + input2: + sv_regex: ^Max_Temperature$ + measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + facet_info: + unit: Celsius + observation_period: P1M + operation: SUBTRACT + output: + sv_prefix: DifferenceRelativeToObservationalData_ + measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + facet_info: + unit: Celsius + observation_period: P1M + + calculations: + - input1: + sv_regex: "^Temperature(_SSP[0-9]+)*$" + measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" + facet_info: + unit: Celsius + observation_period: P1Y + import_name_regex: ".*AggrYearsStats$" + input2: + sv_regex: ^Mean_Temperature$ + measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + facet_info: + unit: Celsius + observation_period: P1Y + import_name_regex: ".*AggrYearsStats$" + operation: SUBTRACT + output: + sv_prefix: DifferenceRelativeToObservationalData_ + measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + facet_info: + unit: Celsius + observation_period: P1Y + + calculations: + - input1: + sv_regex: "^Min_Temperature(_SSP[0-9]+)*$" + measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" + facet_info: + unit: Celsius + observation_period: P1Y + import_name_regex: ".*AggrYearsStats$" + input2: + sv_regex: ^Min_Temperature$ + measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + facet_info: + unit: Celsius + observation_period: P1Y + import_name_regex: ".*AggrYearsStats$" + operation: SUBTRACT + output: + sv_prefix: DifferenceRelativeToObservationalData_ + measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + facet_info: + unit: Celsius + observation_period: P1Y + + calculations: + - input1: + sv_regex: "^Max_Temperature(_SSP[0-9]+)*$" + measurement_method_regex: 
"^dcAggregate/NASA_Mean_CMIP6_.*" + facet_info: + unit: Celsius + observation_period: P1Y + import_name_regex: ".*AggrYearsStats$" + input2: + sv_regex: ^Max_Temperature$ + measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + facet_info: + unit: Celsius + observation_period: P1Y + import_name_regex: ".*AggrYearsStats$" + operation: SUBTRACT + output: + sv_prefix: DifferenceRelativeToObservationalData_ + measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + facet_info: + unit: Celsius + observation_period: P1Y + + calculations: + - input1: + sv_regex: "^Temperature(_SSP[0-9]+)*$" + measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" + facet_info: + unit: Celsius + observation_period: P5Y + import_name_regex: ".*AggrYearsStats$" + input2: + sv_regex: ^Mean_Temperature$ + measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + facet_info: + unit: Celsius + observation_period: P5Y + import_name_regex: ".*AggrYearsStats$" + operation: SUBTRACT + output: + sv_prefix: DifferenceRelativeToObservationalData_ + measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + facet_info: + unit: Celsius + observation_period: P5Y + + calculations: + - input1: + sv_regex: "^Min_Temperature(_SSP[0-9]+)*$" + measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" + facet_info: + unit: Celsius + observation_period: P5Y + import_name_regex: ".*AggrYearsStats$" + input2: + sv_regex: ^Min_Temperature$ + measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + facet_info: + unit: Celsius + observation_period: P5Y + import_name_regex: ".*AggrYearsStats$" + operation: SUBTRACT + output: + sv_prefix: DifferenceRelativeToObservationalData_ + measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + facet_info: + unit: Celsius + observation_period: P5Y + + calculations: + - input1: + sv_regex: "^Max_Temperature(_SSP[0-9]+)*$" + measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" + facet_info: + unit: Celsius + 
observation_period: P5Y + import_name_regex: ".*AggrYearsStats$" + input2: + sv_regex: ^Max_Temperature$ + measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + facet_info: + unit: Celsius + observation_period: P5Y + import_name_regex: ".*AggrYearsStats$" + operation: SUBTRACT + output: + sv_prefix: DifferenceRelativeToObservationalData_ + measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + facet_info: + unit: Celsius + observation_period: P5Y + + calculations: + - input1: + sv_regex: "^Temperature(_SSP[0-9]+)*$" + measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" + facet_info: + unit: Celsius + observation_period: P10Y + import_name_regex: ".*AggrYearsStats$" + input2: + sv_regex: ^Mean_Temperature$ + measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + facet_info: + unit: Celsius + observation_period: P10Y + import_name_regex: ".*AggrYearsStats$" + operation: SUBTRACT + output: + sv_prefix: DifferenceRelativeToObservationalData_ + measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + facet_info: + unit: Celsius + observation_period: P10Y + + calculations: + - input1: + sv_regex: "^Min_Temperature(_SSP[0-9]+)*$" + measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" + facet_info: + unit: Celsius + observation_period: P10Y + import_name_regex: ".*AggrYearsStats$" + input2: + sv_regex: ^Min_Temperature$ + measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + facet_info: + unit: Celsius + observation_period: P10Y + import_name_regex: ".*AggrYearsStats$" + operation: SUBTRACT + output: + sv_prefix: DifferenceRelativeToObservationalData_ + measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + facet_info: + unit: Celsius + observation_period: P10Y + + calculations: + - input1: + sv_regex: "^Max_Temperature(_SSP[0-9]+)*$" + measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" + facet_info: + unit: Celsius + observation_period: P10Y + import_name_regex: ".*AggrYearsStats$" + 
input2: + sv_regex: ^Max_Temperature$ + measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + facet_info: + unit: Celsius + observation_period: P10Y + import_name_regex: ".*AggrYearsStats$" + operation: SUBTRACT + output: + sv_prefix: DifferenceRelativeToObservationalData_ + measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + facet_info: + unit: Celsius + observation_period: P10Y diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml new file mode 100644 index 000000000..97c7da2e2 --- /dev/null +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml @@ -0,0 +1,449 @@ +calculations: + - type: STAT_VAR_SERIES_AGGREGATION + input_imports: + - NASA_NEXDCP30 + round: 1 + output_import: NASA_NEXDCP30_AggrDiffStats + stat_var_series_aggregation: + aggr_funcs: + - max_diff_across_measurement_methods: + - diff_relative_to_base_date: + # TODO(b/220070731): Consider adding start_date/end_date spec. + date_specs: + # There are two series in this import: + # 1. Monthly historical data from 1950 to 2005. + # 2. Monthly projections from 2006 to 2099. + dates: + - 1990 + - 2006 + - type: STAT_VAR_SERIES_AGGREGATION + input_imports: + - NASA_NEXGDDP_Subnational + round: 1 + output_import: NASA_NEXGDDP_Subnational_AggrDiffStats + stat_var_series_aggregation: + aggr_funcs: + - max_diff_across_measurement_methods: + - diff_relative_to_base_date: + date_specs: + start_date: 2006 + end_date: 2020 + date_specs: + # There are two series in this import: + # 1. Monthly historical data from 1950 to 2005. + # 2. Monthly projections from 2006 to 2099. 
+ dates: + - 1990 + - 2006 + - type: STAT_VAR_SERIES_AGGREGATION + input_imports: + - NASA_NEXGDDP_Country + round: 1 + output_import: NASA_NEXGDDP_Country_AggrDiffStats + stat_var_series_aggregation: + aggr_funcs: + - max_diff_across_measurement_methods: + - diff_relative_to_base_date: + # TODO(b/220070731): Consider adding start_date/end_date spec. + date_specs: + # There are two series in this import: + # 1. Monthly historical data from 1950 to 2005. + # 2. Monthly projections from 2006 to 2099. + dates: + - 1990 + - 2006 + - type: STAT_VAR_SERIES_AGGREGATION + input_imports: + - NASA_NEXGDDP_CMIP6_Subnational + round: 1 + output_import: NASA_NEXGDDP_CMIP6_Subnational_AggrDiffStats + stat_var_series_aggregation: + aggr_funcs: + - max_diff_across_measurement_methods: + - diff_relative_to_base_date: + date_specs: + start_date: 2015 + end_date: 2020 + date_specs: + # There are four series in this import: + # 1. Monthly historical data from 1950 to 2014. + # 2. Yearly historical data from 1950 to 2014 + # 3. Monthly projections from 2015 to 2100. + # 4. Yearly projections from 2015 to 2100. + dates: + - 1990 + - 2015 + - type: STAT_VAR_SERIES_AGGREGATION + input_imports: + - NASA_NEXGDDP_CMIP6_IpccPlaces50 + round: 1 + output_import: NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrDiffStats + stat_var_series_aggregation: + aggr_funcs: + - max_diff_across_measurement_methods: + - diff_relative_to_base_date: + date_specs: + start_date: 2015 + end_date: 2020 + date_specs: + # There are four series in this import: + # 1. Monthly historical data from 1950 to 2014. + # 2. Yearly historical data from 1950 to 2014 + # 3. Monthly projections from 2015 to 2100. + # 4. Yearly projections from 2015 to 2100. 
+ dates: + - 1990 + - 2015 + - type: STAT_VAR_SERIES_AGGREGATION + input_imports: + - NASA_NEXGDDP_Subnational_AggrDiffStats + round: 2 + output_import: NASA_NEXGDDP_Subnational_AggrStatsAcrossModels + stat_var_series_aggregation: + aggr_funcs: + - stats_across_models: + - type: STAT_VAR_SERIES_AGGREGATION + input_imports: + - NASA_NEXGDDP_CMIP6_Subnational_AggrDiffStats + round: 2 + output_import: NASA_NEXGDDP_CMIP6_Subnational_AggrStatsAcrossModels + stat_var_series_aggregation: + aggr_funcs: + - stats_across_models: + - type: STAT_VAR_SERIES_AGGREGATION + input_imports: + - NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrDiffStats + round: 2 + output_import: NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrStatsAcrossModels + stat_var_series_aggregation: + aggr_funcs: + - stats_across_models: + - type: STAT_VAR_SERIES_AGGREGATION + input_imports: + - NASA_NEXGDDP_CMIP6_Subnational_AggrStatsAcrossModels + round: 3 + output_import: NASA_NEXGDDP_CMIP6_Subnational_AggrOverTimeOnStatsAcrossModels + stat_var_series_aggregation: + aggr_funcs: + # Config for future 10-year max/min aggregate + - aggr_over_time + time_range + input_obs_period: P1M + output_obs_period: P10Y + output_obs_date: 2030 + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + aggregation_op: OPERATOR_MAX + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + aggregation_op: OPERATOR_MIN + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" + aggregation_op: OPERATOR_MEAN + # Config for future 30-year max/min aggregate + - aggr_over_time + time_range + input_obs_period: P1M + output_obs_period: P30Y + output_obs_date: 2050 + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + aggregation_op: OPERATOR_MAX + sv_configs + sv_regex: 
"^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + aggregation_op: OPERATOR_MIN + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" + aggregation_op: OPERATOR_MEAN + # Config for future 80-year max/min aggregate + - aggr_over_time + time_range + input_obs_period: P1M + output_obs_period: P80Y + output_obs_date: 2100 + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + aggregation_op: OPERATOR_MAX + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + aggregation_op: OPERATOR_MIN + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" + aggregation_op: OPERATOR_MEAN + # Config for future 10-year period of num-months with max/min temp above/below threshold + - count_threshold_exception_over_time + time_range + input_obs_period: P1M + output_obs_period: P10Y + output_obs_date: 2030 + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + threshold_value: 5 + unit: Celsius + comparison: OPERATOR_GE + output_cprop: maxTemperature + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + threshold_value: -5 + unit: Celsius + comparison: OPERATOR_LE + output_cprop: minTemperature + # Config for future 30-year period of num-months with max/min temp above/below threshold + - count_threshold_exception_over_time + time_range + input_obs_period: P1M + output_obs_period: P30Y + output_obs_date: 2050 + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + threshold_value: 5 + unit: Celsius + comparison: OPERATOR_GE + output_cprop: 
maxTemperature + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + threshold_value: -5 + unit: Celsius + comparison: OPERATOR_LE + output_cprop: minTemperature + # Config for future 80-year period of num-months with max/min temp above/below threshold + - count_threshold_exception_over_time + time_range + input_obs_period: P1M + output_obs_period: P80Y + output_obs_date: 2100 + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + threshold_value: 5 + unit: Celsius + comparison: OPERATOR_GE + output_cprop: maxTemperature + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + threshold_value: -5 + unit: Celsius + comparison: OPERATOR_LE + output_cprop: minTemperature + - type: STAT_VAR_SERIES_AGGREGATION + input_imports: + - NASA_NEXGDDP_Subnational_AggrStatsAcrossModels + round: 3 + output_import: NASA_NEXGDDP_Subnational_AggrOverTimeOnStatsAcrossModels + stat_var_series_aggregation: + aggr_funcs: + # Config for future 10-year max/min aggregate + - aggr_over_time + time_range + input_obs_period: P1M + output_obs_period: P10Y + output_obs_date: 2030 + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + aggregation_op: OPERATOR_MAX + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + aggregation_op: OPERATOR_MIN + # Config for future 30-year max/min aggregate + - aggr_over_time + time_range + input_obs_period: P1M + output_obs_period: P30Y + output_obs_date: 2050 + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + aggregation_op: OPERATOR_MAX + sv_configs + sv_regex: 
"^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + aggregation_op: OPERATOR_MIN + # Config for future 80-year max/min aggregate + - aggr_over_time + time_range + input_obs_period: P1M + output_obs_period: P80Y + output_obs_date: 2100 + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + aggregation_op: OPERATOR_MAX + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + aggregation_op: OPERATOR_MIN + # Config for future 10-year period of num-months with max/min temp above/below threshold + - count_threshold_exception_over_time + time_range + input_obs_period: P1M + output_obs_period: P10Y + output_obs_date: 2030 + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + threshold_value: 5 + unit: Celsius + comparison: OPERATOR_GE + output_cprop: maxTemperature + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + threshold_value: -5 + unit: Celsius + comparison: OPERATOR_LE + output_cprop: minTemperature + # Config for future 30-year period of num-months with max/min temp above/below threshold + - count_threshold_exception_over_time + time_range + input_obs_period: P1M + output_obs_period: P30Y + output_obs_date: 2050 + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + threshold_value: 5 + unit: Celsius + comparison: OPERATOR_GE + output_cprop: maxTemperature + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + threshold_value: -5 + unit: Celsius + comparison: OPERATOR_LE + output_cprop: minTemperature + # Config for future 80-year period of num-months with max/min temp above/below threshold + - 
count_threshold_exception_over_time + time_range + input_obs_period: P1M + output_obs_period: P80Y + output_obs_date: 2100 + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + threshold_value: 5 + unit: Celsius + comparison: OPERATOR_GE + output_cprop: maxTemperature + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + threshold_value: -5 + unit: Celsius + comparison: OPERATOR_LE + output_cprop: minTemperature + - type: STAT_VAR_SERIES_AGGREGATION + input_imports: + - NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrStatsAcrossModels + round: 3 + output_import: NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrOverTimeOnStatsAcrossModels + stat_var_series_aggregation: + aggr_funcs: + # Config for future 10-year max/min aggregate + - aggr_over_time + time_range + input_obs_period: P1M + output_obs_period: P10Y + output_obs_date: 2030 + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + aggregation_op: OPERATOR_MAX + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + aggregation_op: OPERATOR_MIN + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" + aggregation_op: OPERATOR_MEAN + # Config for future 30-year max/min aggregate + - aggr_over_time + time_range + input_obs_period: P1M + output_obs_period: P30Y + output_obs_date: 2050 + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + aggregation_op: OPERATOR_MAX + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + aggregation_op: OPERATOR_MIN + sv_configs + sv_regex: 
"^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" + aggregation_op: OPERATOR_MEAN + # Config for future 80-year max/min aggregate + - aggr_over_time + time_range + input_obs_period: P1M + output_obs_period: P80Y + output_obs_date: 2100 + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + aggregation_op: OPERATOR_MAX + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + aggregation_op: OPERATOR_MIN + sv_configs + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" + aggregation_op: OPERATOR_MEAN + # Config for future 10-year period of num-months with max/min temp above/below threshold + - count_threshold_exception_over_time + time_range + input_obs_period: P1M + output_obs_period: P10Y + output_obs_date: 2030 + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + threshold_value: 5 + unit: Celsius + comparison: OPERATOR_GE + output_cprop: maxTemperature + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + threshold_value: -5 + unit: Celsius + comparison: OPERATOR_LE + output_cprop: minTemperature + # Config for future 30-year period of num-months with max/min temp above/below threshold + - count_threshold_exception_over_time + time_range + input_obs_period: P1M + output_obs_period: P30Y + output_obs_date: 2050 + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + threshold_value: 5 + unit: Celsius + comparison: OPERATOR_GE + output_cprop: maxTemperature + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + 
threshold_value: -5 + unit: Celsius + comparison: OPERATOR_LE + output_cprop: minTemperature + # Config for future 80-year period of num-months with max/min temp above/below threshold + - count_threshold_exception_over_time + time_range + input_obs_period: P1M + output_obs_period: P80Y + output_obs_date: 2100 + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + threshold_value: 5 + unit: Celsius + comparison: OPERATOR_GE + output_cprop: maxTemperature + thresholds + sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + threshold_value: -5 + unit: Celsius + comparison: OPERATOR_LE + output_cprop: minTemperature + - type: STAT_VAR_SERIES_AGGREGATION + input_imports: + - NASA_NEXGDDP_IpccPlaces50 + round: 1 + output_import: NASA_NEXGDDP_IpccPlaces50_AggrDiffStats + stat_var_series_aggregation: + aggr_funcs: + - max_diff_across_measurement_methods: + - diff_relative_to_base_date: + date_specs: + start_date: 2015 + end_date: 2020 + date_specs: + # There are two series in this import: + # 1. Monthly historical data from 1950 to 2005. + # 2. Monthly projections from 2006 to 2099. 
+ dates: + - 1990 + - 2006 + diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/super_enum.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/super_enum.yaml new file mode 100644 index 000000000..661d51dcf --- /dev/null +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/super_enum.yaml @@ -0,0 +1,25 @@ +calculations: + - type: SUPER_ENUM_AGGREGATION + input_imports: + - CensusACS5YearSurvey + output_import: CensusACS5YearSurvey_SuperEnum + + - type: SUPER_ENUM_AGGREGATION + input_imports: + - CensusACS5YearSurvey_AggCountry + output_import: CensusACS5YearSurvey_AggCountry_SuperEnum + + - type: SUPER_ENUM_AGGREGATION + input_imports: + - NCES_PrivateSchoolStats + output_import: NCES_PrivateSchoolStats_SuperEnum + + - type: SUPER_ENUM_AGGREGATION + input_imports: + - NCES_PublicSchoolStats + output_import: NCES_PublicSchoolStats_SuperEnum + + - type: SUPER_ENUM_AGGREGATION + input_imports: + - NCES_SchoolDistrictStats + output_import: NCES_SchoolDistrictStats_SuperEnum From 481a4694474e883fe1fe262d18d6553754696cbb Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Thu, 2 Jul 2026 15:29:00 +0530 Subject: [PATCH 18/33] style(aggregation): remove internal links and bug references from YAML configs --- .../aggregation/configs/statvar.yaml | 13 ------------- .../aggregation/configs/statvar_series.yaml | 2 -- 2 files changed, 15 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml index fe6743c5b..6f2896c6e 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml @@ -16,7 +16,6 @@ calculations: # We use that to compute total and by disability-status. 
# # Total uninsured - # https://screenshot.googleplex.com/v4s43nzoCC9G5Qp - ancestor_sv_id: Count_Person_NoHealthInsurance source_sv_ids: # No Disability @@ -28,7 +27,6 @@ calculations: - dc/96dqj47csvmy8 - dc/qr4s77egv27q2 # Total insured - # https://screenshot.googleplex.com/ARqaVprXtizwETy - ancestor_sv_id: Count_Person_WithHealthInsurance source_sv_ids: # No Disability @@ -73,7 +71,6 @@ calculations: # breakdown (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B27001). We # use that to compute gender breakdown. # - # https://screenshot.googleplex.com/aSP3i8ASJDxct2C - ancestor_sv_id: Count_Person_Female_NoHealthInsurance source_sv_ids: - dc/jx2q10tbnwhf3 @@ -85,7 +82,6 @@ calculations: - dc/jydkzj9v0yb2 - dc/qb6th3c5tcd92 - dc/zj152fzswgrw6 - # https://screenshot.googleplex.com/57jf9CpaC7KGd7y - ancestor_sv_id: Count_Person_Male_NoHealthInsurance source_sv_ids: - dc/fwqlkjjch0dg5 @@ -97,7 +93,6 @@ calculations: - dc/nn4xz2lv98h5c - dc/ywjqcxc244sx2 - dc/j88b1fy7045j1 - # https://screenshot.googleplex.com/5buzauZqyhTdnkV - ancestor_sv_id: Count_Person_Female_WithHealthInsurance source_sv_ids: - dc/jjhxlkp3qk2m6 @@ -109,7 +104,6 @@ calculations: - dc/y9xxbzfr8yb06 - dc/whh1y53fj4p69 - dc/965mdb0c8tef9 - # https://screenshot.googleplex.com/38JDwJJGKhNLmWM - ancestor_sv_id: Count_Person_Male_WithHealthInsurance source_sv_ids: - dc/ve6s1v1eyrbt7 @@ -220,7 +214,6 @@ calculations: - CensusACS5YearSurvey_AggCountry stat_var_aggregation: aggregations: - # https://screenshot.googleplex.com/3Fw42DhNnD5fPLi - ancestor_sv_id: Count_Person_SpeakEnglishNotAtAll source_sv_ids: - dc/edfjy64gmxf6f @@ -231,7 +224,6 @@ calculations: - dc/6yzk8t4e9t6v6 - dc/0kx32ff9c6d79 - dc/7cxlkzf56zk26 - # https://screenshot.googleplex.com/3ExaNScXGyyEP2c - ancestor_sv_id: Count_Person_SpeakEnglishNotWell source_sv_ids: - dc/f8qh4hp1830dg @@ -242,7 +234,6 @@ calculations: - dc/6rzmvxpgqlww6 - dc/wf1rm5zx34dtg - dc/4xykjw3v6n4t3 - # https://screenshot.googleplex.com/AaaSNUMpQwHRvSZ - 
ancestor_sv_id: Count_Person_SpeakEnglishWell source_sv_ids: - dc/4n1wg2c7hjem @@ -271,7 +262,6 @@ calculations: stat_var_aggregation: aggregations: # Women in armed forces - # https://screenshot.googleplex.com/3ik3W3g2eqMjkSm - ancestor_sv_id: Count_Person_Female_InArmedForces source_sv_ids: - dc/173smewzddlb @@ -285,7 +275,6 @@ calculations: - dc/lb3lb4mg82mph - dc/g2m31qc7q1x64 # Men in armed forces - # https://screenshot.googleplex.com/6dvhYcbmtHsjifQ - ancestor_sv_id: Count_Person_Male_InArmedForces source_sv_ids: - dc/vp1gqv00d2ql3 @@ -694,7 +683,6 @@ calculations: # stat_var_aggregation: # aggregations: # # Women Employed - # # https://screenshot.googleplex.com/AYFWvJqXa3EB2eU # # # The ancestor has "armedForcesStatus: dcs:Civilian", but the following sources # # don't have. Fix it before adding this to aggregation. @@ -725,7 +713,6 @@ calculations: # # dc/xepldf55yq6s5 # # # Men Employed - # # https://screenshot.googleplex.com/C24sFkxNbUpWiQJ # - ancestor_sv_id: Count_Person_Male_Employed # source_sv_ids: # - dc/2s6hps4z1qced diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml index 97c7da2e2..3ce0fbdc4 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml @@ -8,7 +8,6 @@ calculations: aggr_funcs: - max_diff_across_measurement_methods: - diff_relative_to_base_date: - # TODO(b/220070731): Consider adding start_date/end_date spec. date_specs: # There are two series in this import: # 1. Monthly historical data from 1950 to 2005. @@ -44,7 +43,6 @@ calculations: aggr_funcs: - max_diff_across_measurement_methods: - diff_relative_to_base_date: - # TODO(b/220070731): Consider adding start_date/end_date spec. date_specs: # There are two series in this import: # 1. Monthly historical data from 1950 to 2005. 
From 832726c4af3d27393fb3759e68e1b04fcba4873e Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Thu, 2 Jul 2026 15:54:20 +0530 Subject: [PATCH 19/33] feat(aggregation): update schema.json and validator.py for YAML calculations configs --- .../aggregation/configs/place.yaml | 51 +- .../configs/statvar_calculation.yaml | 133 ++-- .../aggregation/configs/statvar_series.yaml | 587 +++++++----------- .../ingestion-helper/aggregation/schema.json | 98 +-- .../ingestion-helper/aggregation/validator.py | 83 ++- 5 files changed, 404 insertions(+), 548 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml index 6d1108f73..892a2b128 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml @@ -1,6 +1,7 @@ calculations: - type: PLACE_AGGREGATION - input_imports: CensusACS5YearSurvey + input_imports: + - CensusACS5YearSurvey place_aggregation: from_place_types: State to_place_types: Country @@ -8,7 +9,8 @@ calculations: output_import: CensusACS5YearSurvey_AggCountry - type: PLACE_AGGREGATION - input_imports: CensusSAHIE + input_imports: + - CensusSAHIE place_aggregation: from_place_types: State to_place_types: Country @@ -16,7 +18,8 @@ calculations: output_import: CensusSAHIE_AggCountry - type: PLACE_AGGREGATION - input_imports: CDCMortality + input_imports: + - CDCMortality place_aggregation: from_place_types: County to_place_types: State @@ -24,7 +27,8 @@ calculations: output_import: CDCMortality_AggState - type: PLACE_AGGREGATION - input_imports: CDCMortality_AggState + input_imports: + - CDCMortality_AggState place_aggregation: from_place_types: State to_place_types: Country @@ -32,7 +36,8 @@ calculations: output_import: CDCMortality_AggState_AggCountry # - type: PLACE_AGGREGATION - # input_imports: FBIGovCrime + # input_imports: + # - FBIGovCrime # place_aggregation: # 
from_place_types: State # to_place_types: Country @@ -40,7 +45,8 @@ calculations: # output_import: FBIGovCrime_AggCountry - type: PLACE_AGGREGATION - input_imports: DEA_ARCOS + input_imports: + - DEA_ARCOS place_aggregation: from_place_types: County to_place_types: State @@ -48,7 +54,8 @@ calculations: output_import: DEA_ARCOS_AggState - type: PLACE_AGGREGATION - input_imports: DEA_ARCOS_AggState + input_imports: + - DEA_ARCOS_AggState place_aggregation: from_place_types: State to_place_types: Country @@ -56,7 +63,8 @@ calculations: output_import: DEA_ARCOS_AggState_AggCountry - type: PLACE_AGGREGATION - input_imports: EPA_EJSCREEN + input_imports: + - EPA_EJSCREEN place_aggregation: from_place_types: CensusBlockGroup to_place_types: CensusTract @@ -64,7 +72,8 @@ calculations: output_import: EPA_EJSCREEN_AggCensusTract - type: PLACE_AGGREGATION - input_imports: EPA_EJSCREEN_AggCensusTract + input_imports: + - EPA_EJSCREEN_AggCensusTract place_aggregation: from_place_types: CensusTract to_place_types: County @@ -72,7 +81,8 @@ calculations: output_import: EPA_EJSCREEN_AggCensusTract_AggCounty - type: PLACE_AGGREGATION - input_imports: DeepSolar + input_imports: + - DeepSolar place_aggregation: from_place_types: CensusBlockGroup to_place_types: CensusTract @@ -80,7 +90,8 @@ calculations: output_import: DeepSolar_AggCensusTract - type: PLACE_AGGREGATION - input_imports: DeepSolar_AggCensusTract + input_imports: + - DeepSolar_AggCensusTract place_aggregation: from_place_types: CensusTract to_place_types: County @@ -88,7 +99,8 @@ calculations: output_import: DeepSolar_AggCensusTract_AggCounty - type: PLACE_AGGREGATION - input_imports: EPA_GHGRP + input_imports: + - EPA_GHGRP place_aggregation: from_place_types: EpaReportingFacility to_place_types: County @@ -96,7 +108,8 @@ calculations: output_import: EPA_GHGRP_AggCounty - type: PLACE_AGGREGATION - input_imports: EPA_GHGRP + input_imports: + - EPA_GHGRP place_aggregation: from_place_types: EpaReportingFacility 
to_place_types: CensusZipCodeTabulationArea @@ -104,7 +117,8 @@ calculations: output_import: EPA_GHGRP_AggCensusZipCodeTabulationArea - type: PLACE_AGGREGATION - input_imports: EPA_GHGRP_AggCounty + input_imports: + - EPA_GHGRP_AggCounty place_aggregation: from_place_types: County to_place_types: State @@ -112,7 +126,8 @@ calculations: output_import: EPA_GHGRP_AggCounty_AggState - type: PLACE_AGGREGATION - input_imports: RFF_USGridGeo_WeatherVariabilityForecast + input_imports: + - RFF_USGridGeo_WeatherVariabilityForecast place_aggregation: from_place_types: GeoGridPlace_0.25Deg to_place_types: County @@ -121,7 +136,8 @@ calculations: output_import: RFF_USGridGeo_WeatherVariabilityForecast_AggCounty - type: PLACE_AGGREGATION - input_imports: India_RBIStateDomesticProduct + input_imports: + - India_RBIStateDomesticProduct place_aggregation: from_place_types: State to_place_types: Country @@ -130,7 +146,8 @@ calculations: output_import: India_RBIStateDomesticProduct_AggCountry - type: PLACE_AGGREGATION - input_imports: India_RBIStateDomesticProduct_StatVarAgg + input_imports: + - India_RBIStateDomesticProduct_StatVarAgg place_aggregation: from_place_types: State to_place_types: Country diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml index 5fd320fb9..12408e433 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml @@ -21,7 +21,7 @@ calculations: sv_regex: Annual_Generation_Electricity facet_info: unit: GigawattHour - operation: DIVIDE + operation: DIVIDE output: sv: Annual_Emissions_GreenhouseGas_NonBiogenic_Per_Annual_Generation_Electricity measurement_method: EPA_GHGRP_EIA_Electricity @@ -61,20 +61,19 @@ calculations: unit: Celsius observation_period: P1M input2: - sv_regex: ^Mean_Temperature$ - 
measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + sv_regex: "^Mean_Temperature$" + measurement_method_regex: "dcAggregate/NASAGSOD_NASAGHCN_EPA" facet_info: unit: Celsius observation_period: P1M - operation: SUBTRACT + operation: SUBTRACT output: - sv_prefix: DifferenceRelativeToObservationalData_ - measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + sv_prefix: "DifferenceRelativeToObservationalData_" + measurement_method_prefix: "dcAggregate/NASA_Mean_CMIP6_WithBaseAs_" facet_info: unit: Celsius observation_period: P1M - calculations: - input1: sv_regex: "^Min_Temperature(_SSP[0-9]+)*$" measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" @@ -82,20 +81,19 @@ calculations: unit: Celsius observation_period: P1M input2: - sv_regex: ^Min_Temperature$ - measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + sv_regex: "^Min_Temperature$" + measurement_method_regex: "dcAggregate/NASAGSOD_NASAGHCN_EPA" facet_info: unit: Celsius observation_period: P1M - operation: SUBTRACT + operation: SUBTRACT output: - sv_prefix: DifferenceRelativeToObservationalData_ - measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + sv_prefix: "DifferenceRelativeToObservationalData_" + measurement_method_prefix: "dcAggregate/NASA_Mean_CMIP6_WithBaseAs_" facet_info: unit: Celsius observation_period: P1M - calculations: - input1: sv_regex: "^Max_Temperature(_SSP[0-9]+)*$" measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" @@ -103,20 +101,19 @@ calculations: unit: Celsius observation_period: P1M input2: - sv_regex: ^Max_Temperature$ - measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + sv_regex: "^Max_Temperature$" + measurement_method_regex: "dcAggregate/NASAGSOD_NASAGHCN_EPA" facet_info: unit: Celsius observation_period: P1M - operation: SUBTRACT + operation: SUBTRACT output: - sv_prefix: DifferenceRelativeToObservationalData_ - measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + sv_prefix: 
"DifferenceRelativeToObservationalData_" + measurement_method_prefix: "dcAggregate/NASA_Mean_CMIP6_WithBaseAs_" facet_info: unit: Celsius observation_period: P1M - calculations: - input1: sv_regex: "^Temperature(_SSP[0-9]+)*$" measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" @@ -125,21 +122,20 @@ calculations: observation_period: P1Y import_name_regex: ".*AggrYearsStats$" input2: - sv_regex: ^Mean_Temperature$ - measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + sv_regex: "^Mean_Temperature$" + measurement_method_regex: "dcAggregate/NASAGSOD_NASAGHCN_EPA" facet_info: unit: Celsius observation_period: P1Y import_name_regex: ".*AggrYearsStats$" - operation: SUBTRACT + operation: SUBTRACT output: - sv_prefix: DifferenceRelativeToObservationalData_ - measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + sv_prefix: "DifferenceRelativeToObservationalData_" + measurement_method_prefix: "dcAggregate/NASA_Mean_CMIP6_WithBaseAs_" facet_info: unit: Celsius observation_period: P1Y - calculations: - input1: sv_regex: "^Min_Temperature(_SSP[0-9]+)*$" measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" @@ -148,21 +144,20 @@ calculations: observation_period: P1Y import_name_regex: ".*AggrYearsStats$" input2: - sv_regex: ^Min_Temperature$ - measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + sv_regex: "^Min_Temperature$" + measurement_method_regex: "dcAggregate/NASAGSOD_NASAGHCN_EPA" facet_info: unit: Celsius observation_period: P1Y import_name_regex: ".*AggrYearsStats$" - operation: SUBTRACT + operation: SUBTRACT output: - sv_prefix: DifferenceRelativeToObservationalData_ - measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + sv_prefix: "DifferenceRelativeToObservationalData_" + measurement_method_prefix: "dcAggregate/NASA_Mean_CMIP6_WithBaseAs_" facet_info: unit: Celsius observation_period: P1Y - calculations: - input1: sv_regex: "^Max_Temperature(_SSP[0-9]+)*$" measurement_method_regex: 
"^dcAggregate/NASA_Mean_CMIP6_.*" @@ -171,21 +166,20 @@ calculations: observation_period: P1Y import_name_regex: ".*AggrYearsStats$" input2: - sv_regex: ^Max_Temperature$ - measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + sv_regex: "^Max_Temperature$" + measurement_method_regex: "dcAggregate/NASAGSOD_NASAGHCN_EPA" facet_info: unit: Celsius observation_period: P1Y import_name_regex: ".*AggrYearsStats$" - operation: SUBTRACT + operation: SUBTRACT output: - sv_prefix: DifferenceRelativeToObservationalData_ - measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + sv_prefix: "DifferenceRelativeToObservationalData_" + measurement_method_prefix: "dcAggregate/NASA_Mean_CMIP6_WithBaseAs_" facet_info: unit: Celsius observation_period: P1Y - calculations: - input1: sv_regex: "^Temperature(_SSP[0-9]+)*$" measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" @@ -194,21 +188,20 @@ calculations: observation_period: P5Y import_name_regex: ".*AggrYearsStats$" input2: - sv_regex: ^Mean_Temperature$ - measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + sv_regex: "^Mean_Temperature$" + measurement_method_regex: "dcAggregate/NASAGSOD_NASAGHCN_EPA" facet_info: unit: Celsius observation_period: P5Y import_name_regex: ".*AggrYearsStats$" - operation: SUBTRACT + operation: SUBTRACT output: - sv_prefix: DifferenceRelativeToObservationalData_ - measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + sv_prefix: "DifferenceRelativeToObservationalData_" + measurement_method_prefix: "dcAggregate/NASA_Mean_CMIP6_WithBaseAs_" facet_info: unit: Celsius observation_period: P5Y - calculations: - input1: sv_regex: "^Min_Temperature(_SSP[0-9]+)*$" measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" @@ -217,21 +210,20 @@ calculations: observation_period: P5Y import_name_regex: ".*AggrYearsStats$" input2: - sv_regex: ^Min_Temperature$ - measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + sv_regex: "^Min_Temperature$" + 
measurement_method_regex: "dcAggregate/NASAGSOD_NASAGHCN_EPA" facet_info: unit: Celsius observation_period: P5Y import_name_regex: ".*AggrYearsStats$" - operation: SUBTRACT + operation: SUBTRACT output: - sv_prefix: DifferenceRelativeToObservationalData_ - measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + sv_prefix: "DifferenceRelativeToObservationalData_" + measurement_method_prefix: "dcAggregate/NASA_Mean_CMIP6_WithBaseAs_" facet_info: unit: Celsius observation_period: P5Y - calculations: - input1: sv_regex: "^Max_Temperature(_SSP[0-9]+)*$" measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" @@ -240,21 +232,20 @@ calculations: observation_period: P5Y import_name_regex: ".*AggrYearsStats$" input2: - sv_regex: ^Max_Temperature$ - measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + sv_regex: "^Max_Temperature$" + measurement_method_regex: "dcAggregate/NASAGSOD_NASAGHCN_EPA" facet_info: unit: Celsius observation_period: P5Y import_name_regex: ".*AggrYearsStats$" - operation: SUBTRACT + operation: SUBTRACT output: - sv_prefix: DifferenceRelativeToObservationalData_ - measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + sv_prefix: "DifferenceRelativeToObservationalData_" + measurement_method_prefix: "dcAggregate/NASA_Mean_CMIP6_WithBaseAs_" facet_info: unit: Celsius observation_period: P5Y - calculations: - input1: sv_regex: "^Temperature(_SSP[0-9]+)*$" measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" @@ -263,21 +254,20 @@ calculations: observation_period: P10Y import_name_regex: ".*AggrYearsStats$" input2: - sv_regex: ^Mean_Temperature$ - measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + sv_regex: "^Mean_Temperature$" + measurement_method_regex: "dcAggregate/NASAGSOD_NASAGHCN_EPA" facet_info: unit: Celsius observation_period: P10Y import_name_regex: ".*AggrYearsStats$" - operation: SUBTRACT + operation: SUBTRACT output: - sv_prefix: DifferenceRelativeToObservationalData_ - 
measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + sv_prefix: "DifferenceRelativeToObservationalData_" + measurement_method_prefix: "dcAggregate/NASA_Mean_CMIP6_WithBaseAs_" facet_info: unit: Celsius observation_period: P10Y - calculations: - input1: sv_regex: "^Min_Temperature(_SSP[0-9]+)*$" measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" @@ -286,21 +276,20 @@ calculations: observation_period: P10Y import_name_regex: ".*AggrYearsStats$" input2: - sv_regex: ^Min_Temperature$ - measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + sv_regex: "^Min_Temperature$" + measurement_method_regex: "dcAggregate/NASAGSOD_NASAGHCN_EPA" facet_info: unit: Celsius observation_period: P10Y import_name_regex: ".*AggrYearsStats$" - operation: SUBTRACT + operation: SUBTRACT output: - sv_prefix: DifferenceRelativeToObservationalData_ - measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + sv_prefix: "DifferenceRelativeToObservationalData_" + measurement_method_prefix: "dcAggregate/NASA_Mean_CMIP6_WithBaseAs_" facet_info: unit: Celsius observation_period: P10Y - calculations: - input1: sv_regex: "^Max_Temperature(_SSP[0-9]+)*$" measurement_method_regex: "^dcAggregate/NASA_Mean_CMIP6_.*" @@ -309,16 +298,16 @@ calculations: observation_period: P10Y import_name_regex: ".*AggrYearsStats$" input2: - sv_regex: ^Max_Temperature$ - measurement_method_regex: dcAggregate/NASAGSOD_NASAGHCN_EPA + sv_regex: "^Max_Temperature$" + measurement_method_regex: "dcAggregate/NASAGSOD_NASAGHCN_EPA" facet_info: unit: Celsius observation_period: P10Y import_name_regex: ".*AggrYearsStats$" - operation: SUBTRACT + operation: SUBTRACT output: - sv_prefix: DifferenceRelativeToObservationalData_ - measurement_method_prefix: dcAggregate/NASA_Mean_CMIP6_WithBaseAs_ + sv_prefix: "DifferenceRelativeToObservationalData_" + measurement_method_prefix: "dcAggregate/NASA_Mean_CMIP6_WithBaseAs_" facet_info: unit: Celsius observation_period: P10Y diff --git 
a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml index 3ce0fbdc4..4359a8f93 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml @@ -6,15 +6,13 @@ calculations: output_import: NASA_NEXDCP30_AggrDiffStats stat_var_series_aggregation: aggr_funcs: - - max_diff_across_measurement_methods: + - max_diff_across_measurement_methods: {} - diff_relative_to_base_date: date_specs: - # There are two series in this import: - # 1. Monthly historical data from 1950 to 2005. - # 2. Monthly projections from 2006 to 2099. - dates: - - 1990 - - 2006 + - dates: + - "1990" + - "2006" + - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_Subnational @@ -22,18 +20,15 @@ calculations: output_import: NASA_NEXGDDP_Subnational_AggrDiffStats stat_var_series_aggregation: aggr_funcs: - - max_diff_across_measurement_methods: + - max_diff_across_measurement_methods: {} - diff_relative_to_base_date: date_specs: - start_date: 2006 - end_date: 2020 - date_specs: - # There are two series in this import: - # 1. Monthly historical data from 1950 to 2005. - # 2. Monthly projections from 2006 to 2099. - dates: - - 1990 - - 2006 + - start_date: "2006" + end_date: "2020" + - dates: + - "1990" + - "2006" + - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_Country @@ -41,15 +36,13 @@ calculations: output_import: NASA_NEXGDDP_Country_AggrDiffStats stat_var_series_aggregation: aggr_funcs: - - max_diff_across_measurement_methods: + - max_diff_across_measurement_methods: {} - diff_relative_to_base_date: date_specs: - # There are two series in this import: - # 1. Monthly historical data from 1950 to 2005. - # 2. Monthly projections from 2006 to 2099. 
- dates: - - 1990 - - 2006 + - dates: + - "1990" + - "2006" + - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_Subnational @@ -57,20 +50,15 @@ calculations: output_import: NASA_NEXGDDP_CMIP6_Subnational_AggrDiffStats stat_var_series_aggregation: aggr_funcs: - - max_diff_across_measurement_methods: + - max_diff_across_measurement_methods: {} - diff_relative_to_base_date: date_specs: - start_date: 2015 - end_date: 2020 - date_specs: - # There are four series in this import: - # 1. Monthly historical data from 1950 to 2014. - # 2. Yearly historical data from 1950 to 2014 - # 3. Monthly projections from 2015 to 2100. - # 4. Yearly projections from 2015 to 2100. - dates: - - 1990 - - 2015 + - start_date: "2015" + end_date: "2020" + - dates: + - "1990" + - "2006" + - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_IpccPlaces50 @@ -78,44 +66,43 @@ calculations: output_import: NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrDiffStats stat_var_series_aggregation: aggr_funcs: - - max_diff_across_measurement_methods: + - max_diff_across_measurement_methods: {} - diff_relative_to_base_date: date_specs: - start_date: 2015 - end_date: 2020 - date_specs: - # There are four series in this import: - # 1. Monthly historical data from 1950 to 2014. - # 2. Yearly historical data from 1950 to 2014 - # 3. Monthly projections from 2015 to 2100. - # 4. Yearly projections from 2015 to 2100. 
- dates: - - 1990 - - 2015 - - type: STAT_VAR_SERIES_AGGREGATION - input_imports: - - NASA_NEXGDDP_Subnational_AggrDiffStats - round: 2 - output_import: NASA_NEXGDDP_Subnational_AggrStatsAcrossModels - stat_var_series_aggregation: - aggr_funcs: - - stats_across_models: + - start_date: "2015" + end_date: "2020" + - dates: + - "1990" + - "2006" + - type: STAT_VAR_SERIES_AGGREGATION input_imports: - - NASA_NEXGDDP_CMIP6_Subnational_AggrDiffStats + - NASA_NEXGDDP_CMIP6_Subnational round: 2 output_import: NASA_NEXGDDP_CMIP6_Subnational_AggrStatsAcrossModels stat_var_series_aggregation: aggr_funcs: - - stats_across_models: + - aggr_stats_across_models: + sv_regex: "^DifferenceRelativeToObservationalData_.*" + aggregation_ops: + - OPERATOR_MEDIAN + - OPERATOR_PERCENTILE90 + - OPERATOR_PERCENTILE10 + - type: STAT_VAR_SERIES_AGGREGATION input_imports: - - NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrDiffStats + - NASA_NEXGDDP_CMIP6_IpccPlaces50 round: 2 output_import: NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrStatsAcrossModels stat_var_series_aggregation: aggr_funcs: - - stats_across_models: + - aggr_stats_across_models: + sv_regex: "^DifferenceRelativeToObservationalData_.*" + aggregation_ops: + - OPERATOR_MEDIAN + - OPERATOR_PERCENTILE90 + - OPERATOR_PERCENTILE10 + - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_Subnational_AggrStatsAcrossModels @@ -124,201 +111,101 @@ calculations: stat_var_series_aggregation: aggr_funcs: # Config for future 10-year max/min aggregate - - aggr_over_time - time_range - input_obs_period: P1M - output_obs_period: P10Y - output_obs_date: 2030 - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - aggregation_op: OPERATOR_MAX - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - aggregation_op: OPERATOR_MIN - sv_configs - sv_regex: 
"^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" - aggregation_op: OPERATOR_MEAN - # Config for future 30-year max/min aggregate - - aggr_over_time - time_range - input_obs_period: P1M - output_obs_period: P30Y - output_obs_date: 2050 - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - aggregation_op: OPERATOR_MAX - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - aggregation_op: OPERATOR_MIN - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" - aggregation_op: OPERATOR_MEAN - # Config for future 80-year max/min aggregate - - aggr_over_time - time_range - input_obs_period: P1M - output_obs_period: P80Y - output_obs_date: 2100 - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - aggregation_op: OPERATOR_MAX - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - aggregation_op: OPERATOR_MIN - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" - aggregation_op: OPERATOR_MEAN - # Config for future 10-year period of num-months with max/min temp above/below threshold - - count_threshold_exception_over_time - time_range - input_obs_period: P1M - output_obs_period: P10Y - output_obs_date: 2030 - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - threshold_value: 5 - unit: Celsius - comparison: OPERATOR_GE - output_cprop: maxTemperature - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - threshold_value: -5 - unit: Celsius - 
comparison: OPERATOR_LE - output_cprop: minTemperature - # Config for future 30-year period of num-months with max/min temp above/below threshold - - count_threshold_exception_over_time - time_range - input_obs_period: P1M - output_obs_period: P30Y - output_obs_date: 2050 - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - threshold_value: 5 - unit: Celsius - comparison: OPERATOR_GE - output_cprop: maxTemperature - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - threshold_value: -5 - unit: Celsius - comparison: OPERATOR_LE - output_cprop: minTemperature - # Config for future 80-year period of num-months with max/min temp above/below threshold - - count_threshold_exception_over_time - time_range - input_obs_period: P1M - output_obs_period: P80Y - output_obs_date: 2100 - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - threshold_value: 5 - unit: Celsius - comparison: OPERATOR_GE - output_cprop: maxTemperature - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - threshold_value: -5 - unit: Celsius - comparison: OPERATOR_LE - output_cprop: minTemperature - - type: STAT_VAR_SERIES_AGGREGATION - input_imports: - - NASA_NEXGDDP_Subnational_AggrStatsAcrossModels - round: 3 - output_import: NASA_NEXGDDP_Subnational_AggrOverTimeOnStatsAcrossModels - stat_var_series_aggregation: - aggr_funcs: - # Config for future 10-year max/min aggregate - - aggr_over_time - time_range - input_obs_period: P1M - output_obs_period: P10Y - output_obs_date: 2030 - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - aggregation_op: OPERATOR_MAX - sv_configs - sv_regex: 
"^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - aggregation_op: OPERATOR_MIN + - aggr_over_time: + time_range: + input_obs_period: P1M + output_obs_period: P10Y + output_obs_date: "2030" + sv_configs: + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + aggregation_op: OPERATOR_MAX + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + aggregation_op: OPERATOR_MIN + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" + aggregation_op: OPERATOR_MEAN + # Config for future 30-year max/min aggregate - - aggr_over_time - time_range - input_obs_period: P1M - output_obs_period: P30Y - output_obs_date: 2050 - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - aggregation_op: OPERATOR_MAX - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - aggregation_op: OPERATOR_MIN + - aggr_over_time: + time_range: + input_obs_period: P1M + output_obs_period: P30Y + output_obs_date: "2050" + sv_configs: + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + aggregation_op: OPERATOR_MAX + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + aggregation_op: OPERATOR_MIN + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" + aggregation_op: OPERATOR_MEAN + # Config for future 80-year max/min aggregate - - aggr_over_time - time_range - input_obs_period: P1M - output_obs_period: P80Y - output_obs_date: 2100 - sv_configs - sv_regex: 
"^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - aggregation_op: OPERATOR_MAX - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - aggregation_op: OPERATOR_MIN + - aggr_over_time: + time_range: + input_obs_period: P1M + output_obs_period: P80Y + output_obs_date: "2100" + sv_configs: + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + aggregation_op: OPERATOR_MAX + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + aggregation_op: OPERATOR_MIN + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" + aggregation_op: OPERATOR_MEAN + # Config for future 10-year period of num-months with max/min temp above/below threshold - - count_threshold_exception_over_time - time_range - input_obs_period: P1M - output_obs_period: P10Y - output_obs_date: 2030 - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - threshold_value: 5 - unit: Celsius - comparison: OPERATOR_GE - output_cprop: maxTemperature - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - threshold_value: -5 - unit: Celsius - comparison: OPERATOR_LE - output_cprop: minTemperature + - count_threshold_exception_over_time: + time_range: + input_obs_period: P1M + output_obs_period: P10Y + output_obs_date: "2030" + thresholds: + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + threshold_value: 5 + unit: Celsius + comparison: OPERATOR_GE + output_cprop: maxTemperature + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + threshold_value: -5 + 
unit: Celsius + comparison: OPERATOR_LE + output_cprop: minTemperature + # Config for future 30-year period of num-months with max/min temp above/below threshold - - count_threshold_exception_over_time - time_range - input_obs_period: P1M - output_obs_period: P30Y - output_obs_date: 2050 - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - threshold_value: 5 - unit: Celsius - comparison: OPERATOR_GE - output_cprop: maxTemperature - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - threshold_value: -5 - unit: Celsius - comparison: OPERATOR_LE - output_cprop: minTemperature + - count_threshold_exception_over_time: + time_range: + input_obs_period: P1M + output_obs_period: P30Y + output_obs_date: "2050" + thresholds: + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + threshold_value: 5 + unit: Celsius + comparison: OPERATOR_GE + output_cprop: maxTemperature + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + threshold_value: -5 + unit: Celsius + comparison: OPERATOR_LE + output_cprop: minTemperature + # Config for future 80-year period of num-months with max/min temp above/below threshold - - count_threshold_exception_over_time - time_range - input_obs_period: P1M - output_obs_period: P80Y - output_obs_date: 2100 - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - threshold_value: 5 - unit: Celsius - comparison: OPERATOR_GE - output_cprop: maxTemperature - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - threshold_value: -5 - unit: Celsius - comparison: OPERATOR_LE - output_cprop: minTemperature + - count_threshold_exception_over_time: + time_range: + 
input_obs_period: P1M + output_obs_period: P80Y + output_obs_date: "2100" + thresholds: + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + threshold_value: 5 + unit: Celsius + comparison: OPERATOR_GE + output_cprop: maxTemperature + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + threshold_value: -5 + unit: Celsius + comparison: OPERATOR_LE + output_cprop: minTemperature + - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrStatsAcrossModels @@ -327,104 +214,101 @@ calculations: stat_var_series_aggregation: aggr_funcs: # Config for future 10-year max/min aggregate - - aggr_over_time - time_range - input_obs_period: P1M - output_obs_period: P10Y - output_obs_date: 2030 - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - aggregation_op: OPERATOR_MAX - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - aggregation_op: OPERATOR_MIN - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" - aggregation_op: OPERATOR_MEAN + - aggr_over_time: + time_range: + input_obs_period: P1M + output_obs_period: P10Y + output_obs_date: "2030" + sv_configs: + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + aggregation_op: OPERATOR_MAX + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + aggregation_op: OPERATOR_MIN + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" + aggregation_op: OPERATOR_MEAN + # Config for future 30-year max/min aggregate - - aggr_over_time - time_range - input_obs_period: P1M - 
output_obs_period: P30Y - output_obs_date: 2050 - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - aggregation_op: OPERATOR_MAX - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - aggregation_op: OPERATOR_MIN - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" - aggregation_op: OPERATOR_MEAN + - aggr_over_time: + time_range: + input_obs_period: P1M + output_obs_period: P30Y + output_obs_date: "2050" + sv_configs: + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + aggregation_op: OPERATOR_MAX + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + aggregation_op: OPERATOR_MIN + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" + aggregation_op: OPERATOR_MEAN + # Config for future 80-year max/min aggregate - - aggr_over_time - time_range - input_obs_period: P1M - output_obs_period: P80Y - output_obs_date: 2100 - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - aggregation_op: OPERATOR_MAX - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - aggregation_op: OPERATOR_MIN - sv_configs - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" - aggregation_op: OPERATOR_MEAN + - aggr_over_time: + time_range: + input_obs_period: P1M + output_obs_period: P80Y + output_obs_date: "2100" + sv_configs: + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + aggregation_op: OPERATOR_MAX + - sv_regex: 
"^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + aggregation_op: OPERATOR_MIN + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate((?!Max|Min).)*_Temperature.*" + aggregation_op: OPERATOR_MEAN + # Config for future 10-year period of num-months with max/min temp above/below threshold - - count_threshold_exception_over_time - time_range - input_obs_period: P1M - output_obs_period: P10Y - output_obs_date: 2030 - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - threshold_value: 5 - unit: Celsius - comparison: OPERATOR_GE - output_cprop: maxTemperature - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - threshold_value: -5 - unit: Celsius - comparison: OPERATOR_LE - output_cprop: minTemperature + - count_threshold_exception_over_time: + time_range: + input_obs_period: P1M + output_obs_period: P10Y + output_obs_date: "2030" + thresholds: + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + threshold_value: 5 + unit: Celsius + comparison: OPERATOR_GE + output_cprop: maxTemperature + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + threshold_value: -5 + unit: Celsius + comparison: OPERATOR_LE + output_cprop: minTemperature + # Config for future 30-year period of num-months with max/min temp above/below threshold - - count_threshold_exception_over_time - time_range - input_obs_period: P1M - output_obs_period: P30Y - output_obs_date: 2050 - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - threshold_value: 5 - unit: Celsius - comparison: OPERATOR_GE - output_cprop: maxTemperature - thresholds - sv_regex: 
"^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - threshold_value: -5 - unit: Celsius - comparison: OPERATOR_LE - output_cprop: minTemperature + - count_threshold_exception_over_time: + time_range: + input_obs_period: P1M + output_obs_period: P30Y + output_obs_date: "2050" + thresholds: + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + threshold_value: 5 + unit: Celsius + comparison: OPERATOR_GE + output_cprop: maxTemperature + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + threshold_value: -5 + unit: Celsius + comparison: OPERATOR_LE + output_cprop: minTemperature + # Config for future 80-year period of num-months with max/min temp above/below threshold - - count_threshold_exception_over_time - time_range - input_obs_period: P1M - output_obs_period: P80Y - output_obs_date: 2100 - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" - threshold_value: 5 - unit: Celsius - comparison: OPERATOR_GE - output_cprop: maxTemperature - thresholds - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" - threshold_value: -5 - unit: Celsius - comparison: OPERATOR_LE - output_cprop: minTemperature + - count_threshold_exception_over_time: + time_range: + input_obs_period: P1M + output_obs_period: P80Y + output_obs_date: "2100" + thresholds: + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Max_Temperature.*" + threshold_value: 5 + unit: Celsius + comparison: OPERATOR_GE + output_cprop: maxTemperature + - sv_regex: "^(Median|Percentile90|Percentile10)AcrossModels_DifferenceRelativeToBaseDate.*_Min_Temperature.*" + threshold_value: -5 + unit: Celsius + comparison: OPERATOR_LE + output_cprop: minTemperature + - type: 
STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_IpccPlaces50 @@ -432,16 +316,11 @@ calculations: output_import: NASA_NEXGDDP_IpccPlaces50_AggrDiffStats stat_var_series_aggregation: aggr_funcs: - - max_diff_across_measurement_methods: + - max_diff_across_measurement_methods: {} - diff_relative_to_base_date: date_specs: - start_date: 2015 - end_date: 2020 - date_specs: - # There are two series in this import: - # 1. Monthly historical data from 1950 to 2005. - # 2. Monthly projections from 2006 to 2099. - dates: - - 1990 - - 2006 - + - start_date: "2015" + end_date: "2020" + - dates: + - "1990" + - "2006" diff --git a/pipeline/workflow/ingestion-helper/aggregation/schema.json b/pipeline/workflow/ingestion-helper/aggregation/schema.json index b0deba8f6..57805f320 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/schema.json +++ b/pipeline/workflow/ingestion-helper/aggregation/schema.json @@ -1,93 +1,41 @@ { "$schema": "http://json-schema.org/draft-07/schema#", - "title": "AggregationConfig", + "title": "AggregationCalculationsConfig", "type": "object", + "required": ["calculations"], + "additionalProperties": false, "properties": { - "aggregations": { + "calculations": { "type": "array", "items": { "type": "object", - "required": ["type", "imports"], + "required": ["type", "input_imports"], "properties": { "type": { "type": "string", - "enum": ["place", "stat_var", "entity", "linked_edges", "provenance_summary", "stat_var_groups"] - }, - "disabled": { - "type": "boolean", - "default": false - }, - "stage": { - "type": "integer", - "minimum": 1, - "default": 1 + "enum": [ + "PLACE_AGGREGATION", + "STAT_VAR_AGGREGATION", + "ENTITY_AGGREGATION", + "STAT_VAR_SERIES_AGGREGATION", + "STAT_VAR_CALCULATION", + "SUPER_ENUM_AGGREGATION" + ] }, - "imports": { + "input_imports": { "type": "array", "items": { "type": "string" }, "minItems": 1 - } - }, - "dependencies": { - "type": { - "oneOf": [ - { - "properties": { - "type": { "const": "place" }, - 
"source_type": { "type": "string" }, - "destination_type": { "type": "string" }, - "allow_multiple_to_places": { "type": "boolean" } - }, - "required": ["source_type", "destination_type"] - }, - { - "properties": { - "type": { "const": "stat_var" }, - "ancestor_sv_id": { "type": "string" }, - "source_sv_ids": { - "type": "array", - "items": { "type": "string" }, - "minItems": 1 - }, - "skip_all_sources_present_check": { "type": "boolean" }, - "output_import_name": { "type": "string" } - }, - "required": ["ancestor_sv_id", "source_sv_ids"] - }, - { - "properties": { - "type": { "const": "entity" }, - "entity_types": { - "type": "array", - "items": { "type": "string" } - }, - "location_props": { - "type": "array", - "items": { "type": "string" } - }, - "date_prop": { "type": "string" }, - "agg_date_formats": { - "type": "array", - "items": { "type": "string" } - }, - "constraints": { - "type": "array", - "items": { "type": "string" } - } - }, - "required": ["entity_types", "location_props"] - }, - { - "properties": { - "type": { "enum": ["linked_edges", "provenance_summary", "stat_var_groups"] } - } - } - ] - } + }, + "output_import": { "type": "string" }, + "round": { "type": "integer" }, + "place_aggregation": { "type": "object" }, + "stat_var_aggregation": { "type": "object" }, + "entity_aggregation": { "type": "object" }, + "stat_var_series_aggregation": { "type": "object" }, + "stat_var_calculation": { "type": "object" } } } } - }, - "required": ["aggregations"], - "additionalProperties": false + } } diff --git a/pipeline/workflow/ingestion-helper/aggregation/validator.py b/pipeline/workflow/ingestion-helper/aggregation/validator.py index c26427e8f..42d6dc2b1 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/validator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/validator.py @@ -15,6 +15,7 @@ """Configuration validator and CLI tool for Data Commons aggregations.""" import argparse +import glob import json import logging import os @@ -30,14 
+31,14 @@ def validate_config(config_file_path: str, schema_file_path: str) -> List[Dict[str, Any]]: - """Loads and validates the aggregation YAML configuration against the JSON Schema. + """Loads and validates an aggregation YAML configuration file against the JSON Schema. Args: - config_file_path: Path to the aggregation.yaml configuration file. - schema_file_path: Path to the aggregation_schema.json validation file. + config_file_path: Path to a .yaml configuration file. + schema_file_path: Path to the JSON Schema validation file. Returns: - A list of validated aggregation dictionaries. + A list of validated calculation dictionaries. Raises: FileNotFoundError: If either the config or schema file is missing. @@ -75,24 +76,25 @@ def validate_config(config_file_path: str, schema_file_path: str) -> List[Dict[s logging.error(f"Schema validation failed for config {config_file_path}: {e.message}") raise e - return config["aggregations"] + return config.get("calculations", []) def main(): """CLI entry point for standalone configuration validation.""" logging.basicConfig(level=logging.INFO) - parser = argparse.ArgumentParser(description="Validate Data Commons aggregation configuration files against the JSON Schema.") - - # Resolve default paths relative to this script's directory (aggregation/) + parser = argparse.ArgumentParser( + description="Validate Data Commons aggregation configuration files against the JSON Schema." 
+ ) + curr_dir = os.path.dirname(os.path.abspath(__file__)) - default_config = os.path.join(curr_dir, "..", "aggregation.yaml") + default_config = os.path.join(curr_dir, "configs") default_schema = os.path.join(curr_dir, "schema.json") parser.add_argument( "--config", type=str, default=default_config, - help=f"Path to the aggregation YAML config file (default: {default_config})" + help=f"Path to an aggregation YAML config file or directory (default: {default_config})" ) parser.add_argument( "--schema", @@ -103,27 +105,48 @@ def main(): args = parser.parse_args() - print(f"Validating '{os.path.basename(args.config)}' against '{os.path.basename(args.schema)}'...") - - try: - aggregations = validate_config(args.config, args.schema) - print(f"{GREEN}[SUCCESS] Configuration is valid!{RESET}") - print(f"Parsed {len(aggregations)} aggregation steps successfully.") - sys.exit(0) - except FileNotFoundError as e: - print(f"{RED}[ERROR] File not found: {e}{RESET}", file=sys.stderr) - sys.exit(1) - except jsonschema.exceptions.ValidationError as e: - print(f"{RED}[ERROR] Schema Validation Failed:{RESET}", file=sys.stderr) - print(f"{RED} - Path: {'.'.join(str(p) for p in e.path)}{RESET}", file=sys.stderr) - print(f"{RED} - Message: {e.message}{RESET}", file=sys.stderr) - sys.exit(1) - except yaml.YAMLError as e: - print(f"{RED}[ERROR] YAML Syntax Error: {e}{RESET}", file=sys.stderr) - sys.exit(1) - except Exception as e: - print(f"{RED}[ERROR] Unexpected validation failure: {e}{RESET}", file=sys.stderr) + # Collect target config files + if os.path.isdir(args.config): + yaml_files = sorted( + glob.glob(os.path.join(args.config, "*.yaml")) + glob.glob(os.path.join(args.config, "*.yml")) + ) + if not yaml_files: + print(f"{RED}[ERROR] No YAML config files found in directory: {args.config}{RESET}", file=sys.stderr) + sys.exit(1) + print(f"Validating {len(yaml_files)} configuration file(s) in '{args.config}' against '{os.path.basename(args.schema)}'...") + else: + yaml_files = 
[args.config] + print(f"Validating '{os.path.basename(args.config)}' against '{os.path.basename(args.schema)}'...") + + total_calculations = 0 + has_error = False + + for file_path in yaml_files: + try: + calculations = validate_config(file_path, args.schema) + total_calculations += len(calculations) + print(f" {GREEN}✓{RESET} {os.path.basename(file_path)} ({len(calculations)} calculation steps)") + except FileNotFoundError as e: + print(f" {RED}✗ {os.path.basename(file_path)} - File not found: {e}{RESET}", file=sys.stderr) + has_error = True + except jsonschema.exceptions.ValidationError as e: + print(f" {RED}✗ {os.path.basename(file_path)} - Schema Validation Failed:{RESET}", file=sys.stderr) + print(f" {RED}- Path: {'.'.join(str(p) for p in e.path)}{RESET}", file=sys.stderr) + print(f" {RED}- Message: {e.message}{RESET}", file=sys.stderr) + has_error = True + except yaml.YAMLError as e: + print(f" {RED}✗ {os.path.basename(file_path)} - YAML Syntax Error: {e}{RESET}", file=sys.stderr) + has_error = True + except Exception as e: + print(f" {RED}✗ {os.path.basename(file_path)} - Validation failure: {e}{RESET}", file=sys.stderr) + has_error = True + + if has_error: + print(f"\n{RED}[FAILURE] Validation failed for one or more configuration files.{RESET}", file=sys.stderr) sys.exit(1) + else: + print(f"\n{GREEN}[SUCCESS] All {len(yaml_files)} configuration file(s) passed validation! 
({total_calculations} calculation steps total){RESET}") + sys.exit(0) if __name__ == "__main__": From d6012a76d42e8a2b02ff51e11a55baf0226f3869 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Thu, 2 Jul 2026 16:02:50 +0530 Subject: [PATCH 20/33] feat(aggregation): expand schema.json with type-specific calculation validation rules --- .../aggregation/configs/statvar.yaml | 118 ++++++++++++++++++ .../ingestion-helper/aggregation/schema.json | 115 ++++++++++++++++- 2 files changed, 232 insertions(+), 1 deletion(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml index 6f2896c6e..c25e7fae2 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml @@ -120,81 +120,145 @@ calculations: - ancestor_sv_id: Count_Person_NoPrivateHealthInsurance source_sv_ids: # Population: 26 - 34 Years, Female, No Private Health Insurance + - dc/0kyv91d5902mg # Population: 75 Years or More, Male, No Private Health Insurance + - dc/1ld59l0sp7z85 # Population: 6 Years or Less, Male, No Private Health Insurance + - dc/27q8fsx1rm8e1 # Population: 55 - 64 Years, Female, No Private Health Insurance + - dc/2v3y7wgqrzm5f # Population: 65 - 74 Years, Female, No Private Health Insurance + - dc/388xq8h8hdhyh # Population: 18 - 24 Years, Female, No Private Health Insurance + - dc/40rjc7178vkrd # Population: 25 - 34 Years, Male, No Private Health Insurance + - dc/4qk2e1c4zhsr8 # Population: 45 - 54 Years, Female, No Private Health Insurance + - dc/60mq1lysnc39g # Population: 35 - 44 Years, Female, No Private Health Insurance + - dc/9cwrwgd2t1r95 # Population: 19 - 25 Years, Male, No Private Health Insurance + - dc/9xxzewchgl6bd # Population: 6 - 17 Years, Female, No Private Health Insurance + - dc/b9s112lccg17g # Population: 26 - 34 Years, Male, No Private Health Insurance + - dc/dsmsdwbj1yqf9 # 
Population: 45 - 54 Years, Male, No Private Health Insurance + - dc/hycewjthgy8w # Population: 6 Years or Less, Female, No Private Health Insurance + - dc/n0e5f7zjsbedg # Population: 6 - 18 Years, Male, No Private Health Insurance + - dc/njeqykf73dng2 # Population: 19 - 25 Years, Female, No Private Health Insurance + - dc/nl2wdjffxpdx3 # Population: 25 - 34 Years, Female, No Private Health Insurance + - dc/s11tttbvg3dzh # Population: 55 - 64 Years, Male, No Private Health Insurance + - dc/v6d0158t101s7 # Population: 65 - 74 Years, Male, No Private Health Insurance + - dc/v80klfl3e9m7f # Population: 6 - 18 Years, Female, No Private Health Insurance + - dc/w77cv2c32fck7 # Population: 18 - 24 Years, Male, No Private Health Insurance + - dc/wch97nj65s85f # Population: 75 Years or More, Female, No Private Health Insurance + - dc/wrz9m375r1yy # Population: 35 - 44 Years, Male, No Private Health Insurance + - dc/wyfvejhn9fe7 # Population: 6 - 17 Years, Male, No Private Health Insurance + - dc/yfmzp444fj5r5 # dc/g/Person_HealthInsurance-NoPublicHealthInsurance # Level 3 - ancestor_sv_id: Count_Person_NoPublicHealthInsurance source_sv_ids: # Population: 26 - 34 Years, Female, No Public Health Insurance + - dc/0l10mjl6jcb43 # Population: 65 - 74 Years, Male, No Public Health Insurance + - dc/34kcegr057z16 # Population: 19 - 25 Years, Female, No Public Health Insurance + - dc/3l656p2tqt8m9 # Population: 45 - 54 Years, Female, No Public Health Insurance + - dc/81xpj06sdg2kd # Population: 26 - 34 Years, Male, No Public Health Insurance + - dc/cpcn8j7mw3tx3 # Population: 35 - 44 Years, Male, No Public Health Insurance + - dc/e922mrcqvemm6 # Population: 19 - 25 Years, Male, No Public Health Insurance + - dc/edh0y8vjc9y55 # Population: 6 - 18 Years, Male, No Public Health Insurance + - dc/efg2r5lbj2rs # Population: 65 - 74 Years, Female, No Public Health Insurance + - dc/fz5dn8yl6m516 # Population: 45 - 54 Years, Male, No Public Health Insurance + - dc/g9qp5cejpyhq5 # Population: 
75 Years or More, Male, No Public Health Insurance + - dc/gc7lcsrrv1rm5 # Population: 6 - 18 Years, Female, No Public Health Insurance + - dc/grj8xq0mmr602 # Population: 35 - 44 Years, Female, No Public Health Insurance + - dc/h8123v7kk1xr # Population: 55 - 64 Years, Male, No Public Health Insurance + - dc/jrh189de6x30h # Population: 6 - 17 Years, Male, No Public Health Insurance + - dc/ljcepzzm38m7h # Population: 18 - 24 Years, Female, No Public Health Insurance + - dc/mhfv62j3s22rc # Population: 18 - 24 Years, Male, No Public Health Insurance + - dc/p8pjrf6ffdeb # Population: 75 Years or More, Female, No Public Health Insurance + - dc/qtmer196cydk # Population: 6 Years or Less, Female, No Public Health Insurance + - dc/r4yy8tmxk2pb # Population: 25 - 34 Years, Male, No Public Health Insurance + - dc/t8p05s93nvjq1 # Population: 6 - 17 Years, Female, No Public Health Insurance + - dc/v1ez3g1cnv2b2 # Population: 6 Years or Less, Male, No Public Health Insurance + - dc/v7xhmvehm80ph # Population: 55 - 64 Years, Female, No Public Health Insurance + - dc/vxecjxxbmhy43 # Population: 25 - 34 Years, Female, No Public Health Insurance + - dc/yjxmx3n02dx08 # dc/g/Person_HealthInsurance-WithOneTypeOfHealthInsurance # Level 1 - ancestor_sv_id: Count_Person_WithOneTypeOfHealthInsurance source_sv_ids: # Population: 18 Years or Less, With One Type of Health Insurance + - dc/3jbqx1kf03nd4 # Population: 18 - 34 Years, With One Type of Health Insurance + - dc/3s8t9m1k1pgw5 # Population: 35 - 64 Years, With One Type of Health Insurance + - dc/hcvy4028mk048 # Population: 19 Years or Less, With One Type of Health Insurance + - dc/sjvp50nwdlpmf # Population: 19 - 34 Years, With One Type of Health Insurance + - dc/thcbepzex9zd9 # Population: 65 Years or More, With One Type of Health Insurance + - dc/v1w52jrtvw6m2 # dc/g/Person_HealthInsurance-WithPublicCoverage # Level 3 - ancestor_sv_id: Count_Person_WithPublicCoverage source_sv_ids: # Population: 18 Years or Less, With Disability, 
With Public Coverage + - dc/0f66x8keewypb # Population: 19 Years or Less, With Disability, With Public Coverage + - dc/1s4eg9f8m6328 # Population: 19 Years or Less, No Disability, With Public Coverage + - dc/35b9fkeh2qlf # Population: 65 Years or More, No Disability, With Public Coverage + - dc/7cj9xhqe5262d # Population: 19 - 64 Years, No Disability, With Public Coverage + - dc/84cnddqz2egl6 # Population: 65 Years or More, With Disability, With Public Coverage + - dc/nqtzsgv2g5h09 # Population: 18 - 64 Years, With Disability, With Public Coverage + - dc/qekes8xvrvz18 # Population: 18 Years or Less, No Disability, With Public Coverage + - dc/r7975mdk9v7kh # Population: 18 - 64 Years, No Disability, With Public Coverage + - dc/t41xetm1g0q6h # Population: 19 - 64 Years, With Disability, With Public Coverage + - dc/zkcy0f52ewh04 # # Ability to speak English @@ -326,83 +390,137 @@ calculations: - ancestor_sv_id: Count_Person_Years25Onwards_EducationalAttainment_5ThAnd6ThGrade source_sv_ids: # Population: 5th And 6th Grade, Female + - Count_Person_25OrMoreYears_EducationalAttainment5ThAnd6ThGrade_Female # Population: 5th And 6th Grade, Male + - Count_Person_25OrMoreYears_EducationalAttainment5ThAnd6ThGrade_Male # dc/g/Person_EducationalAttainment-7ThAnd8ThGrade # Level 1 - ancestor_sv_id: Count_Person_Years25Onwards_EducationalAttainment_7ThAnd8ThGrade source_sv_ids: # Population: 7th And 8th Grade, Female + - Count_Person_25OrMoreYears_EducationalAttainment7ThAnd8ThGrade_Female # Population: 7th And 8th Grade, Male + - Count_Person_25OrMoreYears_EducationalAttainment7ThAnd8ThGrade_Male # dc/g/Person_EducationalAttainment-9ThTo12ThGradeNoDiploma # Level 3 - ancestor_sv_id: Count_Person_EducationalAttainment_9ThTo12ThGradeNoDiploma source_sv_ids: # Population: 18 - 24 Years, 9th To 12th Grade No Diploma, Female + - dc/g0y4zefyr10n7 # Population: 18 - 24 Years, 9th To 12th Grade No Diploma, Male + - dc/0kptwgt2r2j97 # Population: 25 - 34 Years, 9th To 12th Grade No 
Diploma, Female + - dc/4f3r13e4gkhq6 # Population: 25 - 34 Years, 9th To 12th Grade No Diploma, Male + - dc/tcv30t9ydkldc # Population: 35 - 44 Years, 9th To 12th Grade No Diploma, Female + - dc/7k5lylyz4l2th # Population: 35 - 44 Years, 9th To 12th Grade No Diploma, Male + - dc/nrmj89cggsfg6 # Population: 45 - 64 Years, 9th To 12th Grade No Diploma, Female + - dc/m3g8esl121v73 # Population: 45 - 64 Years, 9th To 12th Grade No Diploma, Male + - dc/3hfl00nblgvg5 # Population: 65 Years or More, 9th To 12th Grade No Diploma, Female + - dc/5sps7rmylm73b # Population: 65 Years or More, 9th To 12th Grade No Diploma, Male + - dc/07hctc6f9e2k9 # dc/g/Person_EducationalAttainment-LessThan9ThGrade # Level 3 - ancestor_sv_id: Count_Person_EducationalAttainment_LessThan9ThGrade source_sv_ids: # Population: 18 - 24 Years, Less Than 9th Grade, Female + - dc/935ftchsz31b1 # Population: 18 - 24 Years, Less Than 9th Grade, Male + - dc/n6v6b8vh8jnd5 # Population: 25 - 34 Years, Less Than 9th Grade, Female + - dc/dr8wcfnpxyrj7 # Population: 25 - 34 Years, Less Than 9th Grade, Male + - dc/k8g20v452g617 # Population: 35 - 44 Years, Less Than 9th Grade, Female + - dc/y051838618t9c # Population: 35 - 44 Years, Less Than 9th Grade, Male + - dc/cpx5ll8vth2n7 # Population: 45 - 64 Years, Less Than 9th Grade, Female + - dc/b841qxd3smpt # Population: 45 - 64 Years, Less Than 9th Grade, Male + - dc/5vhc0r8m4x5g9 # Population: 65 Years or More, Less Than 9th Grade, Female + - dc/292723k92k5tb # Population: 65 Years or More, Less Than 9th Grade, Male + - dc/68pblb53csteb # dc/g/Person_EducationalAttainment-LessThanHighSchoolDiploma # Level 3 - ancestor_sv_id: Count_Person_EducationalAttainment_LessThanHighSchoolDiploma source_sv_ids: # Population: Less Than High School Diploma, Male, Two or More Races + - dc/3g5g2k9n19l45 # Population: Less Than High School Diploma, Male, Asian Alone + - dc/z3z44q7edbyp7 # Population: Less Than High School Diploma, Female, Hispanic or Latino + - dc/0p89e7q72edr1 
# Population: Less Than High School Diploma, Male, White Alone + - dc/70kcr1l5jldsb # Population: Less Than High School Diploma, Female, Asian Alone + - dc/nbfp13v6t9v87 # Population: Less Than High School Diploma, Male, Hispanic or Latino + - dc/szpdtvh1v6p49 # Population: Less Than High School Diploma, Male, Some Other Race Alone + - dc/pyflt85w79j35 # Population: Less Than High School Diploma, Female, White Alone Not Hispanic or Latino + - dc/vhxsz5bplmef4 # Population: Less Than High School Diploma, Male, Black or African American Alone + - dc/k51n39gvd22t5 # Population: Less Than High School Diploma, Female, Black or African American Alone + - dc/m1trmbvh8sygd # Population: Less Than High School Diploma, Male, Native Hawaiian or Other Pacific Islander Alone + - dc/7n0468s56spt8 # Population: Less Than High School Diploma, Male, American Indian or Alaska Native Alone + - dc/5e4qydg4slyvd # Population: Less Than High School Diploma, Female, Native Hawaiian or Other Pacific Islander Alone + - dc/y95cbfgtlqrmb # Population: Less Than High School Diploma, Female, Some Other Race Alone + - dc/p7978m8tbsns6 # Population: Less Than High School Diploma, Female, American Indian or Alaska Native Alone + - dc/lyzcfc37eet64 # Population: Less Than High School Diploma, Female, White Alone + - dc/m4q40nln3qms # Population: Less Than High School Diploma, Male, White Alone Not Hispanic or Latino + - dc/yfrrvevrmyr74 # Population: Less Than High School Diploma, Female, Two or More Races + - dc/29l3m1z7d3n7c # dc/g/Person_EducationalAttainment-NurseryTo4ThGrade # Level 1 - ancestor_sv_id: Count_Person_Years25Onwards_EducationalAttainment_NurseryTo4ThGrade source_sv_ids: # Population: Nursery To 4th Grade, Female + - Count_Person_25OrMoreYears_EducationalAttainmentNurseryTo4ThGrade_Female # Population: Nursery To 4th Grade, Male + - Count_Person_25OrMoreYears_EducationalAttainmentNurseryTo4ThGrade_Male # dc/g/Person_EducationalAttainment-SomeCollegeNoDegree # Level 3 - 
ancestor_sv_id: Count_Person_EducationalAttainment_SomeCollegeNoDegree source_sv_ids: # Population: 18 - 24 Years, Some College No Degree, Female + - dc/x8sydp80d61v9 # Population: 18 - 24 Years, Some College No Degree, Male + - dc/bxs5x91p5jbf # Population: 25 - 34 Years, Some College No Degree, Female + - dc/z8j7q9303cpx8 # Population: 25 - 34 Years, Some College No Degree, Male + - dc/d4p0m2p51v7d3 # Population: 35 - 44 Years, Some College No Degree, Female + - dc/66nscflhly519 # Population: 35 - 44 Years, Some College No Degree, Male + - dc/q4eclxrhd5mrd # Population: 45 - 64 Years, Some College No Degree, Female + - dc/dsmks2epl3ve # Population: 45 - 64 Years, Some College No Degree, Male + - dc/6ms49l0q9tq44 # Population: 65 Years or More, Some College No Degree, Female + - dc/c0vyt03x0h0v4 # Population: 65 Years or More, Some College No Degree, Male + - dc/5s08n6kdt26db - ancestor_sv_id: Count_Person_EducationalAttainment_1StTo12ThGrade source_sv_ids: - Count_Person_EducationalAttainment1StGrade diff --git a/pipeline/workflow/ingestion-helper/aggregation/schema.json b/pipeline/workflow/ingestion-helper/aggregation/schema.json index 57805f320..f6cad1f71 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/schema.json +++ b/pipeline/workflow/ingestion-helper/aggregation/schema.json @@ -34,7 +34,120 @@ "entity_aggregation": { "type": "object" }, "stat_var_series_aggregation": { "type": "object" }, "stat_var_calculation": { "type": "object" } - } + }, + "allOf": [ + { + "if": { "properties": { "type": { "const": "PLACE_AGGREGATION" } } }, + "then": { + "required": ["place_aggregation"], + "properties": { + "place_aggregation": { + "type": "object", + "required": ["from_place_types", "to_place_types"], + "properties": { + "from_place_types": { "type": "string" }, + "to_place_types": { "type": "string" }, + "allow_multiple_to_places": { "type": "boolean" } + } + } + } + } + }, + { + "if": { "properties": { "type": { "const": "STAT_VAR_AGGREGATION" } } }, + 
"then": { + "required": ["stat_var_aggregation"], + "properties": { + "stat_var_aggregation": { + "type": "object", + "required": ["aggregations"], + "properties": { + "aggregations": { + "type": "array", + "items": { + "type": "object", + "required": ["ancestor_sv_id", "source_sv_ids"], + "properties": { + "ancestor_sv_id": { "type": "string" }, + "source_sv_ids": { + "type": "array", + "items": { "type": "string" }, + "minItems": 1 + }, + "skip_all_sources_present_check": { "type": "boolean" } + } + } + } + } + } + } + } + }, + { + "if": { "properties": { "type": { "const": "ENTITY_AGGREGATION" } } }, + "then": { + "required": ["entity_aggregation"], + "properties": { + "entity_aggregation": { + "type": "object", + "required": ["entity_types", "location_props", "date_prop", "agg_date_formats"], + "properties": { + "entity_types": { + "type": "array", + "items": { "type": "string" }, + "minItems": 1 + }, + "location_props": { + "type": "array", + "items": { "type": "string" }, + "minItems": 1 + }, + "date_prop": { "type": "string" }, + "agg_date_formats": { + "type": "array", + "items": { "type": "string" }, + "minItems": 1 + }, + "constraints": { + "type": "array", + "items": { "type": "string" } + } + } + } + } + } + }, + { + "if": { "properties": { "type": { "const": "STAT_VAR_SERIES_AGGREGATION" } } }, + "then": { + "required": ["stat_var_series_aggregation"], + "properties": { + "stat_var_series_aggregation": { + "type": "object", + "required": ["aggr_funcs"], + "properties": { + "aggr_funcs": { "type": "array" } + } + } + } + } + }, + { + "if": { "properties": { "type": { "const": "STAT_VAR_CALCULATION" } } }, + "then": { + "required": ["stat_var_calculation"], + "properties": { + "stat_var_calculation": { + "type": "object", + "required": ["calculations"], + "properties": { + "calculations": { "type": "array" } + } + } + } + } + } + ] } } } From d4f6690b709648cb78e6b4489039de9862c4836b Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Thu, 2 Jul 2026 
16:05:27 +0530 Subject: [PATCH 21/33] test(aggregation): update validator_test.py for calculations schema --- .../aggregation/validator_test.py | 279 +++++++++--------- 1 file changed, 140 insertions(+), 139 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/validator_test.py b/pipeline/workflow/ingestion-helper/aggregation/validator_test.py index 617e43a97..35f94ef04 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/validator_test.py +++ b/pipeline/workflow/ingestion-helper/aggregation/validator_test.py @@ -33,59 +33,86 @@ class TestValidatorSuccess(unittest.TestCase): def setUp(self): self.schema_path = os.path.join(os.path.dirname(__file__), "schema.json") self.tmpdir = tempfile.TemporaryDirectory() - self.config_path = os.path.join(self.tmpdir.name, "aggregation.yaml") + self.config_path = os.path.join(self.tmpdir.name, "config.yaml") def tearDown(self): self.tmpdir.cleanup() def test_validate_config_success_all_types(self): - """Verifies that a comprehensive, valid config with all types passes validation.""" + """Verifies that a comprehensive, valid config with all 6 calculation types passes validation.""" valid_all_types_yaml = textwrap.dedent("""\ - aggregations: - - type: linked_edges - imports: ["*"] - stage: 1 - disabled: false - - - type: place - source_type: County - destination_type: State - allow_multiple_to_places: true - imports: ["ImportA", "ImportB"] - stage: 2 - - - type: stat_var - ancestor_sv_id: Count_Person - source_sv_ids: ["Count_Person_Male", "Count_Person_Female"] - skip_all_sources_present_check: true - output_import_name: "Aggregated_Pop" - imports: ["ImportC"] - stage: 3 - - - type: entity - entity_types: ["MortalityEvent"] - location_props: ["location"] - date_prop: "date" - agg_date_formats: ["%Y"] - imports: ["ImportD"] - - - type: provenance_summary - imports: ["*"] - - - type: stat_var_groups - imports: ["*"] + calculations: + - type: PLACE_AGGREGATION + input_imports: + - CensusACS5YearSurvey + 
output_import: CensusACS5YearSurvey_AggCountry + round: 1 + place_aggregation: + from_place_types: State + to_place_types: Country + + - type: STAT_VAR_AGGREGATION + output_import: CensusACS5YearSurvey_HealthInsurance_StatVarAgg + input_imports: + - CensusACS5YearSurvey + stat_var_aggregation: + aggregations: + - ancestor_sv_id: Count_Person_NoHealthInsurance + source_sv_ids: + - dc/y0dvhk0sggzef + + - type: ENTITY_AGGREGATION + output_import: FireFAMWEB_Agg + input_imports: + - FireFAMWEB + entity_aggregation: + entity_types: + - BurnedArea + location_props: + - location + date_prop: startDate + agg_date_formats: + - "%Y" + + - type: STAT_VAR_SERIES_AGGREGATION + input_imports: + - NASA_NEXDCP30 + round: 1 + output_import: NASA_NEXDCP30_AggrDiffStats + stat_var_series_aggregation: + aggr_funcs: + - max_diff_across_measurement_methods: {} + + - type: STAT_VAR_CALCULATION + input_imports: + - EIA_Electricity + output_import: Energy_StatVarCalculation + stat_var_calculation: + calculations: + - input1: + sv_regex: Annual_Emissions + operation: DIVIDE + output: + sv: Annual_Emissions_Per_Capita + + - type: SUPER_ENUM_AGGREGATION + input_imports: + - CensusACS5YearSurvey + output_import: CensusACS5YearSurvey_SuperEnum """) with open(self.config_path, "w") as f: f.write(valid_all_types_yaml) - aggregations = validate_config(self.config_path, self.schema_path) - - self.assertEqual(len(aggregations), 6) - self.assertEqual(aggregations[0]["type"], "linked_edges") - self.assertEqual(aggregations[1]["source_type"], "County") - self.assertEqual(aggregations[2]["ancestor_sv_id"], "Count_Person") - self.assertEqual(aggregations[3]["entity_types"], ["MortalityEvent"]) + calculations = validate_config(self.config_path, self.schema_path) + + self.assertEqual(len(calculations), 6) + self.assertEqual(calculations[0]["type"], "PLACE_AGGREGATION") + self.assertEqual(calculations[1]["type"], "STAT_VAR_AGGREGATION") + self.assertEqual(calculations[2]["type"], "ENTITY_AGGREGATION") + 
self.assertEqual(calculations[3]["type"], "STAT_VAR_SERIES_AGGREGATION") + self.assertEqual(calculations[4]["type"], "STAT_VAR_CALCULATION") + self.assertEqual(calculations[5]["type"], "SUPER_ENUM_AGGREGATION") class TestValidatorSchemaConstraints(unittest.TestCase): @@ -94,7 +121,7 @@ class TestValidatorSchemaConstraints(unittest.TestCase): def setUp(self): self.schema_path = os.path.join(os.path.dirname(__file__), "schema.json") self.tmpdir = tempfile.TemporaryDirectory() - self.config_path = os.path.join(self.tmpdir.name, "aggregation.yaml") + self.config_path = os.path.join(self.tmpdir.name, "config.yaml") def tearDown(self): self.tmpdir.cleanup() @@ -102,35 +129,36 @@ def tearDown(self): def test_validate_config_missing_type(self): """Verifies that missing the required 'type' field raises ValidationError.""" invalid_missing_type_yaml = textwrap.dedent("""\ - aggregations: - - imports: ["*"] + calculations: + - input_imports: + - ImportA """) with open(self.config_path, "w") as f: f.write(invalid_missing_type_yaml) with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: validate_config(self.config_path, self.schema_path) - self.assertIn("'type' is a required property", ctx.exception.message) + self.assertEqual(ctx.exception.validator, "required") - def test_validate_config_missing_imports(self): - """Verifies that missing the required 'imports' field raises ValidationError.""" + def test_validate_config_missing_input_imports(self): + """Verifies that missing the required 'input_imports' field raises ValidationError.""" invalid_missing_imports_yaml = textwrap.dedent("""\ - aggregations: - - type: linked_edges + calculations: + - type: SUPER_ENUM_AGGREGATION """) with open(self.config_path, "w") as f: f.write(invalid_missing_imports_yaml) with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: validate_config(self.config_path, self.schema_path) - self.assertIn("'imports' is a required property", ctx.exception.message) + 
self.assertIn("'input_imports' is a required property", ctx.exception.message) - def test_validate_config_invalid_imports_type(self): - """Verifies that imports field being a string instead of an array raises ValidationError.""" + def test_validate_config_invalid_input_imports_type(self): + """Verifies that input_imports field being a string instead of an array raises ValidationError.""" invalid_imports_type_yaml = textwrap.dedent("""\ - aggregations: - - type: linked_edges - imports: "*" + calculations: + - type: SUPER_ENUM_AGGREGATION + input_imports: "SingleImportString" """) with open(self.config_path, "w") as f: f.write(invalid_imports_type_yaml) @@ -139,42 +167,12 @@ def test_validate_config_invalid_imports_type(self): validate_config(self.config_path, self.schema_path) self.assertIn("is not of type 'array'", ctx.exception.message) - def test_validate_config_invalid_stage_type(self): - """Verifies that stage field being a string instead of an integer raises ValidationError.""" - invalid_stage_type_yaml = textwrap.dedent("""\ - aggregations: - - type: linked_edges - imports: ["*"] - stage: "first" - """) - with open(self.config_path, "w") as f: - f.write(invalid_stage_type_yaml) - - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config(self.config_path, self.schema_path) - self.assertIn("is not of type 'integer'", ctx.exception.message) - - def test_validate_config_invalid_stage_value(self): - """Verifies that a stage value of 0 (minimum is 1) raises ValidationError.""" - invalid_stage_value_yaml = textwrap.dedent("""\ - aggregations: - - type: linked_edges - imports: ["*"] - stage: 0 - """) - with open(self.config_path, "w") as f: - f.write(invalid_stage_value_yaml) - - with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: - validate_config(self.config_path, self.schema_path) - self.assertIn("is less than the minimum of 1", ctx.exception.message) - - def test_validate_config_empty_imports_list(self): - 
"""Verifies that an empty imports list raises ValidationError.""" + def test_validate_config_empty_input_imports_list(self): + """Verifies that an empty input_imports list raises ValidationError.""" invalid_empty_imports_yaml = textwrap.dedent("""\ - aggregations: - - type: linked_edges - imports: [] + calculations: + - type: SUPER_ENUM_AGGREGATION + input_imports: [] """) with open(self.config_path, "w") as f: f.write(invalid_empty_imports_yaml) @@ -183,17 +181,17 @@ def test_validate_config_empty_imports_list(self): validate_config(self.config_path, self.schema_path) self.assertIn("should be non-empty", ctx.exception.message) - def test_validate_config_missing_aggregations_key(self): - """Verifies that missing the required 'aggregations' root key raises ValidationError.""" - missing_aggregations_yaml = textwrap.dedent("""\ + def test_validate_config_missing_calculations_key(self): + """Verifies that missing the required 'calculations' root key raises ValidationError.""" + missing_calculations_yaml = textwrap.dedent("""\ some_other_key: [] """) with open(self.config_path, "w") as f: - f.write(missing_aggregations_yaml) + f.write(missing_calculations_yaml) with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: validate_config(self.config_path, self.schema_path) - self.assertIn("'aggregations' is a required property", ctx.exception.message) + self.assertIn("'calculations' is a required property", ctx.exception.message) def test_validate_config_empty_file(self): """Verifies that a completely empty configuration file raises ValidationError.""" @@ -203,58 +201,61 @@ def test_validate_config_empty_file(self): with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: validate_config(self.config_path, self.schema_path) - self.assertIn("'aggregations' is a required property", ctx.exception.message) + self.assertIn("'calculations' is a required property", ctx.exception.message) class TestValidatorConditionalDependencies(unittest.TestCase): - 
"""Verifies type-specific conditional dependencies (OneOf / dependencies).""" + """Verifies type-specific conditional sub-block dependencies.""" def setUp(self): self.schema_path = os.path.join(os.path.dirname(__file__), "schema.json") self.tmpdir = tempfile.TemporaryDirectory() - self.config_path = os.path.join(self.tmpdir.name, "aggregation.yaml") + self.config_path = os.path.join(self.tmpdir.name, "config.yaml") def tearDown(self): self.tmpdir.cleanup() - def test_validate_config_place_missing_field(self): - """Verifies that a place step missing the required 'source_type' raises ValidationError.""" - invalid_place_missing_field_yaml = textwrap.dedent("""\ - aggregations: - - type: place - destination_type: State - imports: ["*"] + def test_validate_config_place_missing_subblock(self): + """Verifies that a PLACE_AGGREGATION step missing 'place_aggregation' raises ValidationError.""" + invalid_place_missing_yaml = textwrap.dedent("""\ + calculations: + - type: PLACE_AGGREGATION + input_imports: + - ImportA """) with open(self.config_path, "w") as f: - f.write(invalid_place_missing_field_yaml) + f.write(invalid_place_missing_yaml) with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: validate_config(self.config_path, self.schema_path) - self.assertIn("is not valid under any of the given schemas", ctx.exception.message) - - def test_validate_config_stat_var_missing_field(self): - """Verifies that a stat_var step missing the required 'source_sv_ids' raises ValidationError.""" - invalid_stat_var_missing_field_yaml = textwrap.dedent("""\ - aggregations: - - type: stat_var - ancestor_sv_id: Count_Person - imports: ["*"] + self.assertIn("'place_aggregation' is a required property", ctx.exception.message) + + def test_validate_config_stat_var_missing_subblock(self): + """Verifies that a STAT_VAR_AGGREGATION step missing 'stat_var_aggregation' raises ValidationError.""" + invalid_stat_var_missing_yaml = textwrap.dedent("""\ + calculations: + - type: 
STAT_VAR_AGGREGATION + input_imports: + - ImportA """) with open(self.config_path, "w") as f: - f.write(invalid_stat_var_missing_field_yaml) + f.write(invalid_stat_var_missing_yaml) with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: validate_config(self.config_path, self.schema_path) - self.assertIn("is not valid under any of the given schemas", ctx.exception.message) + self.assertIn("'stat_var_aggregation' is a required property", ctx.exception.message) def test_validate_config_stat_var_empty_source_svs(self): - """Verifies that a stat_var step with an empty source_sv_ids array raises ValidationError.""" + """Verifies that an aggregation item with an empty source_sv_ids array raises ValidationError.""" invalid_stat_var_empty_svs_yaml = textwrap.dedent("""\ - aggregations: - - type: stat_var - ancestor_sv_id: Count_Person - source_sv_ids: [] - imports: ["*"] + calculations: + - type: STAT_VAR_AGGREGATION + input_imports: + - ImportA + stat_var_aggregation: + aggregations: + - ancestor_sv_id: Count_Person + source_sv_ids: [] """) with open(self.config_path, "w") as f: f.write(invalid_stat_var_empty_svs_yaml) @@ -263,20 +264,20 @@ def test_validate_config_stat_var_empty_source_svs(self): validate_config(self.config_path, self.schema_path) self.assertIn("should be non-empty", ctx.exception.message) - def test_validate_config_entity_missing_field(self): - """Verifies that an entity step missing the required 'location_props' raises ValidationError.""" - invalid_entity_missing_field_yaml = textwrap.dedent("""\ - aggregations: - - type: entity - entity_types: ["Event"] - imports: ["*"] + def test_validate_config_entity_missing_subblock(self): + """Verifies that an ENTITY_AGGREGATION step missing 'entity_aggregation' raises ValidationError.""" + invalid_entity_missing_yaml = textwrap.dedent("""\ + calculations: + - type: ENTITY_AGGREGATION + input_imports: + - ImportA """) with open(self.config_path, "w") as f: - 
f.write(invalid_entity_missing_field_yaml) + f.write(invalid_entity_missing_yaml) with self.assertRaises(jsonschema.exceptions.ValidationError) as ctx: validate_config(self.config_path, self.schema_path) - self.assertIn("is not valid under any of the given schemas", ctx.exception.message) + self.assertIn("'entity_aggregation' is a required property", ctx.exception.message) class TestValidatorErrorsAndFileSystem(unittest.TestCase): @@ -285,7 +286,7 @@ class TestValidatorErrorsAndFileSystem(unittest.TestCase): def setUp(self): self.schema_path = os.path.join(os.path.dirname(__file__), "schema.json") self.tmpdir = tempfile.TemporaryDirectory() - self.config_path = os.path.join(self.tmpdir.name, "aggregation.yaml") + self.config_path = os.path.join(self.tmpdir.name, "config.yaml") def tearDown(self): self.tmpdir.cleanup() @@ -293,10 +294,10 @@ def tearDown(self): def test_validate_config_yaml_syntax_error(self): """Verifies that malformed YAML syntax raises YAMLError.""" malformed_yaml = textwrap.dedent("""\ - aggregations: - - type: linked_edges - imports: - - "*" + calculations: + - type: PLACE_AGGREGATION + input_imports: + - "ImportA" """) with open(self.config_path, "w") as f: f.write(malformed_yaml) @@ -313,7 +314,7 @@ def test_validate_config_missing_config_file(self): def test_validate_config_missing_schema_file(self): """Verifies that a missing schema file path raises FileNotFoundError.""" with open(self.config_path, "w") as f: - f.write("aggregations: []") + f.write("calculations: []") with self.assertRaises(FileNotFoundError) as ctx: validate_config(self.config_path, "non_existent_schema.json") From c1cafeeafe1d6c311d33ceb413fc651a9f1e542c Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Thu, 2 Jul 2026 19:35:01 +0530 Subject: [PATCH 22/33] feat(aggregation): update orchestrator to process per-import isolated calculations with synchronized round execution --- .../aggregation/orchestrator.py | 335 ++++++++++-------- .../aggregation/orchestrator_test.py | 
153 +++----- 2 files changed, 231 insertions(+), 257 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py index 4ef337aa8..e2c20c575 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py @@ -12,31 +12,37 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Aggregation orchestrator for Data Commons ingestion workflow.""" + import logging import os +import time from typing import Any, Dict, List, Optional from .bq_executor import BigQueryExecutor from .linked_edge_generator import LinkedEdgeGenerator +from .place_aggregation_generator import PlaceAggregationGenerator from .provenance_summary_generator import ProvenanceSummaryGenerator from .stat_var_aggregator import StatVarAggregator -from .place_aggregation_generator import PlaceAggregationGenerator from .stat_var_group_generator import StatVarGroupGenerator from .validator import validate_config class AggregationOrchestrator: - """Orchestrates the overall aggregation workflow.""" - - def __init__(self, - connection_id: str, - project_id: str, - instance_id: str, - database_id: str, - location: Optional[str] = None, - is_base_dc: bool = True, - config_file_path: Optional[str] = None) -> None: - """Initializes the orchestrator and loads/validates the configuration. + """Orchestrates the overall aggregation workflow across multi-round execution.""" + + def __init__( + self, + connection_id: str, + project_id: str, + instance_id: str, + database_id: str, + location: Optional[str] = None, + is_base_dc: bool = True, + config_dir: Optional[str] = None, + config_file_path: Optional[str] = None + ) -> None: + """Initializes the orchestrator and loads/validates configuration files. Args: connection_id: BigQuery connection ID to Spanner. 
@@ -45,175 +51,212 @@ def __init__(self, database_id: Spanner Database ID. location: BigQuery location. is_base_dc: Whether this is running in the base Data Commons environment. - config_file_path: Optional custom path to the aggregation.yaml file. - If not specified, defaults to the aggregation.yaml in the parent directory. + config_dir: Directory containing aggregation YAML configs (default: configs/). + config_file_path: Optional path to single config file or directory. """ - self.executor = BigQueryExecutor(connection_id=connection_id, - project_id=project_id, - instance_id=instance_id, - database_id=database_id, - location=location, - run_sequential=False) - + self.executor = BigQueryExecutor( + connection_id=connection_id, + project_id=project_id, + instance_id=instance_id, + database_id=database_id, + location=location, + run_sequential=False + ) self.is_base_dc = is_base_dc - # Resolve paths for default config and schema + # Resolve paths for config directory and schema curr_dir = os.path.dirname(os.path.abspath(__file__)) - if not config_file_path: - config_file_path = os.path.join(curr_dir, "..", "aggregation.yaml") + target_config = config_dir or config_file_path or os.path.join(curr_dir, "configs") schema_file_path = os.path.join(curr_dir, "schema.json") # Load and validate configuration - self.aggregations = validate_config(config_file_path, schema_file_path) + self.calculations = validate_config(target_config, schema_file_path) - def execute_stage(self, stage_num: int, active_imports: List[str]) -> List[str]: - """Executes all enabled aggregations in the specified stage in parallel. + def run(self, active_imports: List[str]) -> None: + """Executes aggregations independently for each active import. - Args: - stage_num: The stage number to execute. - active_imports: The list of active import names in this run. + Blocks and synchronizes round progression for each import: + Round 1 -> Wait -> Round 2 -> Wait -> Round 3 -> Wait. 
- Returns: - A list of BigQuery job IDs submitted for this stage. + Args: + active_imports: List of active import dataset names to process. """ - logging.info(f"Starting Aggregation Orchestration for Stage {stage_num}") - logging.info(f"Active imports in this run: {active_imports}") - jobs = [] + logging.info(f"Starting Aggregation Orchestrator run for active imports: {active_imports}") - for config in self.aggregations: - # 1. Skip if disabled - if config.get("disabled", False): - continue + for single_import in active_imports: + logging.info(f"=== Starting Aggregation Pipeline for Import: '{single_import}' ===") + active_rounds = self.get_active_rounds_for_import(single_import) - # 2. Filter by stage - if config.get("stage", 1) != stage_num: + if not active_rounds: + logging.info(f"No aggregation steps configured for import '{single_import}'. Skipping.") continue - # 3. Filter by active imports - applicable_imports = self._get_applicable_imports(config, active_imports) - if not applicable_imports: - continue + for round_num in active_rounds: + logging.info(f"--- Triggering Round {round_num} for import '{single_import}' ---") + self._execute_and_synchronize_round(single_import, round_num) - # 4. 
Route to correct generator helper - step_type = config["type"] - logging.info(f"Triggering step '{step_type}' in Stage {stage_num}...") - - step_jobs = [] - if step_type == "place": - step_jobs = self._trigger_place(config, applicable_imports) - elif step_type == "stat_var": - step_jobs = self._trigger_stat_var(config, applicable_imports) - elif step_type == "linked_edges": - step_jobs = self._trigger_linked_edges(config, applicable_imports) - elif step_type == "provenance_summary": - step_jobs = self._trigger_provenance_summary(config, applicable_imports) - elif step_type == "stat_var_groups": - step_jobs = self._trigger_stat_var_groups(config, applicable_imports) - else: - raise ValueError(f"Unsupported or unimplemented aggregation step type: {step_type}") + logging.info(f"=== Successfully completed all aggregation rounds for Import: '{single_import}' ===") - # Collect BQ jobs - for job in step_jobs: - if job and job.job_id: - jobs.append(job.job_id) - - logging.info(f"=== Stage {stage_num} initiated successfully. Submitted {len(jobs)} BigQuery jobs: {jobs} ===") - return jobs - - def has_stage(self, stage_num: int, active_imports: List[str]) -> bool: - """Checks if there are any active, enabled aggregations configured for the stage. + def get_active_rounds_for_import(self, single_import: str) -> List[int]: + """Returns a sorted list of unique active round numbers for a single import. Args: - stage_num: The stage number to check. - active_imports: The list of active import names. + single_import: The active import dataset name. Returns: - True if the stage has at least one aggregation that will run, False otherwise. + Sorted list of round numbers (e.g., [1, 2, 3]). 
""" - for config in self.aggregations: - if config.get("disabled", False): - continue - if config.get("stage", 1) != stage_num: - continue - - # Check if it applies to any active imports - if self._get_applicable_imports(config, active_imports): - return True - - return False - - def get_active_stages(self, active_imports: List[str]) -> List[int]: - """Returns a sorted list of unique, active, and enabled stage numbers. + rounds = set() + for calc in self.calculations: + if self._calc_applies_to_import(calc, single_import): + rounds.add(calc.get("round", 1)) + return sorted(list(rounds)) + + def get_active_rounds(self, active_imports: List[str]) -> List[int]: + """Returns a sorted list of unique active round numbers across active imports.""" + rounds = set() + for single_import in active_imports: + rounds.update(self.get_active_rounds_for_import(single_import)) + return sorted(list(rounds)) + + def execute_round(self, stage_num: int, active_imports: List[str]) -> List[str]: + """Executes and collects BigQuery job IDs for a given round/stage. Args: - active_imports: The list of active import names. + stage_num: The round/stage number to execute. + active_imports: List of active import dataset names. Returns: - A sorted list of unique active stage numbers. + List of BigQuery job IDs submitted. 
""" - stages = set() - for config in self.aggregations: - step_type = config.get("type") - stage_num = config.get("stage", 1) - - if config.get("disabled", False): - logging.info(f"[Config Scan] Skipping step '{step_type}' in Stage {stage_num} because it is disabled.") + job_ids = [] + for single_import in active_imports: + jobs = self._dispatch_round_steps(single_import, stage_num) + for job in jobs: + if job and getattr(job, "job_id", None): + job_ids.append(job.job_id) + return job_ids + + def _execute_and_synchronize_round(self, single_import: str, round_num: int) -> None: + """Triggers round steps for a single import and blocks until completion.""" + jobs = self._dispatch_round_steps(single_import, round_num) + job_ids = [job.job_id for job in jobs if job and getattr(job, "job_id", None)] + + if not job_ids: + logging.info(f"No BigQuery jobs submitted for Round {round_num} (import: '{single_import}').") + return + + logging.info(f"Submitted {len(job_ids)} job(s) for Round {round_num} (import: '{single_import}'): {job_ids}") + self._wait_for_jobs(job_ids) + + def _dispatch_round_steps(self, single_import: str, round_num: int) -> List[Any]: + """Dispatches matching calculation steps for an import and round number.""" + jobs = [] + for calc in self.calculations: + if calc.get("round", 1) != round_num: continue - - applicable_imports = self._get_applicable_imports(config, active_imports) - if not applicable_imports: - logging.info(f"[Config Scan] Skipping step '{step_type}' in Stage {stage_num} because it does not apply to active imports: {active_imports}.") + if not self._calc_applies_to_import(calc, single_import): continue - logging.info(f"[Config Scan] Step '{step_type}' in Stage {stage_num} is ACTIVE for imports: {applicable_imports}.") - stages.add(stage_num) - - sorted_stages = sorted(list(stages)) - logging.info(f"[Config Scan] Active stages resolved: {sorted_stages}") - return sorted_stages + step_type = calc["type"] + logging.info(f"Triggering 
'{step_type}' (Round {round_num}) for import '{single_import}'...") + + step_jobs = [] + if step_type == "PLACE_AGGREGATION" or step_type == "place": + step_jobs = self._trigger_place(calc, [single_import]) + elif step_type == "STAT_VAR_AGGREGATION" or step_type == "stat_var": + step_jobs = self._trigger_stat_var(calc, [single_import]) + elif step_type == "linked_edges": + step_jobs = self._trigger_linked_edges(calc, [single_import]) + elif step_type == "provenance_summary": + step_jobs = self._trigger_provenance_summary(calc, [single_import]) + elif step_type == "stat_var_groups": + step_jobs = self._trigger_stat_var_groups(calc, [single_import]) + else: + logging.warning( + f"Calculation type '{step_type}' configured for import '{single_import}' has no active generator handler." + ) + + jobs.extend(step_jobs) + return jobs - def check_jobs_status(self, job_ids: List[str]) -> Dict[str, Any]: - """Checks the status of the specified BigQuery job IDs. + def _wait_for_jobs(self, job_ids: List[str], poll_interval: int = 5) -> None: + """Blocks until all specified BigQuery job IDs complete successfully. - Delegates to the BigQueryExecutor's get_jobs_status. + Args: + job_ids: List of BigQuery job IDs to wait for. + poll_interval: Seconds between polling checks. + + Raises: + RuntimeError: If any job fails. 
""" - try: - return self.executor.get_jobs_status(job_ids) - except Exception as e: - logging.error(f"Failed to check jobs status: {e}") - raise e + if not job_ids: + return + + logging.info(f"Waiting for {len(job_ids)} BigQuery job(s) to complete: {job_ids}") + while True: + status_info = self.executor.get_jobs_status(job_ids) + status = status_info.get("status") + + if status == "DONE": + logging.info("All BigQuery jobs in round completed successfully.") + return + elif status == "FAILED": + error_msg = status_info.get("error", "One or more BigQuery jobs failed.") + logging.error(f"Round execution failed: {error_msg}") + raise RuntimeError(f"Aggregation execution failed: {error_msg}") + + time.sleep(poll_interval) def _trigger_place(self, config: Dict[str, Any], applicable_imports: List[str]) -> List[Any]: """Triggers place-level rollup aggregations.""" - source_type = config["source_type"] - destination_type = config["destination_type"] - logging.info( - f" -> Place Rollup: {source_type} -> {destination_type} for imports {applicable_imports}" - ) + place_cfg = config.get("place_aggregation", {}) + from_type = place_cfg.get("from_place_types") or config.get("source_type") + to_type = place_cfg.get("to_place_types") or config.get("destination_type") + + logging.info(f" -> Place Rollup: {from_type} -> {to_type} for imports {applicable_imports}") generator = PlaceAggregationGenerator(self.executor, self.is_base_dc) job = generator.aggregate_places( import_names=applicable_imports, - source_type=source_type, - destination_type=destination_type, - allow_multiple_to_places=config.get("allow_multiple_to_places", False) + source_type=from_type, + destination_type=to_type, + allow_multiple_to_places=place_cfg.get("allow_multiple_to_places", False) ) return [job] if job else [] def _trigger_stat_var(self, config: Dict[str, Any], applicable_imports: List[str]) -> List[Any]: """Triggers statistical variable aggregations.""" - ancestor_sv = config["ancestor_sv_id"] - 
source_svs = config["source_sv_ids"] - logging.info( - f" -> Stat Var Aggregation: ancestor '{ancestor_sv}' (sources: {source_svs}) for imports {applicable_imports}" - ) + stat_cfg = config.get("stat_var_aggregation", {}) + aggregations = stat_cfg.get("aggregations", []) + output_import_name = config.get("output_import") or config.get("output_import_name") + + # Backwards compatibility fallback for single item config + if not aggregations and "ancestor_sv_id" in config: + aggregations = [{ + "ancestor_sv_id": config["ancestor_sv_id"], + "source_sv_ids": config["source_sv_ids"], + "skip_all_sources_present_check": config.get("skip_all_sources_present_check", False) + }] + generator = StatVarAggregator(self.executor, self.is_base_dc) - return generator.aggregate_stat_vars( - ancestor_sv=ancestor_sv, - source_svs=source_svs, - import_names=applicable_imports, - output_import_name=config.get("output_import_name"), - skip_all_sources_present_check=config.get("skip_all_sources_present_check", False) - ) + jobs = [] + + for item in aggregations: + ancestor_sv = item["ancestor_sv_id"] + source_svs = item["source_sv_ids"] + logging.info( + f" -> Stat Var Aggregation: ancestor '{ancestor_sv}' (sources: {source_svs}) for imports {applicable_imports}" + ) + item_jobs = generator.aggregate_stat_vars( + ancestor_sv=ancestor_sv, + source_svs=source_svs, + import_names=applicable_imports, + output_import_name=output_import_name, + skip_all_sources_present_check=item.get("skip_all_sources_present_check", False) + ) + jobs.extend(item_jobs) + + return jobs def _trigger_linked_edges(self, config: Dict[str, Any], applicable_imports: List[str]) -> List[Any]: """Triggers linked edge aggregations.""" @@ -233,13 +276,13 @@ def _trigger_stat_var_groups(self, config: Dict[str, Any], applicable_imports: L generator = StatVarGroupGenerator(self.executor, self.is_base_dc) return generator.run_all(applicable_imports) - def _get_applicable_imports(self, config: Dict[str, Any], active_imports: 
List[str]) -> List[str]: - """Determines which active imports apply to this aggregation config.""" - configured_imports = config["imports"] - - # Explicit wildcard check - if "*" in configured_imports: - return active_imports - - # Intersection of configured and active imports - return list(set(configured_imports).intersection(active_imports)) + def _calc_applies_to_import(self, calc: Dict[str, Any], single_import: str) -> bool: + """Determines if a calculation step applies to a single import.""" + if calc.get("disabled", False): + return False + + configured_imports = calc.get("input_imports") or calc.get("imports", []) + if "*" in configured_imports or single_import in configured_imports: + return True + + return False diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py index a0b2627f4..999ca8d36 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py @@ -14,7 +14,6 @@ """Unit tests for the AggregationOrchestrator class.""" -import json import os import sys import tempfile @@ -27,41 +26,38 @@ from aggregation import AggregationOrchestrator VALID_CONFIG_YAML = textwrap.dedent("""\ - aggregations: - - type: linked_edges - imports: ["*"] - stage: 1 - - - type: place - source_type: County - destination_type: State - allow_multiple_to_places: false - imports: ["USFed_Census"] - stage: 1 - - - type: place - source_type: State - destination_type: Country - imports: ["*"] - stage: 2 - disabled: true - - - type: stat_var - ancestor_sv_id: Count_Person - source_sv_ids: ["Count_Person_Male", "Count_Person_Female"] - skip_all_sources_present_check: true - imports: ["USFed_Census"] - stage: 2 + calculations: + - type: PLACE_AGGREGATION + input_imports: + - USFed_Census + output_import: USFed_Census_AggState + round: 1 + place_aggregation: + from_place_types: County + 
to_place_types: State + + - type: STAT_VAR_AGGREGATION + input_imports: + - USFed_Census + output_import: USFed_Census_StatVarAgg + round: 2 + stat_var_aggregation: + aggregations: + - ancestor_sv_id: Count_Person + source_sv_ids: + - Count_Person_Male + - Count_Person_Female + skip_all_sources_present_check: true """) @patch('aggregation.orchestrator.BigQueryExecutor') class TestOrchestratorScanning(unittest.TestCase): - """Tests the stage scanning and active stage resolution methods.""" + """Tests round scanning and active round resolution methods.""" def setUp(self): self.tmpdir = tempfile.TemporaryDirectory() - config_path = os.path.join(self.tmpdir.name, "aggregation.yaml") + config_path = os.path.join(self.tmpdir.name, "config.yaml") with open(config_path, "w") as f: f.write(VALID_CONFIG_YAML) @@ -76,41 +72,24 @@ def setUp(self): def tearDown(self): self.tmpdir.cleanup() - def test_has_stage(self, mock_executor): - """Tests the has_stage method for active, disabled, and non-matching stages.""" - self.assertTrue(self.orchestrator.has_stage(1, ["AnyImport"])) - self.assertTrue(self.orchestrator.has_stage(1, ["USFed_Census"])) + def test_get_active_rounds_for_import(self, mock_executor): + """Tests getting active rounds for matching and non-matching imports.""" + rounds = self.orchestrator.get_active_rounds_for_import("USFed_Census") + self.assertEqual(rounds, [1, 2]) - self.assertFalse(self.orchestrator.has_stage(2, ["OtherImport"])) - self.assertTrue(self.orchestrator.has_stage(2, ["USFed_Census"])) - - self.assertFalse(self.orchestrator.has_stage(3, ["USFed_Census"])) - - def test_get_active_stages(self, mock_executor): - """Tests that get_active_stages correctly extracts, filters, and sorts active stages.""" - stages = self.orchestrator.get_active_stages(active_imports=["USFed_Census"]) - self.assertEqual(stages, [1, 2]) - - stages = self.orchestrator.get_active_stages(active_imports=["OtherImport"]) - self.assertEqual(stages, [1]) + rounds = 
self.orchestrator.get_active_rounds_for_import("OtherImport") + self.assertEqual(rounds, []) @patch('aggregation.orchestrator.BigQueryExecutor') @patch('aggregation.orchestrator.PlaceAggregationGenerator') @patch('aggregation.orchestrator.StatVarAggregator') -@patch('aggregation.orchestrator.LinkedEdgeGenerator') -@patch('aggregation.orchestrator.ProvenanceSummaryGenerator') -@patch('aggregation.orchestrator.StatVarGroupGenerator') class TestOrchestratorExecution(unittest.TestCase): - """Tests stage execution, verifying parallel job submission and routing. - - These tests execute stages, so they mock the executor and all five generators - to verify correct parameters are passed and jobs are collected. - """ + """Tests round execution, verifying job submission and synchronization.""" def setUp(self): self.tmpdir = tempfile.TemporaryDirectory() - config_path = os.path.join(self.tmpdir.name, "aggregation.yaml") + config_path = os.path.join(self.tmpdir.name, "config.yaml") with open(config_path, "w") as f: f.write(VALID_CONFIG_YAML) @@ -125,25 +104,21 @@ def setUp(self): def tearDown(self): self.tmpdir.cleanup() - def test_execute_stage_1(self, mock_svg_gen, mock_prov_gen, mock_edge_gen, - mock_sv_agg, mock_place_gen, mock_executor): - """Tests executing Stage 1, verifying parallel job submission and wildcard resolution.""" + def test_run_synchronized_pipeline(self, mock_sv_agg, mock_place_gen, mock_executor_cls): + """Tests complete synchronized run pipeline for an import across rounds.""" mock_job1 = MagicMock() - mock_job1.job_id = "job-edge-1" - mock_edge_gen.return_value.run_all.return_value = [mock_job1] + mock_job1.job_id = "job-place-1" + mock_place_gen.return_value.aggregate_places.return_value = mock_job1 mock_job2 = MagicMock() - mock_job2.job_id = "job-place-1" - mock_place_gen.return_value.aggregate_places.return_value = mock_job2 + mock_job2.job_id = "job-sv-1" + mock_sv_agg.return_value.aggregate_stat_vars.return_value = [mock_job2] - job_ids = 
self.orchestrator.execute_stage(stage_num=1, active_imports=["USFed_Census"]) + self.orchestrator.executor = MagicMock() + self.orchestrator.executor.get_jobs_status.return_value = {"status": "DONE"} - self.assertEqual(len(job_ids), 2) - self.assertIn("job-edge-1", job_ids) - self.assertIn("job-place-1", job_ids) + self.orchestrator.run(active_imports=["USFed_Census"]) - mock_edge_gen.return_value.run_all.assert_called_once_with(["USFed_Census"]) - mock_place_gen.return_value.aggregate_places.assert_called_once_with( import_names=["USFed_Census"], source_type="County", @@ -151,58 +126,14 @@ def test_execute_stage_1(self, mock_svg_gen, mock_prov_gen, mock_edge_gen, allow_multiple_to_places=False ) - def test_execute_stage_2_with_disabled_and_filtering(self, mock_svg_gen, mock_prov_gen, mock_edge_gen, - mock_sv_agg, mock_place_gen, mock_executor): - """Tests Stage 2, verifying that disabled steps are skipped and non-matching imports are filtered.""" - mock_job_sv = MagicMock() - mock_job_sv.job_id = "job-sv-1" - mock_sv_agg.return_value.aggregate_stat_vars.return_value = [mock_job_sv] - - job_ids = self.orchestrator.execute_stage(stage_num=2, active_imports=["OtherImport"]) - self.assertEqual(len(job_ids), 0) - mock_place_gen.return_value.aggregate_places.assert_not_called() - mock_sv_agg.return_value.aggregate_stat_vars.assert_not_called() - - job_ids = self.orchestrator.execute_stage(stage_num=2, active_imports=["USFed_Census"]) - - self.assertEqual(job_ids, ["job-sv-1"]) - mock_place_gen.return_value.aggregate_places.assert_not_called() mock_sv_agg.return_value.aggregate_stat_vars.assert_called_once_with( ancestor_sv="Count_Person", source_svs=["Count_Person_Male", "Count_Person_Female"], import_names=["USFed_Census"], - output_import_name=None, + output_import_name="USFed_Census_StatVarAgg", skip_all_sources_present_check=True ) - def test_execute_stage_unsupported_type(self, *mocks): - """Tests that an unsupported aggregation step type raises ValueError.""" - 
unimplemented_config = textwrap.dedent("""\ - aggregations: - - type: entity - entity_types: ["MortalityEvent"] - location_props: ["location"] - imports: ["*"] - stage: 1 - """) - - with tempfile.TemporaryDirectory() as local_tmpdir: - local_config_path = os.path.join(local_tmpdir, "aggregation.yaml") - with open(local_config_path, "w") as f: - f.write(unimplemented_config) - - local_orchestrator = AggregationOrchestrator( - connection_id="conn", - project_id="proj", - instance_id="inst", - database_id="db", - config_file_path=local_config_path - ) - - with self.assertRaises(ValueError) as ctx: - local_orchestrator.execute_stage(stage_num=1, active_imports=["USFed_Census"]) - self.assertIn("Unsupported or unimplemented aggregation step type: entity", str(ctx.exception)) - if __name__ == '__main__': unittest.main() From 969f1955f2ec83930872a91d0f227526bd0c7dd8 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Thu, 2 Jul 2026 19:41:48 +0530 Subject: [PATCH 23/33] refactor(aggregation): rename round to stage across configs, schema, orchestrator, and tests --- .../aggregation/configs/place.yaml | 34 ++++----- .../aggregation/configs/statvar_series.yaml | 20 +++--- .../aggregation/orchestrator.py | 72 +++++++++---------- .../aggregation/orchestrator_test.py | 22 +++--- .../ingestion-helper/aggregation/schema.json | 2 +- .../aggregation/validator_test.py | 4 +- 6 files changed, 77 insertions(+), 77 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml index 892a2b128..cf90fb7bb 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml @@ -5,7 +5,7 @@ calculations: place_aggregation: from_place_types: State to_place_types: Country - round: 1 + stage: 1 output_import: CensusACS5YearSurvey_AggCountry - type: PLACE_AGGREGATION @@ -14,7 +14,7 @@ calculations: place_aggregation: 
from_place_types: State to_place_types: Country - round: 1 + stage: 1 output_import: CensusSAHIE_AggCountry - type: PLACE_AGGREGATION @@ -23,7 +23,7 @@ calculations: place_aggregation: from_place_types: County to_place_types: State - round: 1 + stage: 1 output_import: CDCMortality_AggState - type: PLACE_AGGREGATION @@ -32,7 +32,7 @@ calculations: place_aggregation: from_place_types: State to_place_types: Country - round: 2 + stage: 2 output_import: CDCMortality_AggState_AggCountry # - type: PLACE_AGGREGATION @@ -41,7 +41,7 @@ calculations: # place_aggregation: # from_place_types: State # to_place_types: Country - # round: 1 + # stage: 1 # output_import: FBIGovCrime_AggCountry - type: PLACE_AGGREGATION @@ -50,7 +50,7 @@ calculations: place_aggregation: from_place_types: County to_place_types: State - round: 1 + stage: 1 output_import: DEA_ARCOS_AggState - type: PLACE_AGGREGATION @@ -59,7 +59,7 @@ calculations: place_aggregation: from_place_types: State to_place_types: Country - round: 2 + stage: 2 output_import: DEA_ARCOS_AggState_AggCountry - type: PLACE_AGGREGATION @@ -68,7 +68,7 @@ calculations: place_aggregation: from_place_types: CensusBlockGroup to_place_types: CensusTract - round: 1 + stage: 1 output_import: EPA_EJSCREEN_AggCensusTract - type: PLACE_AGGREGATION @@ -77,7 +77,7 @@ calculations: place_aggregation: from_place_types: CensusTract to_place_types: County - round: 2 + stage: 2 output_import: EPA_EJSCREEN_AggCensusTract_AggCounty - type: PLACE_AGGREGATION @@ -86,7 +86,7 @@ calculations: place_aggregation: from_place_types: CensusBlockGroup to_place_types: CensusTract - round: 1 + stage: 1 output_import: DeepSolar_AggCensusTract - type: PLACE_AGGREGATION @@ -95,7 +95,7 @@ calculations: place_aggregation: from_place_types: CensusTract to_place_types: County - round: 2 + stage: 2 output_import: DeepSolar_AggCensusTract_AggCounty - type: PLACE_AGGREGATION @@ -104,7 +104,7 @@ calculations: place_aggregation: from_place_types: EpaReportingFacility 
to_place_types: County - round: 1 + stage: 1 output_import: EPA_GHGRP_AggCounty - type: PLACE_AGGREGATION @@ -113,7 +113,7 @@ calculations: place_aggregation: from_place_types: EpaReportingFacility to_place_types: CensusZipCodeTabulationArea - round: 1 + stage: 1 output_import: EPA_GHGRP_AggCensusZipCodeTabulationArea - type: PLACE_AGGREGATION @@ -122,7 +122,7 @@ calculations: place_aggregation: from_place_types: County to_place_types: State - round: 2 + stage: 2 output_import: EPA_GHGRP_AggCounty_AggState - type: PLACE_AGGREGATION @@ -132,7 +132,7 @@ calculations: from_place_types: GeoGridPlace_0.25Deg to_place_types: County allow_multiple_to_places: true - round: 1 + stage: 1 output_import: RFF_USGridGeo_WeatherVariabilityForecast_AggCounty - type: PLACE_AGGREGATION @@ -142,7 +142,7 @@ calculations: from_place_types: State to_place_types: Country allow_multiple_to_places: true - round: 1 + stage: 1 output_import: India_RBIStateDomesticProduct_AggCountry - type: PLACE_AGGREGATION @@ -152,5 +152,5 @@ calculations: from_place_types: State to_place_types: Country allow_multiple_to_places: true - round: 1 + stage: 1 output_import: India_RBIStateDomesticProduct_StatVarAgg_AggCountry diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml index 4359a8f93..c565076d9 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml @@ -2,7 +2,7 @@ calculations: - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXDCP30 - round: 1 + stage: 1 output_import: NASA_NEXDCP30_AggrDiffStats stat_var_series_aggregation: aggr_funcs: @@ -16,7 +16,7 @@ calculations: - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_Subnational - round: 1 + stage: 1 output_import: NASA_NEXGDDP_Subnational_AggrDiffStats stat_var_series_aggregation: aggr_funcs: @@ -32,7 
+32,7 @@ calculations: - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_Country - round: 1 + stage: 1 output_import: NASA_NEXGDDP_Country_AggrDiffStats stat_var_series_aggregation: aggr_funcs: @@ -46,7 +46,7 @@ calculations: - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_Subnational - round: 1 + stage: 1 output_import: NASA_NEXGDDP_CMIP6_Subnational_AggrDiffStats stat_var_series_aggregation: aggr_funcs: @@ -62,7 +62,7 @@ calculations: - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_IpccPlaces50 - round: 1 + stage: 1 output_import: NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrDiffStats stat_var_series_aggregation: aggr_funcs: @@ -78,7 +78,7 @@ calculations: - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_Subnational - round: 2 + stage: 2 output_import: NASA_NEXGDDP_CMIP6_Subnational_AggrStatsAcrossModels stat_var_series_aggregation: aggr_funcs: @@ -92,7 +92,7 @@ calculations: - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_IpccPlaces50 - round: 2 + stage: 2 output_import: NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrStatsAcrossModels stat_var_series_aggregation: aggr_funcs: @@ -106,7 +106,7 @@ calculations: - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_Subnational_AggrStatsAcrossModels - round: 3 + stage: 3 output_import: NASA_NEXGDDP_CMIP6_Subnational_AggrOverTimeOnStatsAcrossModels stat_var_series_aggregation: aggr_funcs: @@ -209,7 +209,7 @@ calculations: - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrStatsAcrossModels - round: 3 + stage: 3 output_import: NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrOverTimeOnStatsAcrossModels stat_var_series_aggregation: aggr_funcs: @@ -312,7 +312,7 @@ calculations: - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_IpccPlaces50 - round: 1 + stage: 1 output_import: NASA_NEXGDDP_IpccPlaces50_AggrDiffStats stat_var_series_aggregation: aggr_funcs: diff --git 
a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py index e2c20c575..0f104ab49 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py @@ -29,7 +29,7 @@ class AggregationOrchestrator: - """Orchestrates the overall aggregation workflow across multi-round execution.""" + """Orchestrates the overall aggregation workflow across multi-stage execution.""" def __init__( self, @@ -75,8 +75,8 @@ def __init__( def run(self, active_imports: List[str]) -> None: """Executes aggregations independently for each active import. - Blocks and synchronizes round progression for each import: - Round 1 -> Wait -> Round 2 -> Wait -> Round 3 -> Wait. + Blocks and synchronizes stage progression for each import: + Stage 1 -> Wait -> Stage 2 -> Wait -> Stage 3 -> Wait. Args: active_imports: List of active import dataset names to process. @@ -85,45 +85,45 @@ def run(self, active_imports: List[str]) -> None: for single_import in active_imports: logging.info(f"=== Starting Aggregation Pipeline for Import: '{single_import}' ===") - active_rounds = self.get_active_rounds_for_import(single_import) + active_stages = self.get_active_stages_for_import(single_import) - if not active_rounds: + if not active_stages: logging.info(f"No aggregation steps configured for import '{single_import}'. 
Skipping.") continue - for round_num in active_rounds: - logging.info(f"--- Triggering Round {round_num} for import '{single_import}' ---") - self._execute_and_synchronize_round(single_import, round_num) + for stage_num in active_stages: + logging.info(f"--- Triggering Stage {stage_num} for import '{single_import}' ---") + self._execute_and_synchronize_stage(single_import, stage_num) - logging.info(f"=== Successfully completed all aggregation rounds for Import: '{single_import}' ===") + logging.info(f"=== Successfully completed all aggregation stages for Import: '{single_import}' ===") - def get_active_rounds_for_import(self, single_import: str) -> List[int]: - """Returns a sorted list of unique active round numbers for a single import. + def get_active_stages_for_import(self, single_import: str) -> List[int]: + """Returns a sorted list of unique active stage numbers for a single import. Args: single_import: The active import dataset name. Returns: - Sorted list of round numbers (e.g., [1, 2, 3]). + Sorted list of stage numbers (e.g., [1, 2, 3]). 
""" - rounds = set() + stages = set() for calc in self.calculations: if self._calc_applies_to_import(calc, single_import): - rounds.add(calc.get("round", 1)) - return sorted(list(rounds)) + stages.add(calc.get("stage", 1)) + return sorted(list(stages)) - def get_active_rounds(self, active_imports: List[str]) -> List[int]: - """Returns a sorted list of unique active round numbers across active imports.""" - rounds = set() + def get_active_stages(self, active_imports: List[str]) -> List[int]: + """Returns a sorted list of unique active stage numbers across active imports.""" + stages = set() for single_import in active_imports: - rounds.update(self.get_active_rounds_for_import(single_import)) - return sorted(list(rounds)) + stages.update(self.get_active_stages_for_import(single_import)) + return sorted(list(stages)) - def execute_round(self, stage_num: int, active_imports: List[str]) -> List[str]: - """Executes and collects BigQuery job IDs for a given round/stage. + def execute_stage(self, stage_num: int, active_imports: List[str]) -> List[str]: + """Executes and collects BigQuery job IDs for a given stage. Args: - stage_num: The round/stage number to execute. + stage_num: The stage number to execute. active_imports: List of active import dataset names. 
Returns: @@ -131,35 +131,35 @@ def execute_round(self, stage_num: int, active_imports: List[str]) -> List[str]: """ job_ids = [] for single_import in active_imports: - jobs = self._dispatch_round_steps(single_import, stage_num) + jobs = self._dispatch_stage_steps(single_import, stage_num) for job in jobs: if job and getattr(job, "job_id", None): job_ids.append(job.job_id) return job_ids - def _execute_and_synchronize_round(self, single_import: str, round_num: int) -> None: - """Triggers round steps for a single import and blocks until completion.""" - jobs = self._dispatch_round_steps(single_import, round_num) + def _execute_and_synchronize_stage(self, single_import: str, stage_num: int) -> None: + """Triggers stage steps for a single import and blocks until completion.""" + jobs = self._dispatch_stage_steps(single_import, stage_num) job_ids = [job.job_id for job in jobs if job and getattr(job, "job_id", None)] if not job_ids: - logging.info(f"No BigQuery jobs submitted for Round {round_num} (import: '{single_import}').") + logging.info(f"No BigQuery jobs submitted for Stage {stage_num} (import: '{single_import}').") return - logging.info(f"Submitted {len(job_ids)} job(s) for Round {round_num} (import: '{single_import}'): {job_ids}") + logging.info(f"Submitted {len(job_ids)} job(s) for Stage {stage_num} (import: '{single_import}'): {job_ids}") self._wait_for_jobs(job_ids) - def _dispatch_round_steps(self, single_import: str, round_num: int) -> List[Any]: - """Dispatches matching calculation steps for an import and round number.""" + def _dispatch_stage_steps(self, single_import: str, stage_num: int) -> List[Any]: + """Dispatches matching calculation steps for an import and stage number.""" jobs = [] for calc in self.calculations: - if calc.get("round", 1) != round_num: + if calc.get("stage", 1) != stage_num: continue if not self._calc_applies_to_import(calc, single_import): continue step_type = calc["type"] - logging.info(f"Triggering '{step_type}' (Round 
{round_num}) for import '{single_import}'...") + logging.info(f"Triggering '{step_type}' (Stage {stage_num}) for import '{single_import}'...") step_jobs = [] if step_type == "PLACE_AGGREGATION" or step_type == "place": @@ -199,13 +199,13 @@ def _wait_for_jobs(self, job_ids: List[str], poll_interval: int = 5) -> None: status = status_info.get("status") if status == "DONE": - logging.info("All BigQuery jobs in round completed successfully.") + logging.info("All BigQuery jobs in stage completed successfully.") return elif status == "FAILED": error_msg = status_info.get("error", "One or more BigQuery jobs failed.") - logging.error(f"Round execution failed: {error_msg}") + logging.error(f"Stage execution failed: {error_msg}") raise RuntimeError(f"Aggregation execution failed: {error_msg}") - + time.sleep(poll_interval) def _trigger_place(self, config: Dict[str, Any], applicable_imports: List[str]) -> List[Any]: diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py index 999ca8d36..c724248d2 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py @@ -31,7 +31,7 @@ input_imports: - USFed_Census output_import: USFed_Census_AggState - round: 1 + stage: 1 place_aggregation: from_place_types: County to_place_types: State @@ -40,7 +40,7 @@ input_imports: - USFed_Census output_import: USFed_Census_StatVarAgg - round: 2 + stage: 2 stat_var_aggregation: aggregations: - ancestor_sv_id: Count_Person @@ -53,7 +53,7 @@ @patch('aggregation.orchestrator.BigQueryExecutor') class TestOrchestratorScanning(unittest.TestCase): - """Tests round scanning and active round resolution methods.""" + """Tests stage scanning and active stage resolution methods.""" def setUp(self): self.tmpdir = tempfile.TemporaryDirectory() @@ -72,20 +72,20 @@ def setUp(self): def tearDown(self): self.tmpdir.cleanup() - 
def test_get_active_rounds_for_import(self, mock_executor): - """Tests getting active rounds for matching and non-matching imports.""" - rounds = self.orchestrator.get_active_rounds_for_import("USFed_Census") - self.assertEqual(rounds, [1, 2]) + def test_get_active_stages_for_import(self, mock_executor): + """Tests getting active stages for matching and non-matching imports.""" + stages = self.orchestrator.get_active_stages_for_import("USFed_Census") + self.assertEqual(stages, [1, 2]) - rounds = self.orchestrator.get_active_rounds_for_import("OtherImport") - self.assertEqual(rounds, []) + stages = self.orchestrator.get_active_stages_for_import("OtherImport") + self.assertEqual(stages, []) @patch('aggregation.orchestrator.BigQueryExecutor') @patch('aggregation.orchestrator.PlaceAggregationGenerator') @patch('aggregation.orchestrator.StatVarAggregator') class TestOrchestratorExecution(unittest.TestCase): - """Tests round execution, verifying job submission and synchronization.""" + """Tests stage execution, verifying job submission and synchronization.""" def setUp(self): self.tmpdir = tempfile.TemporaryDirectory() @@ -105,7 +105,7 @@ def tearDown(self): self.tmpdir.cleanup() def test_run_synchronized_pipeline(self, mock_sv_agg, mock_place_gen, mock_executor_cls): - """Tests complete synchronized run pipeline for an import across rounds.""" + """Tests complete synchronized run pipeline for an import across stages.""" mock_job1 = MagicMock() mock_job1.job_id = "job-place-1" mock_place_gen.return_value.aggregate_places.return_value = mock_job1 diff --git a/pipeline/workflow/ingestion-helper/aggregation/schema.json b/pipeline/workflow/ingestion-helper/aggregation/schema.json index f6cad1f71..2d66ff0a0 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/schema.json +++ b/pipeline/workflow/ingestion-helper/aggregation/schema.json @@ -28,7 +28,7 @@ "minItems": 1 }, "output_import": { "type": "string" }, - "round": { "type": "integer" }, + "stage": { "type": 
"integer" }, "place_aggregation": { "type": "object" }, "stat_var_aggregation": { "type": "object" }, "entity_aggregation": { "type": "object" }, diff --git a/pipeline/workflow/ingestion-helper/aggregation/validator_test.py b/pipeline/workflow/ingestion-helper/aggregation/validator_test.py index 35f94ef04..01b01d4eb 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/validator_test.py +++ b/pipeline/workflow/ingestion-helper/aggregation/validator_test.py @@ -46,7 +46,7 @@ def test_validate_config_success_all_types(self): input_imports: - CensusACS5YearSurvey output_import: CensusACS5YearSurvey_AggCountry - round: 1 + stage: 1 place_aggregation: from_place_types: State to_place_types: Country @@ -77,7 +77,7 @@ def test_validate_config_success_all_types(self): - type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXDCP30 - round: 1 + stage: 1 output_import: NASA_NEXDCP30_AggrDiffStats stat_var_series_aggregation: aggr_funcs: From cb0e2da0339296b0bc5523036b5f2eb34df06148 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Thu, 2 Jul 2026 19:53:06 +0530 Subject: [PATCH 24/33] feat(aggregation): add common.yaml config, remove legacy aggregation.yaml, and update README --- .../ingestion-helper/aggregation.yaml | 17 --- .../ingestion-helper/aggregation/README.md | 138 ++++++------------ .../aggregation/configs/common.yaml | 18 +++ .../aggregation/orchestrator.py | 10 +- .../ingestion-helper/aggregation/schema.json | 5 +- 5 files changed, 73 insertions(+), 115 deletions(-) delete mode 100644 pipeline/workflow/ingestion-helper/aggregation.yaml create mode 100644 pipeline/workflow/ingestion-helper/aggregation/configs/common.yaml diff --git a/pipeline/workflow/ingestion-helper/aggregation.yaml b/pipeline/workflow/ingestion-helper/aggregation.yaml deleted file mode 100644 index b8786aa51..000000000 --- a/pipeline/workflow/ingestion-helper/aggregation.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# Data Commons Aggregation Configuration. See the README for details. 
- -aggregations: - # Generates linkedContainedInPlace, linkedMemberOf, etc. - - type: linked_edges - imports: ["*"] - stage: 1 - - # Generates summary statistics in the Cache table - - type: provenance_summary - imports: ["*"] - stage: 1 - - # Generates the Statistical Variable hierarchy/verticals - - type: stat_var_groups - imports: ["*"] - stage: 1 diff --git a/pipeline/workflow/ingestion-helper/aggregation/README.md b/pipeline/workflow/ingestion-helper/aggregation/README.md index ec61e5a40..53e96feda 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/README.md +++ b/pipeline/workflow/ingestion-helper/aggregation/README.md @@ -1,136 +1,88 @@ # Aggregations -This module orchestrates the execution of Data Commons aggregations through BigQuery Federation. The aggregations include place rollups, statistical variable aggregations, linked edges, and metadata summaries. +This module orchestrates the execution of Data Commons aggregations through BigQuery Federation. The aggregations include place rollups, statistical variable aggregations, entity aggregations, linked edges, and metadata summaries. ## Core Concepts * **Sequential Stages**: Aggregations are executed sequentially by their `stage` number (e.g., Stage 1 steps are guaranteed to complete before Stage 2 steps begin). This is useful when later steps depend on the output of earlier ones. * **Parallel Execution**: All aggregation steps configured in the same stage are executed in parallel to maximize performance. +* **Per-Import Isolation**: Aggregations run independently for each active import dataset. --- -## Configuration Guide (`aggregation.yaml`) +## Configuration Guide (`configs/*.yaml`) -The entire aggregation pipeline is configured via `aggregation.yaml`. This file defines which aggregations run, what their dependencies are, and in what order they execute. +The aggregation pipeline is configured via YAML files in the `configs/` directory (`place.yaml`, `statvar.yaml`, `common.yaml`, etc.). 
Each file defines a top-level `calculations:` list. ### Common Configuration Fields -Every step in the configuration supports these common fields: -* `type` (string, Required): The type of aggregation step to run. +Every calculation step supports these common fields: +* `type` (string, Required): The type of calculation step (e.g. `PLACE_AGGREGATION`, `STAT_VAR_AGGREGATION`, `LINKED_EDGES`). * `stage` (integer, Optional, default: 1): The sequential stage number. Steps in lower stages are guaranteed to finish before higher stages start. -* `imports` (list of strings, Required): The list of import names this step applies to. Use `["*"]` (wildcard) to apply the step to **all** imports in the current run. +* `input_imports` (list of strings, Required): The list of import names this step applies to. Use `["*"]` (wildcard) to apply the step to **all** active imports. +* `output_import` (string, Optional): The output import dataset name to write aggregated observations under. * `disabled` (boolean, Optional, default: false): Set to `true` to temporarily disable a step without deleting it. --- -### Supported Aggregation Types +### Supported Calculation Types -#### 1. Place (`place`) +#### 1. Place Aggregation (`PLACE_AGGREGATION`) Aggregates and rolls up statistical data from a smaller place type (source) to a larger place type (destination). * **Fields**: - * `source_type` (string, Required): The source place type (e.g., `County`). - * `destination_type` (string, Required): The destination place type (e.g., `State`). + * `from_place_types` (string, Required): The source place type (e.g., `State`). + * `to_place_types` (string, Required): The destination place type (e.g., `Country`). * `allow_multiple_to_places` (boolean, Optional, default: false): Allows mapping to multiple parent places if true. 
* **Example**: ```yaml - - type: place + - type: PLACE_AGGREGATION stage: 1 - imports: ["USFed_Census"] - source_type: County - destination_type: State + input_imports: + - CensusACS5YearSurvey + output_import: CensusACS5YearSurvey_AggCountry + place_aggregation: + from_place_types: State + to_place_types: Country ``` -#### 2. Statistical Variable Aggregation (`stat_var`) -Aggregates raw statistical variables into a summarized ancestor variable (e.g., summing up individual age group counts to get a total population count). +#### 2. Statistical Variable Aggregation (`STAT_VAR_AGGREGATION`) +Aggregates raw statistical variables into a summarized ancestor variable. * **Fields**: * `ancestor_sv_id` (string, Required): The ID of the parent/summary statistical variable (e.g., `Count_Person`). * `source_sv_ids` (list of strings, Required): The list of individual statistical variables to sum up. - * `output_import_name` (string, Optional): Custom import name to write output under. * `skip_all_sources_present_check` (boolean, Optional, default: false): If true, aggregates even if some source variables are missing. * **Example**: ```yaml - - type: stat_var + - type: STAT_VAR_AGGREGATION stage: 2 - imports: ["USFed_Census"] - ancestor_sv_id: Count_Person - source_sv_ids: - - Count_Person_Male - - Count_Person_Female + input_imports: + - CensusACS5YearSurvey + output_import: CensusACS5YearSurvey_StatVarAgg + stat_var_aggregation: + aggregations: + - ancestor_sv_id: Count_Person + source_sv_ids: + - Count_Person_Male + - Count_Person_Female ``` -#### 3. Linked Edges (`linked_edges`) -Constructs and aggregates structural graph links (edges) between nodes in the Data Commons graph. +#### 3. Common Aggregations (`LINKED_EDGES`, `PROVENANCE_SUMMARY`, `STAT_VAR_GROUPS`) +Common graph structure, lineage, and UI group hierarchy rollups defined in `common.yaml`. 
* **Example**: ```yaml - - type: linked_edges + - type: LINKED_EDGES stage: 1 - imports: ["*"] # Runs for all imports + input_imports: + - "*" ``` -#### 4. Provenance Summary (`provenance_summary`) -Generates metadata and provenance summaries for all aggregated statistical observations, establishing data lineage. -* **Example**: - ```yaml - - type: provenance_summary - stage: 3 - imports: ["USFed_Census"] - ``` - -#### 5. Statistical Variable Groups (`stat_var_groups`) -Aggregates and structures statistical variables into hierarchical groups for display in the Data Commons UI. -* **Example**: - ```yaml - - type: stat_var_groups - stage: 3 - imports: ["*"] - ``` - ---- - -### Example `aggregation.yaml` - -This example demonstrates a typical multi-stage aggregation workflow. - -```yaml -# aggregation.yaml -aggregations: - # Stage 1: Parallel Place Rollups and Linked Edges - - type: linked_edges - stage: 1 - imports: ["*"] - - - type: place - stage: 1 - imports: ["USFed_Census"] - source_type: County - destination_type: State - - # Stage 2: Parallel Stat Var Aggregations (Depends on Stage 1 completing) - - type: stat_var - stage: 2 - imports: ["USFed_Census"] - ancestor_sv_id: Count_Person - source_sv_ids: - - Count_Person_Male - - Count_Person_Female - - # Stage 3: Metadata and UI Summaries (Depends on Stage 2 completing) - - type: provenance_summary - stage: 3 - imports: ["USFed_Census"] - - - type: stat_var_groups - stage: 3 - imports: ["*"] -``` - --- ## Local Configuration Validation -The orchestrator strictly validates the `aggregation.yaml` file on startup against a strict JSON Schema (`schema.json`). If there is any syntax error, type mismatch, or missing required field, the service will fail to start. +The orchestrator strictly validates configuration files against `schema.json`. If there is any syntax error, type mismatch, or missing required field, validation will fail. 
### Running the Validator Locally -You can validate your `aggregation.yaml` file locally using the built-in CLI tool before committing or deploying changes. +You can validate all configuration files locally using the built-in CLI tool: 1. **Navigate to the ingestion-helper root**: ```bash @@ -138,10 +90,12 @@ You can validate your `aggregation.yaml` file locally using the built-in CLI too ``` 2. **Run the validator**: ```bash - python3 -m aggregation.validator --config ../aggregation.yaml - - # sample output... - # Validating 'aggregation.yaml' against 'schema.json'... - # [SUCCESS] Configuration is valid! - # Parsed 5 aggregation steps successfully. + python3 -m aggregation.validator + + # Sample output: + # Validating 7 configuration file(s) in 'aggregation/configs' against 'schema.json'... + # ✓ common.yaml (3 calculation steps) + # ✓ place.yaml (16 calculation steps) + # ✓ statvar.yaml (21 calculation steps) + # [SUCCESS] All 7 configuration file(s) passed validation! (64 calculation steps total) ``` diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/common.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/common.yaml new file mode 100644 index 000000000..59986da18 --- /dev/null +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/common.yaml @@ -0,0 +1,18 @@ +calculations: + # Generates linkedContainedInPlace, linkedMemberOf, etc. 
+ - type: LINKED_EDGES + input_imports: + - "*" + stage: 1 + + # Generates summary statistics in the Cache table + - type: PROVENANCE_SUMMARY + input_imports: + - "*" + stage: 1 + + # Generates the Statistical Variable hierarchy/verticals + - type: STAT_VAR_GROUPS + input_imports: + - "*" + stage: 1 diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py index 0f104ab49..33e64c7f3 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py @@ -162,15 +162,15 @@ def _dispatch_stage_steps(self, single_import: str, stage_num: int) -> List[Any] logging.info(f"Triggering '{step_type}' (Stage {stage_num}) for import '{single_import}'...") step_jobs = [] - if step_type == "PLACE_AGGREGATION" or step_type == "place": + if step_type == "PLACE_AGGREGATION": step_jobs = self._trigger_place(calc, [single_import]) - elif step_type == "STAT_VAR_AGGREGATION" or step_type == "stat_var": + elif step_type == "STAT_VAR_AGGREGATION": step_jobs = self._trigger_stat_var(calc, [single_import]) - elif step_type == "linked_edges": + elif step_type == "LINKED_EDGES": step_jobs = self._trigger_linked_edges(calc, [single_import]) - elif step_type == "provenance_summary": + elif step_type == "PROVENANCE_SUMMARY": step_jobs = self._trigger_provenance_summary(calc, [single_import]) - elif step_type == "stat_var_groups": + elif step_type == "STAT_VAR_GROUPS": step_jobs = self._trigger_stat_var_groups(calc, [single_import]) else: logging.warning( diff --git a/pipeline/workflow/ingestion-helper/aggregation/schema.json b/pipeline/workflow/ingestion-helper/aggregation/schema.json index 2d66ff0a0..8c0e85b8b 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/schema.json +++ b/pipeline/workflow/ingestion-helper/aggregation/schema.json @@ -19,7 +19,10 @@ "ENTITY_AGGREGATION", "STAT_VAR_SERIES_AGGREGATION", 
"STAT_VAR_CALCULATION", - "SUPER_ENUM_AGGREGATION" + "SUPER_ENUM_AGGREGATION", + "LINKED_EDGES", + "PROVENANCE_SUMMARY", + "STAT_VAR_GROUPS" ] }, "input_imports": { From 78d4418c6431d218fd436b619d375b304d09dca8 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Thu, 2 Jul 2026 20:23:12 +0530 Subject: [PATCH 25/33] revert(workflow): restore spanner-ingestion-workflow.yaml to origin/master --- .../workflow/spanner-ingestion-workflow.yaml | 48 +++++++------------ 1 file changed, 17 insertions(+), 31 deletions(-) diff --git a/pipeline/workflow/spanner-ingestion-workflow.yaml b/pipeline/workflow/spanner-ingestion-workflow.yaml index 4bc2d5c6e..d0a878067 100644 --- a/pipeline/workflow/spanner-ingestion-workflow.yaml +++ b/pipeline/workflow/spanner-ingestion-workflow.yaml @@ -121,52 +121,38 @@ main: run_aggregation_job: params: [import_list, helper_url] steps: - # 1. Start the aggregation and get the initial state - - initiate_aggregation: + - run_aggregation: call: http.post args: - url: ${helper_url + "/aggregation/initiate"} + url: ${helper_url + "/aggregation/run"} timeout: 300 auth: type: OIDC body: importList: ${import_list} - result: initiate_response - # 2. Store the state in a workflow variable - - assign_state: - assign: - - state: ${initiate_response.body} - # 3. State Check loop - - check_status_loop: - switch: - # Exit successfully if done - - condition: ${state.status == "SUCCEEDED"} - return: "OK" - # Raise error if failed - - condition: ${state.status == "FAILED"} - raise: ${state.error} - next: poll_and_wait - # 4. 
Sleep and Poll - - poll_and_wait: + result: aggregation_response + - check_aggregation_status_loop: steps: - - wait_step: + - wait_for_aggregation: call: sys.sleep args: seconds: 300 - # Pass the state back to the server, get the new state - - poll_server: + - check_aggregation_status: call: http.post args: - url: ${helper_url + "/aggregation/poll"} + url: ${helper_url + "/aggregation/status"} auth: type: OIDC - body: ${state} - result: poll_response - - update_state: - assign: - - state: ${poll_response.body} - next: check_status_loop - + body: + jobIds: ${aggregation_response.body.jobIds} + result: aggregation_status_response + - evaluate_aggregation_status: + switch: + - condition: ${aggregation_status_response.body.status == "DONE"} + return: 'OK' + - condition: ${aggregation_status_response.body.status == "FAILED"} + raise: ${aggregation_status_response.body.error} + next: check_aggregation_status_loop # This sub-workflow launches a Dataflow job and waits for it to complete. run_dataflow_job: From dc008b6d5ac2e66562aa006847a8be42c79bd910 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Thu, 2 Jul 2026 20:24:48 +0530 Subject: [PATCH 26/33] revert(workflow): restore spanner-ingestion-workflow.yaml and routes/aggregation.py to origin/master --- .../ingestion-helper/routes/aggregation.py | 215 +++--------------- 1 file changed, 35 insertions(+), 180 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/routes/aggregation.py b/pipeline/workflow/ingestion-helper/routes/aggregation.py index f15e8a4e1..7433f906d 100644 --- a/pipeline/workflow/ingestion-helper/routes/aggregation.py +++ b/pipeline/workflow/ingestion-helper/routes/aggregation.py @@ -13,70 +13,44 @@ # limitations under the License. 
import logging -from typing import Any, Dict, List, Optional from fastapi import APIRouter, HTTPException -from pydantic import BaseModel, Field - +from utils.aggregation import AggregationUtils import config +from typing import Any, Dict, List, Optional +from pydantic import BaseModel, Field from routes.models import BaseResponse, ResponseStatus -from aggregation import AggregationOrchestrator from utils.logging import log_start - - -class AggregationWorkflowState(BaseModel): - """Represents the execution state of a multi-stage aggregation pipeline run. - - This state object is passed back and forth between the client (Google Cloud - Workflows) and the helper service endpoints to durably maintain the progress - of a stateless, sequential aggregation run across multiple stages. - """ - status: str = Field(..., description="Overall status of the run: RUNNING, SUCCEEDED, FAILED") - current_stage: int = Field(..., description="The stage currently executing") - active_job_ids: List[str] = Field(default_factory=list, description="BQ job IDs running in the current stage") - import_list: List[Dict[str, Any]] = Field(default_factory=list, description="Original list of imports") - error: Optional[str] = Field(default=None, description="Detailed error message if failed") - -class InitiateRequest(BaseModel): - importList: List[Dict[str, Any]] = Field(default_factory=list) - -# TODO: Remove AggregationRequest once all consumers migrate to /initiate and /poll class AggregationRequest(BaseModel): - """Temporary request model for compatibility run endpoint.""" importList: List[Dict[str, Any]] = Field(default_factory=list) - -# TODO: Remove AggregationStatusRequest once all consumers migrate to /initiate and /poll class AggregationStatusRequest(BaseModel): - """Temporary request model for compatibility status endpoint.""" jobIds: List[str] = Field(default_factory=list) - -# TODO: Remove AggregationResponse once all consumers migrate to /initiate and /poll class 
AggregationResponse(BaseResponse): - """Temporary response model for compatibility run endpoint.""" jobIds: List[str] = Field(default_factory=list, description="BigQuery job IDs submitted for async aggregation") - -# TODO: Remove AggregationStatusResponse once all consumers migrate to /initiate and /poll class AggregationStatusResponse(BaseResponse): - """Temporary response model for compatibility status endpoint.""" error: Optional[str] = Field(default=None, description="Detailed error message if failed") failedJobs: Optional[List[str]] = Field(default_factory=list, description="List of failed BigQuery job IDs") - - router = APIRouter(prefix="/aggregation", tags=["aggregation"]) - -def _get_orchestrator() -> AggregationOrchestrator: - """Helper to initialize the orchestrator using global config.""" +@router.post("/run", response_model=AggregationResponse) +@log_start +def run_aggregation(req: AggregationRequest): + """Runs aggregation logic asynchronously for the specified imports, returning BigQuery job IDs.""" + if not req.importList: + logging.info("Empty import list. 
Skipping aggregation.") + return AggregationResponse(status=ResponseStatus.SUBMITTED, jobIds=[]) + if not all([config.SPANNER_CONNECTION_ID, config.SPANNER_PROJECT_ID, config.SPANNER_INSTANCE_ID, config.SPANNER_GRAPH_DATABASE_ID]): raise HTTPException( status_code=400, detail="Missing required configuration environment variables: SPANNER_CONNECTION_ID, SPANNER_PROJECT_ID, SPANNER_INSTANCE_ID, or SPANNER_GRAPH_DATABASE_ID" ) - return AggregationOrchestrator( + + aggregation = AggregationUtils( connection_id=config.SPANNER_CONNECTION_ID, project_id=config.SPANNER_PROJECT_ID, instance_id=config.SPANNER_INSTANCE_ID, @@ -84,158 +58,39 @@ def _get_orchestrator() -> AggregationOrchestrator: location=config.LOCATION, is_base_dc=config.IS_BASE_DC, ) - - - -@router.post("/initiate", response_model=AggregationWorkflowState) -@log_start -def initiate_aggregation(req: InitiateRequest): - """Initiates the aggregation run by executing Stage 1 and returning the initial state.""" - if not req.importList: - logging.info("Empty import list. Skipping aggregation.") - return AggregationWorkflowState(status="SUCCEEDED", current_stage=0, active_job_ids=[], import_list=[]) - try: - orchestrator = _get_orchestrator() - import_names = [item.get('importName') for item in req.importList if item.get('importName')] - - active_stages = orchestrator.get_active_stages(import_names) - if not active_stages: - logging.info("No stages have active aggregations for the current imports. 
Completing immediately.") - return AggregationWorkflowState(status="SUCCEEDED", current_stage=0, active_job_ids=[], import_list=req.importList) - - first_stage = active_stages[0] - - logging.info(f"Initiating aggregation at Stage {first_stage}") - job_ids = orchestrator.execute_stage(first_stage, import_names) - - return AggregationWorkflowState( - status="RUNNING", - current_stage=first_stage, - active_job_ids=job_ids, - import_list=req.importList - ) - except Exception as e: - logging.error(f"Failed to initiate aggregation: {e}") - raise HTTPException(status_code=500, detail=f"Failed to initiate aggregation: {str(e)}") - - -@router.post("/poll", response_model=AggregationWorkflowState) -@log_start -def poll_aggregation(state: AggregationWorkflowState): - """Checks progress of active jobs and transitions to the next stage if complete.""" - if state.status != "RUNNING": - return state # Already in a terminal state - - try: - orchestrator = _get_orchestrator() - import_names = [item.get('importName') for item in state.import_list if item.get('importName')] - - # 1. 
Check status of active jobs in BigQuery - if not state.active_job_ids: - bq_status = {"status": "DONE"} - else: - logging.info(f"Polling status for jobs in Stage {state.current_stage}: {state.active_job_ids}") - bq_status = orchestrator.check_jobs_status(state.active_job_ids) - - # Case A: Any job failed - if bq_status["status"] == "FAILED": - logging.error(f"Stage {state.current_stage} failed with error: {bq_status.get('error')}") - return AggregationWorkflowState( - status="FAILED", - current_stage=state.current_stage, - active_job_ids=[], - import_list=state.import_list, - error=bq_status.get("error") - ) - - # Case B: Jobs are still executing (explicitly check for DONE to transition) - if bq_status["status"] != "DONE": - logging.info(f"Stage {state.current_stage} is still executing (status: {bq_status['status']}).") - return state # Return unchanged - - # Case C: All jobs succeeded -> Find and execute the next active stage - active_stages = orchestrator.get_active_stages(import_names) - next_stages = [s for s in active_stages if s > state.current_stage] - - if next_stages: - next_stage = next_stages[0] - logging.info(f"Stage {state.current_stage} completed. 
Transitioning to Stage {next_stage}...") - new_job_ids = orchestrator.execute_stage(next_stage, import_names) - return AggregationWorkflowState( - status="RUNNING", - current_stage=next_stage, - active_job_ids=new_job_ids, - import_list=state.import_list - ) - - # If we exit the loop, there are no more active stages left - logging.info("All aggregation stages completed successfully!") - return AggregationWorkflowState( - status="SUCCEEDED", - current_stage=state.current_stage, - active_job_ids=[], - import_list=state.import_list - ) - - except Exception as e: - logging.error(f"Error during polling: {e}") - return AggregationWorkflowState( - status="FAILED", - current_stage=state.current_stage, - active_job_ids=[], - import_list=state.import_list, - error=f"Orchestrator error: {str(e)}" - ) - -# TODO: Remove the /run endpoint once all consumers migrate to /initiate and /poll -@router.post("/run", response_model=AggregationResponse, deprecated=True) -@log_start -def run_aggregation(req: AggregationRequest): - """Temporary endpoint. Runs ALL enabled aggregations in parallel (ignores stages). - - Please migrate to /initiate and /poll endpoints. - """ - if not req.importList: - logging.info("Empty import list. 
Skipping temporary aggregation.") - return AggregationResponse(status=ResponseStatus.SUBMITTED, jobIds=[]) - - try: - orchestrator = _get_orchestrator() - import_names = [item.get('importName') for item in req.importList if item.get('importName')] - - # Compatibility Mode: Submit ALL enabled stages in parallel - job_ids = [] - active_stages = orchestrator.get_active_stages(import_names) - for stage_num in active_stages: - job_ids.extend(orchestrator.execute_stage(stage_num, import_names)) - + job_ids = aggregation.run_aggregation(req.importList) return AggregationResponse(status=ResponseStatus.SUBMITTED, jobIds=job_ids) except Exception as e: - logging.error(f"Temporary aggregation failed: {e}") - raise HTTPException(status_code=500, detail=f"Temporary aggregation failed: {str(e)}") + raise HTTPException(status_code=500, detail=f"Aggregation failed: {str(e)}") - -# TODO: Remove the /status endpoint once all consumers migrate to /initiate and /poll -@router.post("/status", response_model=AggregationStatusResponse, deprecated=True) -@log_start -def get_aggregation_status(req: AggregationStatusRequest): - """Temporary endpoint. Checks the status of the submitted BigQuery jobs. - - Please migrate to /initiate and /poll endpoints. - """ +@router.post("/status", response_model=AggregationStatusResponse) +def check_aggregation_status(req: AggregationStatusRequest): + """Checks the status of the submitted aggregation BigQuery jobs.""" if not req.jobIds: logging.info("Empty jobIds. Returning status DONE.") return AggregationStatusResponse(status=ResponseStatus.DONE) + if not all([config.SPANNER_CONNECTION_ID, config.SPANNER_PROJECT_ID, config.SPANNER_INSTANCE_ID, config.SPANNER_GRAPH_DATABASE_ID]): + raise HTTPException( + status_code=400, + detail="Missing required configuration environment variables." 
+ ) + + aggregation = AggregationUtils( + connection_id=config.SPANNER_CONNECTION_ID, + project_id=config.SPANNER_PROJECT_ID, + instance_id=config.SPANNER_INSTANCE_ID, + database_id=config.SPANNER_GRAPH_DATABASE_ID, + location=config.LOCATION, + is_base_dc=config.IS_BASE_DC, + ) try: - orchestrator = _get_orchestrator() - status_info = orchestrator.check_jobs_status(req.jobIds) + status_info = aggregation.check_aggregation_status(req.jobIds) return AggregationStatusResponse( status=ResponseStatus.from_str(status_info.get("status", "ERROR")), error=status_info.get("error"), - failedJobs=status_info.get("failed_jobs", []) + failedJobs=status_info.get("failedJobs", []) ) except Exception as e: - logging.error(f"Temporary check status failed: {e}") - raise HTTPException(status_code=500, detail=f"Temporary check status failed: {str(e)}") + raise HTTPException(status_code=500, detail=f"Aggregation status check failed: {str(e)}") From 086db67b638f10155d722e49d80ed04dfb9b97e5 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Thu, 2 Jul 2026 20:42:13 +0530 Subject: [PATCH 27/33] feat(aggregation): add informative name property to all calculation steps across YAML configs --- .../aggregation/configs/common.yaml | 9 ++- .../aggregation/configs/entity.yaml | 26 ++++---- .../aggregation/configs/place.yaml | 57 +++++++++------- .../aggregation/configs/statvar.yaml | 65 ++++++++++++------- .../configs/statvar_calculation.yaml | 6 +- .../aggregation/configs/statvar_series.yaml | 30 ++++++--- .../aggregation/configs/super_enum.yaml | 15 +++-- .../aggregation/orchestrator.py | 3 +- .../ingestion-helper/aggregation/schema.json | 1 + 9 files changed, 132 insertions(+), 80 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/common.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/common.yaml index 59986da18..a066c3f0d 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/common.yaml +++ 
b/pipeline/workflow/ingestion-helper/aggregation/configs/common.yaml @@ -1,18 +1,21 @@ calculations: # Generates linkedContainedInPlace, linkedMemberOf, etc. - - type: LINKED_EDGES + - name: "Global: ContainedInPlace & MemberOf Graph Linked Edges" + type: LINKED_EDGES input_imports: - "*" stage: 1 # Generates summary statistics in the Cache table - - type: PROVENANCE_SUMMARY + - name: "Global: Cache Provenance & Lineage Summary" + type: PROVENANCE_SUMMARY input_imports: - "*" stage: 1 # Generates the Statistical Variable hierarchy/verticals - - type: STAT_VAR_GROUPS + - name: "Global: StatVar Group Hierarchy & Verticals" + type: STAT_VAR_GROUPS input_imports: - "*" stage: 1 diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/entity.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/entity.yaml index dcf30500d..4abb8737c 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/entity.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/entity.yaml @@ -1,7 +1,8 @@ calculations: # Earthquakes - - type: ENTITY_AGGREGATION + - name: "EarthquakeUSGS: Earthquake Event Aggregation" + type: ENTITY_AGGREGATION output_import: EarthquakeUSGS_Agg input_imports: - EarthquakeUSGS @@ -30,7 +31,8 @@ calculations: - "magnitude: [8 9 M]" # Fires - - type: ENTITY_AGGREGATION + - name: "FireFAMWEB: Wildland Fire Event Aggregation" + type: ENTITY_AGGREGATION output_import: FireFAMWEB_Agg input_imports: - FireFAMWEB @@ -43,7 +45,8 @@ calculations: agg_date_formats: - YYYY - - type: ENTITY_AGGREGATION + - name: "FireWFIGS: Fire Incident Event Aggregation" + type: ENTITY_AGGREGATION output_import: FireWFIGS_Agg input_imports: - FireWFIGS @@ -59,7 +62,8 @@ calculations: - YYYY-MM - YYYY - - type: ENTITY_AGGREGATION + - name: "NASA_VIIRSActiveFiresEvents: Fire Event S2 Cell Aggregation" + type: ENTITY_AGGREGATION output_import: NASA_VIIRSActiveFiresEvents_Agg # Aggregate event counts for S2-cells of level 13 input_imports: @@ -75,7 +79,8 @@ 
calculations: - YYYY-MM # Storms - - type: ENTITY_AGGREGATION + - name: "StormNOAA: Storm & Extreme Weather Event Aggregation" + type: ENTITY_AGGREGATION output_import: StormNOAA_Agg input_imports: - StormNOAA @@ -86,11 +91,6 @@ calculations: agg_date_formats: - YYYY - YYYY-MM - # The following query is used to get all the types: - # - # SELECT DISTINCT object_id - # FROM `datcom-store.dc_kg_latest.Triple` - # WHERE prov_id='dc/svnv9g3' AND predicate = 'typeOf' entity_types: - MarineHighWindEvent - HighWindEvent @@ -156,7 +156,8 @@ calculations: - StormEpisode # Floods - - type: ENTITY_AGGREGATION + - name: "DynamicWorld_FloodEvents: Flood Event S2 Cell Aggregation" + type: ENTITY_AGGREGATION output_import: DynamicWorld_FloodEvents_Agg # Aggregate event counts for S2-cells of level 13 input_imports: @@ -172,7 +173,8 @@ calculations: - YYYY-MM # Heat/Cold Temperature Events counts - - type: ENTITY_AGGREGATION + - name: "TemperatureEvents: Heat & Cold Event Aggregation" + type: ENTITY_AGGREGATION output_import: TemperatureEvents_Agg input_imports: - TemperatureEvents diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml index cf90fb7bb..23943f141 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml @@ -1,5 +1,6 @@ calculations: - - type: PLACE_AGGREGATION + - name: "CensusACS5YearSurvey: State -> Country Place Rollup" + type: PLACE_AGGREGATION input_imports: - CensusACS5YearSurvey place_aggregation: @@ -8,7 +9,8 @@ calculations: stage: 1 output_import: CensusACS5YearSurvey_AggCountry - - type: PLACE_AGGREGATION + - name: "CensusSAHIE: State -> Country Place Rollup" + type: PLACE_AGGREGATION input_imports: - CensusSAHIE place_aggregation: @@ -17,7 +19,8 @@ calculations: stage: 1 output_import: CensusSAHIE_AggCountry - - type: PLACE_AGGREGATION + - name: "CDCMortality: County -> 
State Place Rollup" + type: PLACE_AGGREGATION input_imports: - CDCMortality place_aggregation: @@ -26,7 +29,8 @@ calculations: stage: 1 output_import: CDCMortality_AggState - - type: PLACE_AGGREGATION + - name: "CDCMortality: State -> Country Place Rollup" + type: PLACE_AGGREGATION input_imports: - CDCMortality_AggState place_aggregation: @@ -35,16 +39,8 @@ calculations: stage: 2 output_import: CDCMortality_AggState_AggCountry - # - type: PLACE_AGGREGATION - # input_imports: - # - FBIGovCrime - # place_aggregation: - # from_place_types: State - # to_place_types: Country - # stage: 1 - # output_import: FBIGovCrime_AggCountry - - - type: PLACE_AGGREGATION + - name: "DEA_ARCOS: County -> State Place Rollup" + type: PLACE_AGGREGATION input_imports: - DEA_ARCOS place_aggregation: @@ -53,7 +49,8 @@ calculations: stage: 1 output_import: DEA_ARCOS_AggState - - type: PLACE_AGGREGATION + - name: "DEA_ARCOS: State -> Country Place Rollup" + type: PLACE_AGGREGATION input_imports: - DEA_ARCOS_AggState place_aggregation: @@ -62,7 +59,8 @@ calculations: stage: 2 output_import: DEA_ARCOS_AggState_AggCountry - - type: PLACE_AGGREGATION + - name: "EPA_EJSCREEN: CensusBlockGroup -> CensusTract Place Rollup" + type: PLACE_AGGREGATION input_imports: - EPA_EJSCREEN place_aggregation: @@ -71,7 +69,8 @@ calculations: stage: 1 output_import: EPA_EJSCREEN_AggCensusTract - - type: PLACE_AGGREGATION + - name: "EPA_EJSCREEN: CensusTract -> County Place Rollup" + type: PLACE_AGGREGATION input_imports: - EPA_EJSCREEN_AggCensusTract place_aggregation: @@ -80,7 +79,8 @@ calculations: stage: 2 output_import: EPA_EJSCREEN_AggCensusTract_AggCounty - - type: PLACE_AGGREGATION + - name: "DeepSolar: CensusBlockGroup -> CensusTract Place Rollup" + type: PLACE_AGGREGATION input_imports: - DeepSolar place_aggregation: @@ -89,7 +89,8 @@ calculations: stage: 1 output_import: DeepSolar_AggCensusTract - - type: PLACE_AGGREGATION + - name: "DeepSolar: CensusTract -> County Place Rollup" + type: 
PLACE_AGGREGATION input_imports: - DeepSolar_AggCensusTract place_aggregation: @@ -98,7 +99,8 @@ calculations: stage: 2 output_import: DeepSolar_AggCensusTract_AggCounty - - type: PLACE_AGGREGATION + - name: "EPA_GHGRP: EpaReportingFacility -> County Place Rollup" + type: PLACE_AGGREGATION input_imports: - EPA_GHGRP place_aggregation: @@ -107,7 +109,8 @@ calculations: stage: 1 output_import: EPA_GHGRP_AggCounty - - type: PLACE_AGGREGATION + - name: "EPA_GHGRP: EpaReportingFacility -> CensusZipCodeTabulationArea Place Rollup" + type: PLACE_AGGREGATION input_imports: - EPA_GHGRP place_aggregation: @@ -116,7 +119,8 @@ calculations: stage: 1 output_import: EPA_GHGRP_AggCensusZipCodeTabulationArea - - type: PLACE_AGGREGATION + - name: "EPA_GHGRP: County -> State Place Rollup" + type: PLACE_AGGREGATION input_imports: - EPA_GHGRP_AggCounty place_aggregation: @@ -125,7 +129,8 @@ calculations: stage: 2 output_import: EPA_GHGRP_AggCounty_AggState - - type: PLACE_AGGREGATION + - name: "RFF_USGridGeo: GeoGrid -> County Place Rollup" + type: PLACE_AGGREGATION input_imports: - RFF_USGridGeo_WeatherVariabilityForecast place_aggregation: @@ -135,7 +140,8 @@ calculations: stage: 1 output_import: RFF_USGridGeo_WeatherVariabilityForecast_AggCounty - - type: PLACE_AGGREGATION + - name: "India_RBIStateDomesticProduct: State -> Country Place Rollup" + type: PLACE_AGGREGATION input_imports: - India_RBIStateDomesticProduct place_aggregation: @@ -145,7 +151,8 @@ calculations: stage: 1 output_import: India_RBIStateDomesticProduct_AggCountry - - type: PLACE_AGGREGATION + - name: "India_RBIStateDomesticProduct: StatVarAgg State -> Country Place Rollup" + type: PLACE_AGGREGATION input_imports: - India_RBIStateDomesticProduct_StatVarAgg place_aggregation: diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml index c25e7fae2..9fd864a48 100644 --- 
a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml @@ -4,7 +4,8 @@ calculations: # Health Insurance Coverage # ------------------------- # - - type: STAT_VAR_AGGREGATION + - name: "CensusACS5YearSurvey: Health Insurance SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: CensusACS5YearSurvey_HealthInsurance_StatVarAgg input_imports: - CensusACS5YearSurvey @@ -271,7 +272,8 @@ calculations: # NOTE: Values SpeakEnglishLessThanVeryWell and SpeakEnglishVeryWell already # have population values from a different table. # - - type: STAT_VAR_AGGREGATION + - name: "CensusACS5YearSurvey: Ability To Speak English SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: CensusACS5YearSurvey_AbilityToSpeakEnglish_StatVarAgg input_imports: - CensusACS5YearSurvey @@ -318,7 +320,8 @@ calculations: # aggregate to total and by gender. # (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B23001) # - - type: STAT_VAR_AGGREGATION + - name: "CensusACS5YearSurvey: In Armed Forces SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: CensusACS5YearSurvey_InArmedForces_StatVarAgg input_imports: - CensusACS5YearSurvey @@ -378,7 +381,8 @@ calculations: - dc/2zj8jthd4f563 # Education - CensusACS5YearSurvey - - type: STAT_VAR_AGGREGATION + - name: "CensusACS5YearSurvey: Education SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: CensusACS5YearSurvey_Education_StatVarAgg input_imports: - CensusACS5YearSurvey @@ -593,8 +597,9 @@ calculations: - dc/dc9v9h3q8l8n7 # Education - ACSED5YrSurvey - - type: STAT_VAR_AGGREGATION - output_import: ACSED5YrSurvey_StatVarAgg + - name: "ACSED5YrSurvey: Education SV Aggregation" + type: STAT_VAR_AGGREGATION + output_import: ACSED5YrSurvey_StatVarAgg input_imports: - ACSED5YrSurvey stat_var_aggregation: @@ -621,7 +626,8 @@ calculations: - dc/bstxmnb4k1wrb # Demographics - - type: STAT_VAR_AGGREGATION + - name:
"CensusACS5YearSurvey: Demographics SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: CensusACS5YearSurvey_Demographics_StatVarAgg input_imports: - CensusACS5YearSurvey @@ -638,7 +644,8 @@ calculations: - Count_Person_0To4Years_Female # Agriculture - - type: STAT_VAR_AGGREGATION + - name: "USDA_AgricultureCensus: Agriculture Producer SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: USDA_AgricultureCensus_Agriculture_StatVarAgg input_imports: - USDA_AgricultureCensus @@ -659,7 +666,8 @@ calculations: # Employment # ------------------------ - - type: STAT_VAR_AGGREGATION + - name: "CensusACS5YearSurvey: Employment & Industry SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: CensusACS5YearSurvey_Employment_StatVarAgg input_imports: - CensusACS5YearSurvey @@ -849,7 +857,8 @@ calculations: # } # Crime - - type: STAT_VAR_AGGREGATION + - name: "USNationalPrisonerStatistics: Crime & Correctional Facility SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: USNationalPrisonerStatistics_Crime_StatVarAgg input_imports: - USNationalPrisonerStatistics @@ -861,7 +870,8 @@ calculations: - dc/91vy0sf20wlg9 # WithOwnChildrenUnder18. - - type: STAT_VAR_AGGREGATION + - name: "CensusACS5YearSurvey_SubjectTables_S1251: Children & Household SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: CensusACS5YearSurvey_SubjectTables_S1251_StatVarAgg input_imports: - CensusACS5YearSurvey_SubjectTables_S1251 @@ -881,7 +891,8 @@ calculations: - Count_Person_WithOwnChildrenUnder18_Male # Marriage - - type: STAT_VAR_AGGREGATION + - name: "CensusACS5YearSurvey_SubjectTables_S1201: Marital Status SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: CensusACS5YearSurvey_SubjectTables_S1201_StatVarAgg input_imports: - CensusACS5YearSurvey_SubjectTables_S1201 @@ -905,7 +916,8 @@ calculations: - Count_Person_InLaborForce_Male_Widowed # Employment by business ownership type. 
- - type: STAT_VAR_AGGREGATION + - name: "CensusACS5YearSurvey_SubjectTables_S2408: Business Ownership Employment SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: CensusACS5YearSurvey_SubjectTables_S2408_StatVarAgg input_imports: - CensusACS5YearSurvey_SubjectTables_S2408 @@ -921,7 +933,8 @@ calculations: - Count_Person_PrivatelyOwnedForProfitEstablishment_Female_PaidWorker # US Citizen by Naturalization - - type: STAT_VAR_AGGREGATION + - name: "CensusACS5YearSurvey_SubjectTables_S0504: Naturalized Citizenship SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: CensusACS5YearSurvey_SubjectTables_S0504_StatVarAgg input_imports: - CensusACS5YearSurvey_SubjectTables_S0504 @@ -1051,7 +1064,8 @@ calculations: - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn # HousingUnit HomeValue. - - type: STAT_VAR_AGGREGATION + - name: "CensusACS5YearSurvey: Home Value SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: CensusACS5YearSurvey_HousingUnit_HomeValue_StatVarAgg input_imports: - CensusACS5YearSurvey @@ -1101,7 +1115,8 @@ calculations: - Count_HousingUnit_HomeValue750000To999999USDollar skip_all_sources_present_check: true - - type: STAT_VAR_AGGREGATION + - name: "CensusACS5YearSurvey: Person Age Groups SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: CensusACS5YearSurvey_Person_Age_StatVarAgg input_imports: - CensusACS5YearSurvey @@ -1116,7 +1131,8 @@ calculations: - Count_Person_62To64Years skip_all_sources_present_check: true - - type: STAT_VAR_AGGREGATION + - name: "CensusACS5YearSurvey: Individual Income SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: CensusACS5YearSurvey_Income_StatVarAgg input_imports: - CensusACS5YearSurvey @@ -1307,7 +1323,8 @@ calculations: # Population: Male, 10,000 - 12,499 USD, White Alone Not Hispanic or Latino, Worked Full Time - dc/5jp07brw3g26h - - type: STAT_VAR_AGGREGATION + - name: "IndiaNSS_HealthAilments: Health Ailments SV Aggregation" + type: STAT_VAR_AGGREGATION
output_import: IndiaNSS_HealthAilments_StatVarAgg input_imports: - IndiaNSS_HealthAilments @@ -1618,7 +1635,8 @@ calculations: - Count_MedicalConditionIncident_70OrMoreYears_Male_ConditionSkinDisease_AsAFractionOf_Count_Person skip_all_sources_present_check: true - - type: STAT_VAR_AGGREGATION + - name: "OECDRegionalDemography: Person Age Groups SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: OECDRegionalDemography_Person_Age_StatVarAgg input_imports: - OECDRegionalDemography_Population @@ -1703,7 +1721,8 @@ calculations: - Count_Person_80OrMoreYears skip_all_sources_present_check: true - - type: STAT_VAR_AGGREGATION + - name: "CensusACS5YearSurvey_SubjectTables_S0801: Work Commute SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: CensusACS5YearSurvey_SubjectTables_S0801_StatVarAgg input_imports: - CensusACS5YearSurvey_SubjectTables_S0801 @@ -1819,7 +1838,8 @@ calculations: - Count_Person_Years16Onwards_35To44Minute_WorkCommute_Employed_Female_WorkedOutsideOfHome - Count_Person_Years16Onwards_45To59Minute_WorkCommute_Employed_Female_WorkedOutsideOfHome - Count_Person_Years16Onwards_60OrMoreMinute_WorkCommute_Employed_Female_WorkedOutsideOfHome - - type: STAT_VAR_AGGREGATION + - name: "CensusACS5YearSurvey_SubjectTables_S0804: Means of Transportation Work Commute SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: CensusACS5YearSurvey_SubjectTables_S0804_StatVarAgg input_imports: - CensusACS5YearSurvey_SubjectTables_S0804 @@ -1858,7 +1878,8 @@ calculations: - Count_Person_Years16Onwards_CarTruckOrVanCarpooled_35To44Minute_WorkCommute_Employed_WorkedOutsideOfHome - Count_Person_Years16Onwards_CarTruckOrVanCarpooled_45To59Minute_WorkCommute_Employed_WorkedOutsideOfHome - Count_Person_Years16Onwards_CarTruckOrVanCarpooled_60OrMoreMinute_WorkCommute_Employed_WorkedOutsideOfHome - - type: STAT_VAR_AGGREGATION + - name: "India_RBIStateDomesticProduct: Gross Value Added SV Aggregation" + type: STAT_VAR_AGGREGATION output_import: 
India_RBIStateDomesticProduct_StatVarAgg input_imports: - India_RBIStateDomesticProduct diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml index 12408e433..7602fb587 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml @@ -1,7 +1,8 @@ calculations: # Energy. - - type: STAT_VAR_CALCULATION + - name: "EIA_Electricity: Annual Emissions Per Capita StatVar Calculation" + type: STAT_VAR_CALCULATION input_imports: - EPA_GHGRP_AggCounty - EPA_GHGRP_AggCensusZipCodeTabulationArea @@ -39,7 +40,8 @@ calculations: # note that the output uses sv_prefix and measurement_method_prefix which is not # the case above, for example. Whenever sv and measurement_method are set in the # output, they are preferred. Otherwise, the *_prefix is used. - - type: STAT_VAR_CALCULATION + - name: "Climate: NASA CMIP6 Temperature Modeling Diffs Calculation" + type: STAT_VAR_CALCULATION input_imports: - NASA_NEXGDDP_CMIP6_Subnational - NASA_NEXGDDP_CMIP6_IpccPlaces50 diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml index c565076d9..f8ef1793c 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml @@ -1,5 +1,6 @@ calculations: - - type: STAT_VAR_SERIES_AGGREGATION + - name: "NASA_NEXDCP30: Measurement Methods & Base Date Diff Series Aggregation" + type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXDCP30 stage: 1 @@ -13,7 +14,8 @@ calculations: - "1990" - "2006" - - type: STAT_VAR_SERIES_AGGREGATION + - name: "NASA_NEXGDDP_Subnational: Measurement Methods & Base Date Diff Series Aggregation" + type: 
STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_Subnational stage: 1 @@ -29,7 +31,8 @@ calculations: - "1990" - "2006" - - type: STAT_VAR_SERIES_AGGREGATION + - name: "NASA_NEXGDDP_Country: Measurement Methods & Base Date Diff Series Aggregation" + type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_Country stage: 1 @@ -43,7 +46,8 @@ calculations: - "1990" - "2006" - - type: STAT_VAR_SERIES_AGGREGATION + - name: "NASA_NEXGDDP_CMIP6_Subnational: Measurement Methods & Base Date Diff Series Aggregation" + type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_Subnational stage: 1 @@ -59,7 +63,8 @@ calculations: - "1990" - "2006" - - type: STAT_VAR_SERIES_AGGREGATION + - name: "NASA_NEXGDDP_CMIP6_IpccPlaces50: Measurement Methods & Base Date Diff Series Aggregation" + type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_IpccPlaces50 stage: 1 @@ -75,7 +80,8 @@ calculations: - "1990" - "2006" - - type: STAT_VAR_SERIES_AGGREGATION + - name: "NASA_NEXGDDP_CMIP6_Subnational: Stats Across Models Series Aggregation" + type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_Subnational stage: 2 @@ -89,7 +95,8 @@ calculations: - OPERATOR_PERCENTILE90 - OPERATOR_PERCENTILE10 - - type: STAT_VAR_SERIES_AGGREGATION + - name: "NASA_NEXGDDP_CMIP6_IpccPlaces50: Stats Across Models Series Aggregation" + type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_IpccPlaces50 stage: 2 @@ -103,7 +110,8 @@ calculations: - OPERATOR_PERCENTILE90 - OPERATOR_PERCENTILE10 - - type: STAT_VAR_SERIES_AGGREGATION + - name: "NASA_NEXGDDP_CMIP6_Subnational: Over Time Stats Across Models Series Aggregation" + type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_Subnational_AggrStatsAcrossModels stage: 3 @@ -206,7 +214,8 @@ calculations: comparison: OPERATOR_LE output_cprop: minTemperature - - type: STAT_VAR_SERIES_AGGREGATION + - name: "NASA_NEXGDDP_CMIP6_IpccPlaces50: Over Time Stats Across Models Series 
Aggregation" + type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrStatsAcrossModels stage: 3 @@ -309,7 +318,8 @@ calculations: comparison: OPERATOR_LE output_cprop: minTemperature - - type: STAT_VAR_SERIES_AGGREGATION + - name: "NASA_NEXGDDP_IpccPlaces50: Measurement Methods & Base Date Diff Series Aggregation" + type: STAT_VAR_SERIES_AGGREGATION input_imports: - NASA_NEXGDDP_IpccPlaces50 stage: 1 diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/super_enum.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/super_enum.yaml index 661d51dcf..cd62f1877 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/super_enum.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/super_enum.yaml @@ -1,25 +1,30 @@ calculations: - - type: SUPER_ENUM_AGGREGATION + - name: "CensusACS5YearSurvey: Super Enum Mapping Aggregation" + type: SUPER_ENUM_AGGREGATION input_imports: - CensusACS5YearSurvey output_import: CensusACS5YearSurvey_SuperEnum - - type: SUPER_ENUM_AGGREGATION + - name: "CensusACS5YearSurvey_AggCountry: Super Enum Mapping Aggregation" + type: SUPER_ENUM_AGGREGATION input_imports: - CensusACS5YearSurvey_AggCountry output_import: CensusACS5YearSurvey_AggCountry_SuperEnum - - type: SUPER_ENUM_AGGREGATION + - name: "NCES_PrivateSchoolStats: Super Enum Mapping Aggregation" + type: SUPER_ENUM_AGGREGATION input_imports: - NCES_PrivateSchoolStats output_import: NCES_PrivateSchoolStats_SuperEnum - - type: SUPER_ENUM_AGGREGATION + - name: "NCES_PublicSchoolStats: Super Enum Mapping Aggregation" + type: SUPER_ENUM_AGGREGATION input_imports: - NCES_PublicSchoolStats output_import: NCES_PublicSchoolStats_SuperEnum - - type: SUPER_ENUM_AGGREGATION + - name: "NCES_SchoolDistrictStats: Super Enum Mapping Aggregation" + type: SUPER_ENUM_AGGREGATION input_imports: - NCES_SchoolDistrictStats output_import: NCES_SchoolDistrictStats_SuperEnum diff --git 
a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py index 33e64c7f3..52f9de8a8 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py @@ -159,7 +159,8 @@ def _dispatch_stage_steps(self, single_import: str, stage_num: int) -> List[Any] continue step_type = calc["type"] - logging.info(f"Triggering '{step_type}' (Stage {stage_num}) for import '{single_import}'...") + calc_name = calc.get("name") or step_type + logging.info(f"Triggering step: '{calc_name}' (Stage {stage_num}) for import '{single_import}'...") step_jobs = [] if step_type == "PLACE_AGGREGATION": diff --git a/pipeline/workflow/ingestion-helper/aggregation/schema.json b/pipeline/workflow/ingestion-helper/aggregation/schema.json index 8c0e85b8b..ab59d0f15 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/schema.json +++ b/pipeline/workflow/ingestion-helper/aggregation/schema.json @@ -11,6 +11,7 @@ "type": "object", "required": ["type", "input_imports"], "properties": { + "name": { "type": "string" }, "type": { "type": "string", "enum": [ From fc4f2c44ffe605ee2863a0a6aa12274f42039d03 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Thu, 2 Jul 2026 20:50:52 +0530 Subject: [PATCH 28/33] style(aggregation): reorder keys in YAML configs for consistent field structure --- .../aggregation/configs/common.yaml | 6 +- .../aggregation/configs/entity.yaml | 14 ++-- .../aggregation/configs/place.yaml | 64 +++++++++---------- .../aggregation/configs/statvar.yaml | 42 ++++++------ .../aggregation/configs/statvar_series.yaml | 20 +++--- 5 files changed, 73 insertions(+), 73 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/common.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/common.yaml index a066c3f0d..27cc90dac 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/common.yaml +++ 
b/pipeline/workflow/ingestion-helper/aggregation/configs/common.yaml @@ -2,20 +2,20 @@ calculations: # Generates linkedContainedInPlace, linkedMemberOf, etc. - name: "Global: ContainedInPlace & MemberOf Graph Linked Edges" type: LINKED_EDGES + stage: 1 input_imports: - "*" - stage: 1 # Generates summary statistics in the Cache table - name: "Global: Cache Provenance & Lineage Summary" type: PROVENANCE_SUMMARY + stage: 1 input_imports: - "*" - stage: 1 # Generates the Statistical Variable hierarchy/verticals - name: "Global: StatVar Group Hierarchy & Verticals" type: STAT_VAR_GROUPS + stage: 1 input_imports: - "*" - stage: 1 diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/entity.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/entity.yaml index 4abb8737c..0f970c8c6 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/entity.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/entity.yaml @@ -3,9 +3,9 @@ calculations: # Earthquakes - name: "EarthquakeUSGS: Earthquake Event Aggregation" type: ENTITY_AGGREGATION - output_import: EarthquakeUSGS_Agg input_imports: - EarthquakeUSGS + output_import: EarthquakeUSGS_Agg entity_aggregation: entity_types: - EarthquakeEvent @@ -33,9 +33,9 @@ calculations: # Fires - name: "FireFAMWEB: Wildland Fire Event Aggregation" type: ENTITY_AGGREGATION - output_import: FireFAMWEB_Agg input_imports: - FireFAMWEB + output_import: FireFAMWEB_Agg entity_aggregation: entity_types: - WildlandFireEvent @@ -47,9 +47,9 @@ calculations: - name: "FireWFIGS: Fire Incident Event Aggregation" type: ENTITY_AGGREGATION - output_import: FireWFIGS_Agg input_imports: - FireWFIGS + output_import: FireWFIGS_Agg entity_aggregation: entity_types: - FireIncidentComplexEvent @@ -64,10 +64,10 @@ calculations: - name: "NASA_VIIRSActiveFiresEvents: Fire Event S2 Cell Aggregation" type: ENTITY_AGGREGATION - output_import: NASA_VIIRSActiveFiresEvents_Agg # Aggregate event counts for S2-cells of level 13 
input_imports: - NASA_VIIRSActiveFiresEvents + output_import: NASA_VIIRSActiveFiresEvents_Agg entity_aggregation: entity_types: - FireEvent @@ -81,9 +81,9 @@ calculations: # Storms - name: "StormNOAA: Storm & Extreme Weather Event Aggregation" type: ENTITY_AGGREGATION - output_import: StormNOAA_Agg input_imports: - StormNOAA + output_import: StormNOAA_Agg entity_aggregation: location_props: - affectedPlace @@ -158,10 +158,10 @@ calculations: # Floods - name: "DynamicWorld_FloodEvents: Flood Event S2 Cell Aggregation" type: ENTITY_AGGREGATION - output_import: DynamicWorld_FloodEvents_Agg # Aggregate event counts for S2-cells of level 13 input_imports: - DynamicWorld_FloodEvents + output_import: DynamicWorld_FloodEvents_Agg entity_aggregation: entity_types: - FloodEvent @@ -175,9 +175,9 @@ calculations: # Heat/Cold Temperature Events counts - name: "TemperatureEvents: Heat & Cold Event Aggregation" type: ENTITY_AGGREGATION - output_import: TemperatureEvents_Agg input_imports: - TemperatureEvents + output_import: TemperatureEvents_Agg entity_aggregation: entity_types: - HeatTemperatureEvent diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml index 23943f141..1072f43cd 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/place.yaml @@ -1,163 +1,163 @@ calculations: - name: "CensusACS5YearSurvey: State -> Country Place Rollup" type: PLACE_AGGREGATION + stage: 1 input_imports: - CensusACS5YearSurvey + output_import: CensusACS5YearSurvey_AggCountry place_aggregation: from_place_types: State to_place_types: Country - stage: 1 - output_import: CensusACS5YearSurvey_AggCountry - name: "CensusSAHIE: State -> Country Place Rollup" type: PLACE_AGGREGATION + stage: 1 input_imports: - CensusSAHIE + output_import: CensusSAHIE_AggCountry place_aggregation: from_place_types: State to_place_types: Country 
- stage: 1 - output_import: CensusSAHIE_AggCountry - name: "CDCMortality: County -> State Place Rollup" type: PLACE_AGGREGATION + stage: 1 input_imports: - CDCMortality + output_import: CDCMortality_AggState place_aggregation: from_place_types: County to_place_types: State - stage: 1 - output_import: CDCMortality_AggState - name: "CDCMortality: State -> Country Place Rollup" type: PLACE_AGGREGATION + stage: 2 input_imports: - CDCMortality_AggState + output_import: CDCMortality_AggState_AggCountry place_aggregation: from_place_types: State to_place_types: Country - stage: 2 - output_import: CDCMortality_AggState_AggCountry - name: "DEA_ARCOS: County -> State Place Rollup" type: PLACE_AGGREGATION + stage: 1 input_imports: - DEA_ARCOS + output_import: DEA_ARCOS_AggState place_aggregation: from_place_types: County to_place_types: State - stage: 1 - output_import: DEA_ARCOS_AggState - name: "DEA_ARCOS: State -> Country Place Rollup" type: PLACE_AGGREGATION + stage: 2 input_imports: - DEA_ARCOS_AggState + output_import: DEA_ARCOS_AggState_AggCountry place_aggregation: from_place_types: State to_place_types: Country - stage: 2 - output_import: DEA_ARCOS_AggState_AggCountry - name: "EPA_EJSCREEN: CensusBlockGroup -> CensusTract Place Rollup" type: PLACE_AGGREGATION + stage: 1 input_imports: - EPA_EJSCREEN + output_import: EPA_EJSCREEN_AggCensusTract place_aggregation: from_place_types: CensusBlockGroup to_place_types: CensusTract - stage: 1 - output_import: EPA_EJSCREEN_AggCensusTract - name: "EPA_EJSCREEN: CensusTract -> County Place Rollup" type: PLACE_AGGREGATION + stage: 2 input_imports: - EPA_EJSCREEN_AggCensusTract + output_import: EPA_EJSCREEN_AggCensusTract_AggCounty place_aggregation: from_place_types: CensusTract to_place_types: County - stage: 2 - output_import: EPA_EJSCREEN_AggCensusTract_AggCounty - name: "DeepSolar: CensusBlockGroup -> CensusTract Place Rollup" type: PLACE_AGGREGATION + stage: 1 input_imports: - DeepSolar + output_import: 
DeepSolar_AggCensusTract place_aggregation: from_place_types: CensusBlockGroup to_place_types: CensusTract - stage: 1 - output_import: DeepSolar_AggCensusTract - name: "DeepSolar: CensusTract -> County Place Rollup" type: PLACE_AGGREGATION + stage: 2 input_imports: - DeepSolar_AggCensusTract + output_import: DeepSolar_AggCensusTract_AggCounty place_aggregation: from_place_types: CensusTract to_place_types: County - stage: 2 - output_import: DeepSolar_AggCensusTract_AggCounty - name: "EPA_GHGRP: EpaReportingFacility -> County Place Rollup" type: PLACE_AGGREGATION + stage: 1 input_imports: - EPA_GHGRP + output_import: EPA_GHGRP_AggCounty place_aggregation: from_place_types: EpaReportingFacility to_place_types: County - stage: 1 - output_import: EPA_GHGRP_AggCounty - name: "EPA_GHGRP: EpaReportingFacility -> CensusZipCodeTabulationArea Place Rollup" type: PLACE_AGGREGATION + stage: 1 input_imports: - EPA_GHGRP + output_import: EPA_GHGRP_AggCensusZipCodeTabulationArea place_aggregation: from_place_types: EpaReportingFacility to_place_types: CensusZipCodeTabulationArea - stage: 1 - output_import: EPA_GHGRP_AggCensusZipCodeTabulationArea - name: "EPA_GHGRP: County -> State Place Rollup" type: PLACE_AGGREGATION + stage: 2 input_imports: - EPA_GHGRP_AggCounty + output_import: EPA_GHGRP_AggCounty_AggState place_aggregation: from_place_types: County to_place_types: State - stage: 2 - output_import: EPA_GHGRP_AggCounty_AggState - name: "RFF_USGridGeo: GeoGrid -> County Place Rollup" type: PLACE_AGGREGATION + stage: 1 input_imports: - RFF_USGridGeo_WeatherVariabilityForecast + output_import: RFF_USGridGeo_WeatherVariabilityForecast_AggCounty place_aggregation: from_place_types: GeoGridPlace_0.25Deg to_place_types: County allow_multiple_to_places: true - stage: 1 - output_import: RFF_USGridGeo_WeatherVariabilityForecast_AggCounty - name: "India_RBIStateDomesticProduct: State -> Country Place Rollup" type: PLACE_AGGREGATION + stage: 1 input_imports: - 
India_RBIStateDomesticProduct + output_import: India_RBIStateDomesticProduct_AggCountry place_aggregation: from_place_types: State to_place_types: Country allow_multiple_to_places: true - stage: 1 - output_import: India_RBIStateDomesticProduct_AggCountry - name: "India_RBIStateDomesticProduct: StatVarAgg State -> Country Place Rollup" type: PLACE_AGGREGATION + stage: 1 input_imports: - India_RBIStateDomesticProduct_StatVarAgg + output_import: India_RBIStateDomesticProduct_StatVarAgg_AggCountry place_aggregation: from_place_types: State to_place_types: Country allow_multiple_to_places: true - stage: 1 - output_import: India_RBIStateDomesticProduct_StatVarAgg_AggCountry diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml index 9fd864a48..2ced6d221 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml @@ -6,10 +6,10 @@ calculations: # - name: "CensusACS5YearSurvey: Health Insurance SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusACS5YearSurvey_HealthInsurance_StatVarAgg input_imports: - CensusACS5YearSurvey - CensusACS5YearSurvey_AggCountry + output_import: CensusACS5YearSurvey_HealthInsurance_StatVarAgg stat_var_aggregation: aggregations: # 1. 
Census Table B18135 includes insurance coverage by disability-status and @@ -274,10 +274,10 @@ calculations: # - name: "CensusACS5YearSurvey: Ability To Speak English SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusACS5YearSurvey_AbilityToSpeakEnglish_StatVarAgg input_imports: - CensusACS5YearSurvey - CensusACS5YearSurvey_AggCountry + output_import: CensusACS5YearSurvey_AbilityToSpeakEnglish_StatVarAgg stat_var_aggregation: aggregations: - ancestor_sv_id: Count_Person_SpeakEnglishNotAtAll @@ -322,10 +322,10 @@ calculations: # - name: "CensusACS5YearSurvey: In Armed Forces SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusACS5YearSurvey_InArmedForces_StatVarAgg input_imports: - CensusACS5YearSurvey - CensusACS5YearSurvey_AggCountry + output_import: CensusACS5YearSurvey_InArmedForces_StatVarAgg stat_var_aggregation: aggregations: # Women in armed forces @@ -383,10 +383,10 @@ calculations: # Education - CensusACS5YearSurvey - name: "CensusACS5YearSurvey: Education SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusACS5YearSurvey_Education_StatVarAgg input_imports: - CensusACS5YearSurvey - CensusACS5YearSurvey_AggCountry + output_import: CensusACS5YearSurvey_Education_StatVarAgg stat_var_aggregation: aggregations: # dc/g/Person_EducationalAttainment-5ThAnd6ThGrade @@ -599,9 +599,9 @@ calculations: # Education - ACSED5YrSurvey - name: "CensusSAHIE_AggCountry: Health Insurance SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusSAHIE_AggCountry_StatVarAgg input_imports: - ACSED5YrSurvey + output_import: CensusSAHIE_AggCountry_StatVarAgg stat_var_aggregation: aggregations: - ancestor_sv_id: Count_Parent_Occupation_Management_Business_Science_Arts @@ -628,10 +628,10 @@ calculations: # Demographics - name: "CensusACS5YearSurvey: Demographics SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusACS5YearSurvey_Demographics_StatVarAgg input_imports: - CensusACS5YearSurvey - 
CensusACS5YearSurvey_AggCountry + output_import: CensusACS5YearSurvey_Demographics_StatVarAgg stat_var_aggregation: aggregations: - ancestor_sv_id: Count_Person_85OrMoreYears @@ -646,9 +646,9 @@ calculations: # Agriculture - name: "USDA_AgricultureCensus: Agriculture Producer SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: USDA_AgricultureCensus_Agriculture_StatVarAgg input_imports: - USDA_AgricultureCensus + output_import: USDA_AgricultureCensus_Agriculture_StatVarAgg stat_var_aggregation: aggregations: - ancestor_sv_id: Count_Person_Producer @@ -668,10 +668,10 @@ calculations: - name: "CensusACS5YearSurvey: Employment & Industry SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusACS5YearSurvey_Employment_StatVarAgg input_imports: - CensusACS5YearSurvey - CensusACS5YearSurvey_AggCountry + output_import: CensusACS5YearSurvey_Employment_StatVarAgg stat_var_aggregation: aggregations: - ancestor_sv_id: Count_Person_16OrMoreYears_Civilian_Employed_InLaborForce_NAICSAgricultureForestryFishingHunting @@ -859,9 +859,9 @@ calculations: # Crime - name: "USNationalPrisonerStatistics: Crime & Correctional Facility SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: USNationalPrisonerStatistics_Crime_StatVarAgg input_imports: - USNationalPrisonerStatistics + output_import: USNationalPrisonerStatistics_Crime_StatVarAgg stat_var_aggregation: aggregations: - ancestor_sv_id: Count_Person_CorrectionalFacilityLocation_OutOfState @@ -872,9 +872,9 @@ calculations: # WithOwnChildrenUnder18. 
- name: "CensusACS5YearSurvey_SubjectTables_S1251: Children & Household SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusACS5YearSurvey_SubjectTables_S1251_StatVarAgg input_imports: - CensusACS5YearSurvey_SubjectTables_S1251 + output_import: CensusACS5YearSurvey_SubjectTables_S1251_StatVarAgg stat_var_aggregation: aggregations: - ancestor_sv_id: Count_Person_WithOwnChildrenUnder18_Female @@ -893,9 +893,9 @@ calculations: # Marriage - name: "CensusACS5YearSurvey_SubjectTables_S1201: Marital Status SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusACS5YearSurvey_SubjectTables_S1201_StatVarAgg input_imports: - CensusACS5YearSurvey_SubjectTables_S1201 + output_import: CensusACS5YearSurvey_SubjectTables_S1201_StatVarAgg stat_var_aggregation: aggregations: - ancestor_sv_id: Count_Person_InLaborForce_Divorced @@ -918,9 +918,9 @@ calculations: # Employment by business ownership type. - name: "CensusACS5YearSurvey_SubjectTables_S2408: Business Ownership Employment SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusACS5YearSurvey_SubjectTables_S2408_StatVarAgg input_imports: - CensusACS5YearSurvey_SubjectTables_S2408 + output_import: CensusACS5YearSurvey_SubjectTables_S2408_StatVarAgg stat_var_aggregation: aggregations: - ancestor_sv_id: Count_Person_PrivatelyOwnedNotForProfitEstablishment_PaidWorker @@ -935,9 +935,9 @@ calculations: # US Citizen by Naturalization - name: "CensusACS5YearSurvey_SubjectTables_S0504: Naturalized Citizenship SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusACS5YearSurvey_SubjectTables_S0504_StatVarAgg input_imports: - CensusACS5YearSurvey_SubjectTables_S0504 + output_import: CensusACS5YearSurvey_SubjectTables_S0504_StatVarAgg stat_var_aggregation: aggregations: - ancestor_sv_id: Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn @@ -1066,10 +1066,10 @@ calculations: # HousingUnit HomeValue. 
- name: "CensusACS5YearSurvey: Home Value SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusACS5YearSurvey_HousingUnit_HomeValue_StatVarAgg input_imports: - CensusACS5YearSurvey - CensusACS5YearSurvey_AggCountry + output_import: CensusACS5YearSurvey_HousingUnit_HomeValue_StatVarAgg stat_var_aggregation: aggregations: # Census ACS 5 year, Count_HousingUnit_HomeValue @@ -1117,10 +1117,10 @@ calculations: - name: "CensusACS5YearSurvey: Person Age Groups SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusACS5YearSurvey_Person_Age_StatVarAgg input_imports: - CensusACS5YearSurvey - CensusACS5YearSurvey_AggCountry + output_import: CensusACS5YearSurvey_Person_Age_StatVarAgg stat_var_aggregation: aggregations: # Census ACS 5 year, Age group @@ -1133,10 +1133,10 @@ calculations: - name: "CensusACS5YearSurvey: Individual Income SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusACS5YearSurvey_Income_StatVarAgg input_imports: - CensusACS5YearSurvey - CensusACS5YearSurvey_AggCountry + output_import: CensusACS5YearSurvey_Income_StatVarAgg stat_var_aggregation: aggregations: # The following 9 aggregations are: High income (100,000 USD or More), by race. 
@@ -1325,9 +1325,9 @@ calculations: - name: "DEA_ARCOS: Opioid Prescription SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: IndiaNSS_HealthAilments_StatVarAgg input_imports: - IndiaNSS_HealthAilments + output_import: IndiaNSS_HealthAilments_StatVarAgg stat_var_aggregation: aggregations: - ancestor_sv_id: Count_MedicalConditionIncident_ConditionBloodDisease_AsAFractionOf_Count_Person @@ -1637,9 +1637,9 @@ calculations: - name: "OECDRegionalDemography: Person Age Groups SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: OECDRegionalDemography_Person_Age_StatVarAgg input_imports: - OECDRegionalDemography_Population + output_import: OECDRegionalDemography_Person_Age_StatVarAgg stat_var_aggregation: aggregations: # OECD, Age group @@ -1723,9 +1723,9 @@ calculations: - name: "CensusACS5YearSurvey_SubjectTables_S0801: Work Commute SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusACS5YearSurvey_SubjectTables_S0801_StatVarAgg input_imports: - CensusACS5YearSurvey_SubjectTables_S0801 + output_import: CensusACS5YearSurvey_SubjectTables_S0801_StatVarAgg stat_var_aggregation: aggregations: # Census ACS 5 year, Work Commute group @@ -1840,9 +1840,9 @@ calculations: - Count_Person_Years16Onwards_60OrMoreMinute_WorkCommute_Employed_Female_WorkedOutsideOfHome - name: "CensusACS5YearSurvey_SubjectTables_S0804: Means of Transportation Work Commute SV Aggregation" type: STAT_VAR_AGGREGATION - output_import: CensusACS5YearSurvey_SubjectTables_S0804_StatVarAgg input_imports: - CensusACS5YearSurvey_SubjectTables_S0804 + output_import: CensusACS5YearSurvey_SubjectTables_S0804_StatVarAgg stat_var_aggregation: aggregations: - ancestor_sv_id: Count_Person_Years16Onwards_PublicTransportationExcludingTaxicab_WorkCommute_Employed_WorkedOutsideOfHome @@ -1880,9 +1880,9 @@ calculations: - Count_Person_Years16Onwards_CarTruckOrVanCarpooled_60OrMoreMinute_WorkCommute_Employed_WorkedOutsideOfHome - name: "India_RBIStateDomesticProduct: Gross Value Added SV 
Aggregation" type: STAT_VAR_AGGREGATION - output_import: India_RBIStateDomesticProduct_StatVarAgg input_imports: - India_RBIStateDomesticProduct + output_import: India_RBIStateDomesticProduct_StatVarAgg stat_var_aggregation: aggregations: - ancestor_sv_id: Nominal_Amount_EconomicActivity_GrossValueAdded diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml index f8ef1793c..0d8b18280 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_series.yaml @@ -1,9 +1,9 @@ calculations: - name: "NASA_NEXDCP30: Measurement Methods & Base Date Diff Series Aggregation" type: STAT_VAR_SERIES_AGGREGATION + stage: 1 input_imports: - NASA_NEXDCP30 - stage: 1 output_import: NASA_NEXDCP30_AggrDiffStats stat_var_series_aggregation: aggr_funcs: @@ -16,9 +16,9 @@ calculations: - name: "NASA_NEXGDDP_Subnational: Measurement Methods & Base Date Diff Series Aggregation" type: STAT_VAR_SERIES_AGGREGATION + stage: 1 input_imports: - NASA_NEXGDDP_Subnational - stage: 1 output_import: NASA_NEXGDDP_Subnational_AggrDiffStats stat_var_series_aggregation: aggr_funcs: @@ -33,9 +33,9 @@ calculations: - name: "NASA_NEXGDDP_Country: Measurement Methods & Base Date Diff Series Aggregation" type: STAT_VAR_SERIES_AGGREGATION + stage: 1 input_imports: - NASA_NEXGDDP_Country - stage: 1 output_import: NASA_NEXGDDP_Country_AggrDiffStats stat_var_series_aggregation: aggr_funcs: @@ -48,9 +48,9 @@ calculations: - name: "NASA_NEXGDDP_CMIP6_Subnational: Measurement Methods & Base Date Diff Series Aggregation" type: STAT_VAR_SERIES_AGGREGATION + stage: 1 input_imports: - NASA_NEXGDDP_CMIP6_Subnational - stage: 1 output_import: NASA_NEXGDDP_CMIP6_Subnational_AggrDiffStats stat_var_series_aggregation: aggr_funcs: @@ -65,9 +65,9 @@ calculations: - name: "NASA_NEXGDDP_CMIP6_IpccPlaces50: Measurement Methods 
& Base Date Diff Series Aggregation" type: STAT_VAR_SERIES_AGGREGATION + stage: 1 input_imports: - NASA_NEXGDDP_CMIP6_IpccPlaces50 - stage: 1 output_import: NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrDiffStats stat_var_series_aggregation: aggr_funcs: @@ -82,9 +82,9 @@ calculations: - name: "NASA_NEXGDDP_CMIP6_Subnational: Stats Across Models Series Aggregation" type: STAT_VAR_SERIES_AGGREGATION + stage: 2 input_imports: - NASA_NEXGDDP_CMIP6_Subnational - stage: 2 output_import: NASA_NEXGDDP_CMIP6_Subnational_AggrStatsAcrossModels stat_var_series_aggregation: aggr_funcs: @@ -97,9 +97,9 @@ calculations: - name: "NASA_NEXGDDP_CMIP6_IpccPlaces50: Stats Across Models Series Aggregation" type: STAT_VAR_SERIES_AGGREGATION + stage: 2 input_imports: - NASA_NEXGDDP_CMIP6_IpccPlaces50 - stage: 2 output_import: NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrStatsAcrossModels stat_var_series_aggregation: aggr_funcs: @@ -112,9 +112,9 @@ calculations: - name: "NASA_NEXGDDP_CMIP6_Subnational: Over Time Stats Across Models Series Aggregation" type: STAT_VAR_SERIES_AGGREGATION + stage: 3 input_imports: - NASA_NEXGDDP_CMIP6_Subnational_AggrStatsAcrossModels - stage: 3 output_import: NASA_NEXGDDP_CMIP6_Subnational_AggrOverTimeOnStatsAcrossModels stat_var_series_aggregation: aggr_funcs: @@ -216,9 +216,9 @@ calculations: - name: "NASA_NEXGDDP_CMIP6_IpccPlaces50: Over Time Stats Across Models Series Aggregation" type: STAT_VAR_SERIES_AGGREGATION + stage: 3 input_imports: - NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrStatsAcrossModels - stage: 3 output_import: NASA_NEXGDDP_CMIP6_IpccPlaces50_AggrOverTimeOnStatsAcrossModels stat_var_series_aggregation: aggr_funcs: @@ -320,9 +320,9 @@ calculations: - name: "NASA_NEXGDDP_IpccPlaces50: Measurement Methods & Base Date Diff Series Aggregation" type: STAT_VAR_SERIES_AGGREGATION + stage: 1 input_imports: - NASA_NEXGDDP_IpccPlaces50 - stage: 1 output_import: NASA_NEXGDDP_IpccPlaces50_AggrDiffStats stat_var_series_aggregation: aggr_funcs: From 
c8267c731d0a7578cdb5e5cec6617cd3e107d46f Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Thu, 2 Jul 2026 20:54:18 +0530 Subject: [PATCH 29/33] style(aggregation): clean up comment indentation in statvar and statvar_calculation configs --- .../aggregation/configs/statvar.yaml | 287 +++++++++--------- .../configs/statvar_calculation.yaml | 22 +- 2 files changed, 153 insertions(+), 156 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml index 2ced6d221..f3992282a 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar.yaml @@ -1,9 +1,6 @@ calculations: -# -# Health Insurance Coverage -# ------------------------- -# + # Health Insurance Coverage - name: "CensusACS5YearSurvey: Health Insurance SV Aggregation" type: STAT_VAR_AGGREGATION input_imports: @@ -12,11 +9,11 @@ calculations: output_import: CensusACS5YearSurvey_HealthInsurance_StatVarAgg stat_var_aggregation: aggregations: - # 1. Census Table B18135 includes insurance coverage by disability-status and - # age (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B18135). - # We use that to compute total and by disability-status. - # - # Total uninsured + # 1. Census Table B18135 includes insurance coverage by disability-status and + # age (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B18135). + # We use that to compute total and by disability-status. 
+ # + # Total uninsured - ancestor_sv_id: Count_Person_NoHealthInsurance source_sv_ids: # No Disability @@ -27,7 +24,7 @@ calculations: - dc/bew8kj6l7tv93 - dc/96dqj47csvmy8 - dc/qr4s77egv27q2 - # Total insured + # Total insured - ancestor_sv_id: Count_Person_WithHealthInsurance source_sv_ids: # No Disability @@ -38,40 +35,40 @@ calculations: - dc/s5efzs4x817p5 - dc/9j39148yn79zf - dc/p3v76jcvdx919 - # Uninsured with no disability - # The following 4 aggregations come from parts of the above two aggregations. + # Uninsured with no disability + # The following 4 aggregations come from parts of the above two aggregations. - ancestor_sv_id: Count_Person_NoHealthInsurance_NoDisability source_sv_ids: # No Disability - dc/y0dvhk0sggzef - dc/kdg05h55y45y6 - dc/9drszqwd2nef7 - # Uninsured with disability + # Uninsured with disability - ancestor_sv_id: Count_Person_NoHealthInsurance_WithDisability source_sv_ids: # With Disability - dc/bew8kj6l7tv93 - dc/96dqj47csvmy8 - dc/qr4s77egv27q2 - # Insured with no disability + # Insured with no disability - ancestor_sv_id: Count_Person_WithHealthInsurance_NoDisability source_sv_ids: # No Disability - dc/32mhsxvq7qsm4 - dc/2s2dkbb7gz038 - dc/dc8vqzkx18x0c - # Insured with disability + # Insured with disability - ancestor_sv_id: Count_Person_WithHealthInsurance_WithDisability source_sv_ids: # With Disability - dc/s5efzs4x817p5 - dc/9j39148yn79zf - dc/p3v76jcvdx919 - # - # 2. Census Table B27001 includes insurance coverage by age and gender - # breakdown (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B27001). We - # use that to compute gender breakdown. - # + # + # 2. Census Table B27001 includes insurance coverage by age and gender + # breakdown (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B27001). We + # use that to compute gender breakdown. 
+ # - ancestor_sv_id: Count_Person_Female_NoHealthInsurance source_sv_ids: - dc/jx2q10tbnwhf3 @@ -116,8 +113,8 @@ calculations: - dc/wp3txgscfen9c - dc/b3n86k0h3h2w - dc/j8se7wv2gbdfd - # dc/g/Person_HealthInsurance-NoPrivateHealthInsurance - # Level 3 + # dc/g/Person_HealthInsurance-NoPrivateHealthInsurance + # Level 3 - ancestor_sv_id: Count_Person_NoPrivateHealthInsurance source_sv_ids: # Population: 26 - 34 Years, Female, No Private Health Insurance @@ -168,8 +165,8 @@ calculations: - dc/wyfvejhn9fe7 # Population: 6 - 17 Years, Male, No Private Health Insurance - dc/yfmzp444fj5r5 - # dc/g/Person_HealthInsurance-NoPublicHealthInsurance - # Level 3 + # dc/g/Person_HealthInsurance-NoPublicHealthInsurance + # Level 3 - ancestor_sv_id: Count_Person_NoPublicHealthInsurance source_sv_ids: # Population: 26 - 34 Years, Female, No Public Health Insurance @@ -220,8 +217,8 @@ calculations: - dc/vxecjxxbmhy43 # Population: 25 - 34 Years, Female, No Public Health Insurance - dc/yjxmx3n02dx08 - # dc/g/Person_HealthInsurance-WithOneTypeOfHealthInsurance - # Level 1 + # dc/g/Person_HealthInsurance-WithOneTypeOfHealthInsurance + # Level 1 - ancestor_sv_id: Count_Person_WithOneTypeOfHealthInsurance source_sv_ids: # Population: 18 Years or Less, With One Type of Health Insurance @@ -236,8 +233,8 @@ calculations: - dc/thcbepzex9zd9 # Population: 65 Years or More, With One Type of Health Insurance - dc/v1w52jrtvw6m2 - # dc/g/Person_HealthInsurance-WithPublicCoverage - # Level 3 + # dc/g/Person_HealthInsurance-WithPublicCoverage + # Level 3 - ancestor_sv_id: Count_Person_WithPublicCoverage source_sv_ids: # Population: 18 Years or Less, With Disability, With Public Coverage @@ -261,17 +258,17 @@ calculations: # Population: 19 - 64 Years, With Disability, With Public Coverage - dc/zkcy0f52ewh04 - # - # Ability to speak English - # ------------------------ - # - # We aggregate abilityToSpeakEnglish over nativity (2) and languageSpokenAtHome - # (4) for a total of 8 source SVs from 
Census Table B16005 - # (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B16005) - # - # NOTE: Values SpeakEnglishLessThanVeryWell and SpeakEnglishVeryWell already - # have population values from a different table. - # + # + # Ability to speak English + # ------------------------ + # + # We aggregate abilityToSpeakEnglish over nativity (2) and languageSpokenAtHome + # (4) for a total of 8 source SVs from Census Table B16005 + # (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B16005) + # + # NOTE: Values SpeakEnglishLessThanVeryWell and SpeakEnglishVeryWell already + # have population values from a different table. + # - name: "CensusACS5YearSurvey: Ability To Speak English SV Aggregation" type: STAT_VAR_AGGREGATION input_imports: @@ -311,15 +308,15 @@ calculations: - dc/yx0mdj3dnk3mb - dc/1x48nzrne2f88 - # - # In Armed Forces - # --------------- - # - # Census Table B23001 includes count of people in armed forces (in which case - # In labor Forces is implied) broken down by gender and age. From that we - # aggregate to total and by gender. - # (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B23001) - # + # + # In Armed Forces + # --------------- + # + # Census Table B23001 includes count of people in armed forces (in which case + # In labor Forces is implied) broken down by gender and age. From that we + # aggregate to total and by gender. 
+ # (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B23001) + # - name: "CensusACS5YearSurvey: In Armed Forces SV Aggregation" type: STAT_VAR_AGGREGATION input_imports: @@ -328,7 +325,7 @@ calculations: output_import: CensusACS5YearSurvey_InArmedForces_StatVarAgg stat_var_aggregation: aggregations: - # Women in armed forces + # Women in armed forces - ancestor_sv_id: Count_Person_Female_InArmedForces source_sv_ids: - dc/173smewzddlb @@ -341,7 +338,7 @@ calculations: - dc/8d182v9j04mg4 - dc/lb3lb4mg82mph - dc/g2m31qc7q1x64 - # Men in armed forces + # Men in armed forces - ancestor_sv_id: Count_Person_Male_InArmedForces source_sv_ids: - dc/vp1gqv00d2ql3 @@ -354,7 +351,7 @@ calculations: - dc/x63tjfw28tzvc - dc/fpx513jvf4xed - dc/2zj8jthd4f563 - # Armed forces population, from combining the above two sets of SVs. + # Armed forces population, from combining the above two sets of SVs. - ancestor_sv_id: Count_Person_InArmedForces source_sv_ids: # Female @@ -380,7 +377,7 @@ calculations: - dc/fpx513jvf4xed - dc/2zj8jthd4f563 - # Education - CensusACS5YearSurvey + # Education - CensusACS5YearSurvey - name: "CensusACS5YearSurvey: Education SV Aggregation" type: STAT_VAR_AGGREGATION input_imports: @@ -389,24 +386,24 @@ calculations: output_import: CensusACS5YearSurvey_Education_StatVarAgg stat_var_aggregation: aggregations: - # dc/g/Person_EducationalAttainment-5ThAnd6ThGrade - # Level 1 + # dc/g/Person_EducationalAttainment-5ThAnd6ThGrade + # Level 1 - ancestor_sv_id: Count_Person_Years25Onwards_EducationalAttainment_5ThAnd6ThGrade source_sv_ids: # Population: 5th And 6th Grade, Female - Count_Person_25OrMoreYears_EducationalAttainment5ThAnd6ThGrade_Female # Population: 5th And 6th Grade, Male - Count_Person_25OrMoreYears_EducationalAttainment5ThAnd6ThGrade_Male - # dc/g/Person_EducationalAttainment-7ThAnd8ThGrade - # Level 1 + # dc/g/Person_EducationalAttainment-7ThAnd8ThGrade + # Level 1 - ancestor_sv_id: 
Count_Person_Years25Onwards_EducationalAttainment_7ThAnd8ThGrade source_sv_ids: # Population: 7th And 8th Grade, Female - Count_Person_25OrMoreYears_EducationalAttainment7ThAnd8ThGrade_Female # Population: 7th And 8th Grade, Male - Count_Person_25OrMoreYears_EducationalAttainment7ThAnd8ThGrade_Male - # dc/g/Person_EducationalAttainment-9ThTo12ThGradeNoDiploma - # Level 3 + # dc/g/Person_EducationalAttainment-9ThTo12ThGradeNoDiploma + # Level 3 - ancestor_sv_id: Count_Person_EducationalAttainment_9ThTo12ThGradeNoDiploma source_sv_ids: # Population: 18 - 24 Years, 9th To 12th Grade No Diploma, Female @@ -429,8 +426,8 @@ calculations: - dc/5sps7rmylm73b # Population: 65 Years or More, 9th To 12th Grade No Diploma, Male - dc/07hctc6f9e2k9 - # dc/g/Person_EducationalAttainment-LessThan9ThGrade - # Level 3 + # dc/g/Person_EducationalAttainment-LessThan9ThGrade + # Level 3 - ancestor_sv_id: Count_Person_EducationalAttainment_LessThan9ThGrade source_sv_ids: # Population: 18 - 24 Years, Less Than 9th Grade, Female @@ -453,8 +450,8 @@ calculations: - dc/292723k92k5tb # Population: 65 Years or More, Less Than 9th Grade, Male - dc/68pblb53csteb - # dc/g/Person_EducationalAttainment-LessThanHighSchoolDiploma - # Level 3 + # dc/g/Person_EducationalAttainment-LessThanHighSchoolDiploma + # Level 3 - ancestor_sv_id: Count_Person_EducationalAttainment_LessThanHighSchoolDiploma source_sv_ids: # Population: Less Than High School Diploma, Male, Two or More Races @@ -493,16 +490,16 @@ calculations: - dc/yfrrvevrmyr74 # Population: Less Than High School Diploma, Female, Two or More Races - dc/29l3m1z7d3n7c - # dc/g/Person_EducationalAttainment-NurseryTo4ThGrade - # Level 1 + # dc/g/Person_EducationalAttainment-NurseryTo4ThGrade + # Level 1 - ancestor_sv_id: Count_Person_Years25Onwards_EducationalAttainment_NurseryTo4ThGrade source_sv_ids: # Population: Nursery To 4th Grade, Female - Count_Person_25OrMoreYears_EducationalAttainmentNurseryTo4ThGrade_Female # Population: Nursery To 4th 
Grade, Male - Count_Person_25OrMoreYears_EducationalAttainmentNurseryTo4ThGrade_Male - # dc/g/Person_EducationalAttainment-SomeCollegeNoDegree - # Level 3 + # dc/g/Person_EducationalAttainment-SomeCollegeNoDegree + # Level 3 - ancestor_sv_id: Count_Person_EducationalAttainment_SomeCollegeNoDegree source_sv_ids: # Population: 18 - 24 Years, Some College No Degree, Female @@ -540,7 +537,7 @@ calculations: - Count_Person_EducationalAttainment11ThGrade - Count_Person_EducationalAttainment12ThGradeNoDiploma - # The following 9 aggregations are: SomeCollegeOrAssociatesDegree, by race. + # The following 9 aggregations are: SomeCollegeOrAssociatesDegree, by race. - ancestor_sv_id: Count_Person_25OrMoreYears_SomeCollegeOrAssociatesDegree_AmericanIndianOrAlaskaNativeAlone source_sv_ids: # Population: Some College or Associates Degree, Female, American Indian or Alaska Native Alone @@ -596,7 +593,7 @@ calculations: # Population: Some College or Associates Degree, Male, White Alone Not Hispanic or Latino - dc/dc9v9h3q8l8n7 - # Education - ACSED5YrSurvey + # Education - ACSED5YrSurvey - name: "CensusSAHIE_AggCountry: Health Insurance SV Aggregation" type: STAT_VAR_AGGREGATION input_imports: @@ -625,7 +622,7 @@ calculations: # Count of Parent: 16 Years or More, Civilian, Public School, Employed, in Labor Force, Service Occupations - dc/bstxmnb4k1wrb - # Demographics + # Demographics - name: "CensusACS5YearSurvey: Demographics SV Aggregation" type: STAT_VAR_AGGREGATION input_imports: @@ -643,7 +640,7 @@ calculations: - Count_Person_0To4Years_Male - Count_Person_0To4Years_Female - # Agriculture + # Agriculture - name: "USDA_AgricultureCensus: Agriculture Producer SV Aggregation" type: STAT_VAR_AGGREGATION input_imports: @@ -662,9 +659,9 @@ calculations: - Count_Person_Producer_AsianAlone skip_all_sources_present_check: true - # - # Employment - # ------------------------ + # + # Employment + # ------------------------ - name: "CensusACS5YearSurvey: Employment & Industry SV 
Aggregation" type: STAT_VAR_AGGREGATION @@ -795,68 +792,68 @@ calculations: # Population: 16 Years or More, Civilian, Employed, in Labor Force, Male, Public Administration (NAICS/92) - dc/589p0gc36qem5 - # Census Table B23001 includes count of employed civilians broken down by gender - # and age. From that we aggregate over age to get count of employed civilians - # for each gender. - # (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B23001) - # -# - type: STAT_VAR_AGGREGATION + # Census Table B23001 includes count of employed civilians broken down by gender + # and age. From that we aggregate over age to get count of employed civilians + # for each gender. + # (https://data.census.gov/cedsci/table?tid=ACSDT5Y2019.B23001) + # + # - type: STAT_VAR_AGGREGATION # type: STAT_VAR_AGGREGATION -# output_import: CensusACS5YearSurvey_Employment_StatVarAgg -# input_imports: -# - CensusACS5YearSurvey -# - CensusACS5YearSurvey_AggCountry -# stat_var_aggregation: -# aggregations: - # # Women Employed - # - # # The ancestor has "armedForcesStatus: dcs:Civilian", but the following sources - # # don't have. Fix it before adding this to aggregation. - # # dc/kz49wc5n3lhpd - # # dc/qcpg8c533syd6 - # # dc/zv26z833d7g3c -# - ancestor_sv_id: Count_Person_Female_Employed -# source_sv_ids: -# - dc/yt1fm72s1y7b7 -# - dc/hj65vwnt5csr7 -# - dc/0sd3x3bb4qet5 -# - dc/j4mzcc63n5zq5 -# - dc/56jhsezrzl049 -# - dc/ggx918j9p5tnf -# - dc/8rw47nf3ngle7 -# - dc/hplj99j7mbfsh -# - dc/ksmx7fwfkm8lb -# - dc/6706lc55kg5d -# - dc/qcpg8c533syd6 -# - dc/zv26z833d7g3c -# - dc/kz49wc5n3lhpd - # } - # - # # The ancestor has "armedForcesStatus: dcs:Civilian", but the following sources - # # don't have. Fix it before adding this to aggregation. 
- # # dc/5cxs4br0jz02c - # # dc/s909dd4r22fw - # # dc/xepldf55yq6s5 - # - # # Men Employed -# - ancestor_sv_id: Count_Person_Male_Employed -# source_sv_ids: -# - dc/2s6hps4z1qced -# - dc/xvtk9180lme1h -# - dc/twdr4c500yev5 -# - dc/n0btf3nglhqqc -# - dc/7qwvkhr4tsyt8 -# - dc/pkpm179bbh822 -# - dc/gyrw68q6x77l9 -# - dc/ly0fvmlf4mtf5 -# - dc/m020zpvzch2gd -# - dc/sgm602ncby3bf -# - dc/xepldf55yq6s5 -# - dc/s909dd4r22fw -# - dc/5cxs4br0jz02c - # } + # output_import: CensusACS5YearSurvey_Employment_StatVarAgg + # input_imports: + # - CensusACS5YearSurvey + # - CensusACS5YearSurvey_AggCountry + # stat_var_aggregation: + # aggregations: + # # Women Employed + # + # # The ancestor has "armedForcesStatus: dcs:Civilian", but the following sources + # # don't have. Fix it before adding this to aggregation. + # # dc/kz49wc5n3lhpd + # # dc/qcpg8c533syd6 + # # dc/zv26z833d7g3c + # - ancestor_sv_id: Count_Person_Female_Employed + # source_sv_ids: + # - dc/yt1fm72s1y7b7 + # - dc/hj65vwnt5csr7 + # - dc/0sd3x3bb4qet5 + # - dc/j4mzcc63n5zq5 + # - dc/56jhsezrzl049 + # - dc/ggx918j9p5tnf + # - dc/8rw47nf3ngle7 + # - dc/hplj99j7mbfsh + # - dc/ksmx7fwfkm8lb + # - dc/6706lc55kg5d + # - dc/qcpg8c533syd6 + # - dc/zv26z833d7g3c + # - dc/kz49wc5n3lhpd + # } + # + # # The ancestor has "armedForcesStatus: dcs:Civilian", but the following sources + # # don't have. Fix it before adding this to aggregation. 
+ # # dc/5cxs4br0jz02c + # # dc/s909dd4r22fw + # # dc/xepldf55yq6s5 + # + # # Men Employed + # - ancestor_sv_id: Count_Person_Male_Employed + # source_sv_ids: + # - dc/2s6hps4z1qced + # - dc/xvtk9180lme1h + # - dc/twdr4c500yev5 + # - dc/n0btf3nglhqqc + # - dc/7qwvkhr4tsyt8 + # - dc/pkpm179bbh822 + # - dc/gyrw68q6x77l9 + # - dc/ly0fvmlf4mtf5 + # - dc/m020zpvzch2gd + # - dc/sgm602ncby3bf + # - dc/xepldf55yq6s5 + # - dc/s909dd4r22fw + # - dc/5cxs4br0jz02c + # } - # Crime + # Crime - name: "USNationalPrisonerStatistics: Crime & Correctional Facility SV Aggregation" type: STAT_VAR_AGGREGATION input_imports: @@ -869,7 +866,7 @@ calculations: - dc/qgv9d3frn35qc - dc/91vy0sf20wlg9 - # WithOwnChildrenUnder18. + # WithOwnChildrenUnder18. - name: "CensusACS5YearSurvey_SubjectTables_S1251: Children & Household SV Aggregation" type: STAT_VAR_AGGREGATION input_imports: @@ -890,7 +887,7 @@ calculations: - Count_Person_WithOwnChildrenUnder18_Female - Count_Person_WithOwnChildrenUnder18_Male - # Marriage + # Marriage - name: "CensusACS5YearSurvey_SubjectTables_S1201: Marital Status SV Aggregation" type: STAT_VAR_AGGREGATION input_imports: @@ -915,7 +912,7 @@ calculations: - Count_Person_InLaborForce_Female_Widowed - Count_Person_InLaborForce_Male_Widowed - # Employment by business ownership type. + # Employment by business ownership type. 
- name: "CensusACS5YearSurvey_SubjectTables_S2408: Business Ownership Employment SV Aggregation" type: STAT_VAR_AGGREGATION input_imports: @@ -932,7 +929,7 @@ calculations: - Count_Person_PrivatelyOwnedForProfitEstablishment_Male_PaidWorker - Count_Person_PrivatelyOwnedForProfitEstablishment_Female_PaidWorker - # US Citizen by Naturalization + # US Citizen by Naturalization - name: "CensusACS5YearSurvey_SubjectTables_S0504: Naturalized Citizenship SV Aggregation" type: STAT_VAR_AGGREGATION input_imports: @@ -1063,7 +1060,7 @@ calculations: - Count_Person_USCitizenByNaturalization_DateOfEntry2000OrLater_ForeignBorn - Count_Person_USCitizenByNaturalization_DateOfEntry2010OrLater_ForeignBorn - # HousingUnit HomeValue. + # HousingUnit HomeValue. - name: "CensusACS5YearSurvey: Home Value SV Aggregation" type: STAT_VAR_AGGREGATION input_imports: @@ -1072,7 +1069,7 @@ calculations: output_import: CensusACS5YearSurvey_HousingUnit_HomeValue_StatVarAgg stat_var_aggregation: aggregations: - # Census ACS 5 year, Count_HousingUnit_HomeValue + # Census ACS 5 year, Count_HousingUnit_HomeValue - ancestor_sv_id: Count_HousingUnit_HomeValueUpto49999USDollar source_sv_ids: - Count_HousingUnit_HomeValueUpto10000USDollar @@ -1123,7 +1120,7 @@ calculations: output_import: CensusACS5YearSurvey_Person_Age_StatVarAgg stat_var_aggregation: aggregations: - # Census ACS 5 year, Age group + # Census ACS 5 year, Age group - ancestor_sv_id: Count_Person_55To64Years source_sv_ids: - Count_Person_55To59Years @@ -1139,7 +1136,7 @@ calculations: output_import: CensusACS5YearSurvey_Income_StatVarAgg stat_var_aggregation: aggregations: - # The following 9 aggregations are: High income (100,000 USD or More), by race. + # The following 9 aggregations are: High income (100,000 USD or More), by race. 
- ancestor_sv_id: Count_Person_16OrMoreYears_USDollar100000Onwards_WithIncome_AmericanIndianOrAlaskaNativeAlone source_sv_ids: # Population: Female, 100,000 USD or More, American Indian or Alaska Native Alone, Not Worked Full Time @@ -1231,7 +1228,7 @@ calculations: # Population: Male, 100,000 USD or More, White Alone Not Hispanic or Latino, Worked Full Time - dc/ekh1g39v9sgj4 - # The following 9 aggregations are: Low income (10,000 - 12,499 USD), by race. + # The following 9 aggregations are: Low income (10,000 - 12,499 USD), by race. - ancestor_sv_id: Count_Person_16OrMoreYears_USDollar10000To12499_WithIncome_AmericanIndianOrAlaskaNativeAlone source_sv_ids: # Population: Female, 10,000 - 12,499 USD, American Indian or Alaska Native Alone, Not Worked Full Time @@ -1642,7 +1639,7 @@ calculations: output_import: OECDRegionalDemography_Person_Age_StatVarAgg stat_var_aggregation: aggregations: - # OECD, Age group + # OECD, Age group - ancestor_sv_id: Count_Person_Upto9Years source_sv_ids: - Count_Person_Upto4Years @@ -1728,7 +1725,7 @@ calculations: output_import: CensusACS5YearSurvey_SubjectTables_S0801_StatVarAgg stat_var_aggregation: aggregations: - # Census ACS 5 year, Work Commute group + # Census ACS 5 year, Work Commute group - ancestor_sv_id: Count_Person_Years16Onwards_WorkCommute_Employed source_sv_ids: - Count_Person_Years16Onwards_CarTruckOrVan_WorkCommute_Employed @@ -1774,7 +1771,7 @@ calculations: - Count_Person_Years16Onwards_WorkedAtHome_WorkCommute_Employed_Female - Count_Person_Years16Onwards_WorkedFromHome_WorkCommute_Employed_Female skip_all_sources_present_check: true - # Worked outside of Home + # Worked outside of Home - ancestor_sv_id: Count_Person_Years16Onwards_WorkCommute_Employed_WorkedOutsideOfHome source_sv_ids: - Count_Person_Years16Onwards_StartTimeHour0000To0459_WorkCommute_Employed_WorkedOutsideOfHome diff --git a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml 
b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml index 7602fb587..4969e1a98 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml +++ b/pipeline/workflow/ingestion-helper/aggregation/configs/statvar_calculation.yaml @@ -1,6 +1,6 @@ calculations: -# Energy. + # Energy. - name: "EIA_Electricity: Annual Emissions Per Capita StatVar Calculation" type: STAT_VAR_CALCULATION input_imports: @@ -30,16 +30,16 @@ calculations: unit: MetricTonCO2ePerGigawattHour observation_period: P1Y -# Climate: Temperature modeling diffs from actual. -# There are 30+ models. For each model, the output diffs are done for -# Aggregations: Mean, Min, Max and also for SSP2 with RCP4.5 and SSP5 with -# RCP8.5 (for each model). -# This includes diffs for P1M, P1Y, P5Y and P10Y. -# Note that the computations below use sv_regex and measurement_method_regex -# instead of sv and measurement_method to allow more expressive matching. Also -# note that the output uses sv_prefix and measurement_method_prefix which is not -# the case above, for example. Whenever sv and measurement_method are set in the -# output, they are preferred. Otherwise, the *_prefix is used. + # Climate: Temperature modeling diffs from actual. + # There are 30+ models. For each model, the output diffs are done for + # Aggregations: Mean, Min, Max and also for SSP2 with RCP4.5 and SSP5 with + # RCP8.5 (for each model). + # This includes diffs for P1M, P1Y, P5Y and P10Y. + # Note that the computations below use sv_regex and measurement_method_regex + # instead of sv and measurement_method to allow more expressive matching. Also + # note that the output uses sv_prefix and measurement_method_prefix which is not + # the case above, for example. Whenever sv and measurement_method are set in the + # output, they are preferred. Otherwise, the *_prefix is used. 
- name: "Climate: NASA CMIP6 Temperature Modeling Diffs Calculation" type: STAT_VAR_CALCULATION input_imports: From 645966dab0e6e3b8e8f5c4b1271eb491f0b12e12 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Thu, 2 Jul 2026 21:25:03 +0530 Subject: [PATCH 30/33] fix(aggregation): resolve review comments for config directory scanning and schema validation --- .../ingestion-helper/aggregation/orchestrator.py | 11 ++++++++++- .../ingestion-helper/aggregation/orchestrator_test.py | 11 +++++++++++ .../workflow/ingestion-helper/aggregation/schema.json | 4 +++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py index 52f9de8a8..a7ddb79a1 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py @@ -14,6 +14,7 @@ """Aggregation orchestrator for Data Commons ingestion workflow.""" +import glob import logging import os import time @@ -70,7 +71,15 @@ def __init__( schema_file_path = os.path.join(curr_dir, "schema.json") # Load and validate configuration - self.calculations = validate_config(target_config, schema_file_path) + self.calculations = [] + if os.path.isdir(target_config): + yaml_files = sorted( + glob.glob(os.path.join(target_config, "*.yaml")) + glob.glob(os.path.join(target_config, "*.yml")) + ) + for file_path in yaml_files: + self.calculations.extend(validate_config(file_path, schema_file_path)) + else: + self.calculations = validate_config(target_config, schema_file_path) def run(self, active_imports: List[str]) -> None: """Executes aggregations independently for each active import. 
diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py index c724248d2..3646b8cc2 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py @@ -80,6 +80,17 @@ def test_get_active_stages_for_import(self, mock_executor): stages = self.orchestrator.get_active_stages_for_import("OtherImport") self.assertEqual(stages, []) + def test_directory_config_loading(self, mock_executor): + """Tests that orchestrator correctly scans and loads config files from a directory.""" + dir_orchestrator = AggregationOrchestrator( + connection_id="conn", + project_id="proj", + instance_id="inst", + database_id="db", + config_dir=self.tmpdir.name + ) + self.assertEqual(len(dir_orchestrator.calculations), 2) + @patch('aggregation.orchestrator.BigQueryExecutor') @patch('aggregation.orchestrator.PlaceAggregationGenerator') diff --git a/pipeline/workflow/ingestion-helper/aggregation/schema.json b/pipeline/workflow/ingestion-helper/aggregation/schema.json index ab59d0f15..a19047eb6 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/schema.json +++ b/pipeline/workflow/ingestion-helper/aggregation/schema.json @@ -10,6 +10,7 @@ "items": { "type": "object", "required": ["type", "input_imports"], + "additionalProperties": false, "properties": { "name": { "type": "string" }, "type": { @@ -37,7 +38,8 @@ "stat_var_aggregation": { "type": "object" }, "entity_aggregation": { "type": "object" }, "stat_var_series_aggregation": { "type": "object" }, - "stat_var_calculation": { "type": "object" } + "stat_var_calculation": { "type": "object" }, + "disabled": { "type": "boolean" } }, "allOf": [ { From 507ade4e8c15256c7eceed028704445b87663969 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Fri, 3 Jul 2026 10:57:54 +0530 Subject: [PATCH 31/33] fix(aggregation): connect StatVarCalculationGenerator and clean up 
legacy fallback getters --- .../aggregation/orchestrator.py | 31 ++++++++++++------- .../aggregation/orchestrator_test.py | 3 +- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py index a7ddb79a1..9ac6c5a31 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py @@ -25,6 +25,7 @@ from .place_aggregation_generator import PlaceAggregationGenerator from .provenance_summary_generator import ProvenanceSummaryGenerator from .stat_var_aggregator import StatVarAggregator +from .stat_var_calculation_generator import StatVarCalculationGenerator from .stat_var_group_generator import StatVarGroupGenerator from .validator import validate_config @@ -176,6 +177,8 @@ def _dispatch_stage_steps(self, single_import: str, stage_num: int) -> List[Any] step_jobs = self._trigger_place(calc, [single_import]) elif step_type == "STAT_VAR_AGGREGATION": step_jobs = self._trigger_stat_var(calc, [single_import]) + elif step_type == "STAT_VAR_CALCULATION": + step_jobs = self._trigger_stat_var_calculation(calc, [single_import]) elif step_type == "LINKED_EDGES": step_jobs = self._trigger_linked_edges(calc, [single_import]) elif step_type == "PROVENANCE_SUMMARY": @@ -221,8 +224,8 @@ def _wait_for_jobs(self, job_ids: List[str], poll_interval: int = 5) -> None: def _trigger_place(self, config: Dict[str, Any], applicable_imports: List[str]) -> List[Any]: """Triggers place-level rollup aggregations.""" place_cfg = config.get("place_aggregation", {}) - from_type = place_cfg.get("from_place_types") or config.get("source_type") - to_type = place_cfg.get("to_place_types") or config.get("destination_type") + from_type = place_cfg["from_place_types"] + to_type = place_cfg["to_place_types"] logging.info(f" -> Place Rollup: {from_type} -> {to_type} for imports {applicable_imports}") 
generator = PlaceAggregationGenerator(self.executor, self.is_base_dc) @@ -238,15 +241,7 @@ def _trigger_stat_var(self, config: Dict[str, Any], applicable_imports: List[str """Triggers statistical variable aggregations.""" stat_cfg = config.get("stat_var_aggregation", {}) aggregations = stat_cfg.get("aggregations", []) - output_import_name = config.get("output_import") or config.get("output_import_name") - - # Backwards compatibility fallback for single item config - if not aggregations and "ancestor_sv_id" in config: - aggregations = [{ - "ancestor_sv_id": config["ancestor_sv_id"], - "source_sv_ids": config["source_sv_ids"], - "skip_all_sources_present_check": config.get("skip_all_sources_present_check", False) - }] + output_import_name = config.get("output_import") generator = StatVarAggregator(self.executor, self.is_base_dc) jobs = [] @@ -268,6 +263,20 @@ def _trigger_stat_var(self, config: Dict[str, Any], applicable_imports: List[str return jobs + def _trigger_stat_var_calculation(self, config: Dict[str, Any], applicable_imports: List[str]) -> List[Any]: + """Triggers statistical variable calculations.""" + calc_cfg = config.get("stat_var_calculation", {}) + calculations = calc_cfg.get("calculations", []) + output_import_name = config.get("output_import") + + logging.info(f" -> Stat Var Calculation for imports {applicable_imports}") + generator = StatVarCalculationGenerator(self.executor, self.is_base_dc) + return generator.calculate_stat_vars( + calculations=calculations, + import_names=applicable_imports, + output_import_name=output_import_name + ) + def _trigger_linked_edges(self, config: Dict[str, Any], applicable_imports: List[str]) -> List[Any]: """Triggers linked edge aggregations.""" logging.info(f" -> Linked Edges Aggregation for imports {applicable_imports}") diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py index 3646b8cc2..fa431e74f 100644 --- 
a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py @@ -95,6 +95,7 @@ def test_directory_config_loading(self, mock_executor): @patch('aggregation.orchestrator.BigQueryExecutor') @patch('aggregation.orchestrator.PlaceAggregationGenerator') @patch('aggregation.orchestrator.StatVarAggregator') +@patch('aggregation.orchestrator.StatVarCalculationGenerator') class TestOrchestratorExecution(unittest.TestCase): """Tests stage execution, verifying job submission and synchronization.""" @@ -115,7 +116,7 @@ def setUp(self): def tearDown(self): self.tmpdir.cleanup() - def test_run_synchronized_pipeline(self, mock_sv_agg, mock_place_gen, mock_executor_cls): + def test_run_synchronized_pipeline(self, mock_calc_gen, mock_sv_agg, mock_place_gen, mock_executor_cls): """Tests complete synchronized run pipeline for an import across stages.""" mock_job1 = MagicMock() mock_job1.job_id = "job-place-1" From 3b1baf7f3ee1d4d52285a1036a7e71554f1e6036 Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Fri, 3 Jul 2026 11:04:15 +0530 Subject: [PATCH 32/33] refactor(aggregation): make get_active_stages_for_import private and test via public API --- .../workflow/ingestion-helper/aggregation/orchestrator.py | 6 +++--- .../ingestion-helper/aggregation/orchestrator_test.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py index 9ac6c5a31..10f36979a 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py @@ -95,7 +95,7 @@ def run(self, active_imports: List[str]) -> None: for single_import in active_imports: logging.info(f"=== Starting Aggregation Pipeline for Import: '{single_import}' ===") - active_stages = self.get_active_stages_for_import(single_import) + active_stages 
= self._get_active_stages_for_import(single_import) if not active_stages: logging.info(f"No aggregation steps configured for import '{single_import}'. Skipping.") @@ -107,7 +107,7 @@ def run(self, active_imports: List[str]) -> None: logging.info(f"=== Successfully completed all aggregation stages for Import: '{single_import}' ===") - def get_active_stages_for_import(self, single_import: str) -> List[int]: + def _get_active_stages_for_import(self, single_import: str) -> List[int]: """Returns a sorted list of unique active stage numbers for a single import. Args: @@ -126,7 +126,7 @@ def get_active_stages(self, active_imports: List[str]) -> List[int]: """Returns a sorted list of unique active stage numbers across active imports.""" stages = set() for single_import in active_imports: - stages.update(self.get_active_stages_for_import(single_import)) + stages.update(self._get_active_stages_for_import(single_import)) return sorted(list(stages)) def execute_stage(self, stage_num: int, active_imports: List[str]) -> List[str]: diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py index fa431e74f..6acded88f 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py @@ -72,12 +72,12 @@ def setUp(self): def tearDown(self): self.tmpdir.cleanup() - def test_get_active_stages_for_import(self, mock_executor): + def test_get_active_stages(self, mock_executor): """Tests getting active stages for matching and non-matching imports.""" - stages = self.orchestrator.get_active_stages_for_import("USFed_Census") + stages = self.orchestrator.get_active_stages(["USFed_Census"]) self.assertEqual(stages, [1, 2]) - stages = self.orchestrator.get_active_stages_for_import("OtherImport") + stages = self.orchestrator.get_active_stages(["OtherImport"]) self.assertEqual(stages, []) def 
test_directory_config_loading(self, mock_executor): From de358cfaabfccaad0ecf6a94e21b13a825eea72d Mon Sep 17 00:00:00 2001 From: Sandeep Tuniki Date: Fri, 3 Jul 2026 13:16:31 +0530 Subject: [PATCH 33/33] feat(aggregation): integrate AggregationOrchestrator into Cloud Run Job with dry_run support --- .../workflow/aggregation-helper/Dockerfile | 22 ++++++- .../aggregation-helper/cloudbuild.yaml | 1 + pipeline/workflow/aggregation-helper/main.py | 60 +++++++++++++++---- pipeline/workflow/build-services.yaml | 2 +- .../aggregation/bq_executor.py | 10 +++- .../aggregation/orchestrator.py | 13 +++- .../aggregation/orchestrator_test.py | 28 ++++++++- 7 files changed, 114 insertions(+), 22 deletions(-) diff --git a/pipeline/workflow/aggregation-helper/Dockerfile b/pipeline/workflow/aggregation-helper/Dockerfile index b782018c8..8934dd82a 100644 --- a/pipeline/workflow/aggregation-helper/Dockerfile +++ b/pipeline/workflow/aggregation-helper/Dockerfile @@ -14,13 +14,29 @@ FROM python:3.12-slim +# Copy uv binary +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + # Allow statements and log messages to immediately appear in the logs ENV PYTHONUNBUFFERED True WORKDIR /app -# Copy local code to the container image -COPY main.py . +# Copy dependency definition files to leverage Docker layer caching +COPY ingestion-helper/pyproject.toml ingestion-helper/uv.lock ingestion-helper/__init__.py ./ + +# Install production dependencies (without the project itself) +RUN uv sync --no-dev --no-install-project + +# Copy ingestion-helper code (includes aggregation package) and main.py +COPY ingestion-helper/ . +COPY aggregation-helper/main.py . 
+ +# Install the project itself +RUN uv sync --no-dev + +# Place the virtual environment's bin directory on the PATH +ENV PATH="/app/.venv/bin:$PATH" -# Run the script +# Run the Cloud Run Job script ENTRYPOINT ["python", "main.py"] diff --git a/pipeline/workflow/aggregation-helper/cloudbuild.yaml b/pipeline/workflow/aggregation-helper/cloudbuild.yaml index b3fc0d318..4c3f78cf1 100644 --- a/pipeline/workflow/aggregation-helper/cloudbuild.yaml +++ b/pipeline/workflow/aggregation-helper/cloudbuild.yaml @@ -17,6 +17,7 @@ steps: - name: 'gcr.io/cloud-builders/docker' args: [ 'build', + '-f', 'aggregation-helper/Dockerfile', '-t', '${_AR_REPO_URL}/${_IMAGE_NAME}:${_TAG}', '-t', '${_AR_REPO_URL}/${_IMAGE_NAME}:${_VERSION}', '.' diff --git a/pipeline/workflow/aggregation-helper/main.py b/pipeline/workflow/aggregation-helper/main.py index e74a7f996..9438774da 100644 --- a/pipeline/workflow/aggregation-helper/main.py +++ b/pipeline/workflow/aggregation-helper/main.py @@ -12,20 +12,39 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Aggregation Helper Cloud Run Job skeleton.""" +"""Aggregation Helper Cloud Run Job execution entry point.""" import argparse import json import logging +import os import sys +from aggregation import AggregationOrchestrator + + def main(): logging.basicConfig(level=logging.INFO) - logging.info("Starting Aggregation Helper Job") + logging.info("Starting Aggregation Helper Cloud Run Job...") parser = argparse.ArgumentParser(description="Run aggregation helper job.") - parser.add_argument("--import_list", help="JSON string representing the list of imports to process.") - + parser.add_argument( + "--import_list", + help="JSON string representing the list of imports to process." + ) + parser.add_argument( + "--dry_run", + action="store_true", + default=True, + help="Run in dry-run mode without executing jobs (default: True)." 
+ ) + parser.add_argument( + "--execute", + action="store_false", + dest="dry_run", + help="Disable dry-run mode and execute BigQuery aggregation jobs." + ) + args = parser.parse_args() if not args.import_list: @@ -34,17 +53,36 @@ def main(): try: import_list = json.loads(args.import_list) - logging.info(f"Received import list: {import_list}") + logging.info(f"Received active imports to process: {import_list}") except json.JSONDecodeError as e: logging.error(f"Failed to parse import_list JSON: {e}") sys.exit(1) - # Dummy logic - logging.info("Processing aggregation (dummy)...") - for imp in import_list: - logging.info(f"Processing import: {imp}") - - logging.info("Aggregation Helper Job completed successfully.") + connection_id = os.environ.get("BQ_SPANNER_CONN_ID") + project_id = os.environ.get("PROJECT_ID") + instance_id = os.environ.get("SPANNER_INSTANCE_ID") + database_id = os.environ.get("SPANNER_DATABASE_ID") or os.environ.get("SPANNER_GRAPH_DATABASE_ID") + location = os.environ.get("LOCATION") + + if not all([connection_id, project_id, instance_id, database_id]): + logging.error( + f"Missing required environment variables. 
connection_id={connection_id}, " + f"project_id={project_id}, instance_id={instance_id}, database_id={database_id}" + ) + sys.exit(1) + + orchestrator = AggregationOrchestrator( + connection_id=connection_id, + project_id=project_id, + instance_id=instance_id, + database_id=database_id, + location=location, + ) + + logging.info(f"Executing AggregationOrchestrator pipeline (dry_run={args.dry_run}) for imports: {import_list}") + orchestrator.run(active_imports=import_list, dry_run=args.dry_run) + logging.info("Aggregation Helper Cloud Run Job completed successfully.") + if __name__ == "__main__": main() diff --git a/pipeline/workflow/build-services.yaml b/pipeline/workflow/build-services.yaml index dde23e970..f6bf5f530 100644 --- a/pipeline/workflow/build-services.yaml +++ b/pipeline/workflow/build-services.yaml @@ -36,7 +36,7 @@ steps: - id: 'build-aggregation-helper' name: 'gcr.io/cloud-builders/gcloud' - args: ['builds', 'submit', 'aggregation-helper', '--config', 'aggregation-helper/cloudbuild.yaml', '--substitutions', '_AR_REPO_URL=${_AR_REPO_URL},_VERSION=${_VERSION}'] + args: ['builds', 'submit', '.', '--config', 'aggregation-helper/cloudbuild.yaml', '--substitutions', '_AR_REPO_URL=${_AR_REPO_URL},_VERSION=${_VERSION}'] dir: 'pipeline/workflow' waitFor: ['-'] diff --git a/pipeline/workflow/ingestion-helper/aggregation/bq_executor.py b/pipeline/workflow/ingestion-helper/aggregation/bq_executor.py index 58f4f992e..010e8606b 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/bq_executor.py +++ b/pipeline/workflow/ingestion-helper/aggregation/bq_executor.py @@ -37,8 +37,14 @@ def __init__(self, self.location = location # TODO: Remove run_sequential logic once DCP migrates to async execution. 
self.run_sequential = run_sequential - self.client = bigquery.Client(project=self.project_id, - location=self.location) + self._client: Optional[bigquery.Client] = None + + @property + def client(self) -> bigquery.Client: + """Lazily initializes and returns the BigQuery client.""" + if self._client is None: + self._client = bigquery.Client(project=self.project_id, location=self.location) + return self._client def get_spanner_destination_uri(self) -> str: """Returns the Spanner destination URI for EXPORT DATA.""" diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py index 10f36979a..bf42d2d53 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator.py @@ -82,7 +82,7 @@ def __init__( else: self.calculations = validate_config(target_config, schema_file_path) - def run(self, active_imports: List[str]) -> None: + def run(self, active_imports: List[str], dry_run: bool = True) -> None: """Executes aggregations independently for each active import. Blocks and synchronizes stage progression for each import: @@ -90,8 +90,11 @@ def run(self, active_imports: List[str]) -> None: Args: active_imports: List of active import dataset names to process. + dry_run: If True, logs imports and active stages without executing BigQuery jobs. """ - logging.info(f"Starting Aggregation Orchestrator run for active imports: {active_imports}") + logging.info( + f"Starting Aggregation Orchestrator run (dry_run={dry_run}) for active imports: {active_imports}" + ) for single_import in active_imports: logging.info(f"=== Starting Aggregation Pipeline for Import: '{single_import}' ===") @@ -101,6 +104,12 @@ def run(self, active_imports: List[str]) -> None: logging.info(f"No aggregation steps configured for import '{single_import}'. 
Skipping.") continue + if dry_run: + logging.info( + f"Detected active stage(s) {active_stages} for import '{single_import}'. Skipping execution because dry_run=True." + ) + continue + for stage_num in active_stages: logging.info(f"--- Triggering Stage {stage_num} for import '{single_import}' ---") self._execute_and_synchronize_stage(single_import, stage_num) diff --git a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py index 6acded88f..95ed3e9e6 100644 --- a/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py +++ b/pipeline/workflow/ingestion-helper/aggregation/orchestrator_test.py @@ -116,8 +116,14 @@ def setUp(self): def tearDown(self): self.tmpdir.cleanup() - def test_run_synchronized_pipeline(self, mock_calc_gen, mock_sv_agg, mock_place_gen, mock_executor_cls): - """Tests complete synchronized run pipeline for an import across stages.""" + def test_run_dry_run_true(self, mock_calc_gen, mock_sv_agg, mock_place_gen, mock_executor_cls): + """Tests that run with dry_run=True logs imports and skips job submission.""" + self.orchestrator.run(active_imports=["USFed_Census"], dry_run=True) + mock_place_gen.return_value.aggregate_places.assert_not_called() + mock_sv_agg.return_value.aggregate_stat_vars.assert_not_called() + + def test_run_dry_run_false(self, mock_calc_gen, mock_sv_agg, mock_place_gen, mock_executor_cls): + """Tests that run with dry_run=False submits BigQuery jobs across stages.""" mock_job1 = MagicMock() mock_job1.job_id = "job-place-1" mock_place_gen.return_value.aggregate_places.return_value = mock_job1 @@ -129,7 +135,7 @@ def test_run_synchronized_pipeline(self, mock_calc_gen, mock_sv_agg, mock_place_ self.orchestrator.executor = MagicMock() self.orchestrator.executor.get_jobs_status.return_value = {"status": "DONE"} - self.orchestrator.run(active_imports=["USFed_Census"]) + self.orchestrator.run(active_imports=["USFed_Census"], dry_run=False) 
mock_place_gen.return_value.aggregate_places.assert_called_once_with( import_names=["USFed_Census"], @@ -146,6 +152,22 @@ def test_run_synchronized_pipeline(self, mock_calc_gen, mock_sv_agg, mock_place_ skip_all_sources_present_check=True ) + def test_execute_stage(self, mock_calc_gen, mock_sv_agg, mock_place_gen, mock_executor_cls): + """Tests manual execution of a specific stage.""" + mock_job1 = MagicMock() + mock_job1.job_id = "job-place-1" + mock_place_gen.return_value.aggregate_places.return_value = mock_job1 + + job_ids = self.orchestrator.execute_stage(1, ["USFed_Census"]) + + mock_place_gen.return_value.aggregate_places.assert_called_once_with( + import_names=["USFed_Census"], + source_type="County", + destination_type="State", + allow_multiple_to_places=False + ) + self.assertEqual(job_ids, ["job-place-1"]) + if __name__ == '__main__': unittest.main()