diff --git a/scripts/world_bank/wdi/README.md b/scripts/world_bank/wdi/README.md index ef1f0f5dc6..239d0f7277 100644 --- a/scripts/world_bank/wdi/README.md +++ b/scripts/world_bank/wdi/README.md @@ -146,5 +146,24 @@ If you want to perform "only download", run the below command: python3 worldbank.py --mode=download ``` +### Golden files and validation thresholds in validation_config.json + +The `GOLDENS_CHECK` validator confirms that the import includes a specific set of expected records. This is useful for verifying that critical StatVars, Places, or specific metadata combinations are consistently present in the output. + +The validator compares the input data (usually from the stats data source) against one or more "golden" files (MCF or CSV). + +If any combination of values in a golden file row is missing from the input, the validation fails. The missing golden rows are then listed in the validation report JSON. + +To generate golden files, run the commands below from the `tools/import_validation` directory: +```bash +# Goldens from the output CSV +python3 validator_goldens.py --validate_goldens_input=../../scripts/world_bank/wdi/output/WorldBank.csv --generate_goldens=golden_data/golden_observations.csv --goldens_must_include="ISO3166Alpha3:gs://unresolved_mcf/import_validation/top_100k_places.csv" --generate_goldens_property_sets="ISO3166Alpha3" +``` + +```bash +# Goldens from the summary report +python3 validator_goldens.py --validate_goldens_input="summary_report.csv" --generate_goldens=golden_data/golden_summary_report.csv --generate_goldens_property_sets="StatVar|Units|MinDate|MeasurementMethods|observationPeriod" +``` + We highly recommend the use of the import validation tool for this import which you can find in https://github.com/datacommonsorg/tools/tree/master/import-validation-helper. 
diff --git a/scripts/world_bank/wdi/golden_data/golden_summary_report.csv b/scripts/world_bank/wdi/golden_data/golden_summary_report.csv new file mode 100644 index 0000000000..579c50c0b7 --- /dev/null +++ b/scripts/world_bank/wdi/golden_data/golden_summary_report.csv @@ -0,0 +1,71 @@ +"MeasurementMethods","NumPlaces","ScalingFactors","MinDate","StatVar","observationPeriods","Units" +"[]","186","[]","2000","Count_Death_IntentionalSelfHarm_Male_AsFractionOf_Count_Person_Male","[P1Y]","[Per100000Males]" +"[]","203","[]","1990","Amount_EconomicActivity_GrossNationalIncome_PurchasingPowerParity","[P1Y]","[InternationalDollar]" +"[JointChildMalnutritionEstimate]","165","[100]","1983","Count_Person_Upto4Years_Wasting_AsFractionOf_Count_Person_Upto4Years","[P1Y]","[Percent]" +"[]","144","[]","1994","Count_Person_25OrMoreYears_DoctorateDegree_AsFractionOf_Count_Person_25OrMoreYears","[P1Y]","[]" +"[]","204","[]","1970","Amount_Emissions_CarbonDioxide_PerCapita","[P1Y]","[MetricTon]" +"[]","185","[]","1970","Count_Person_25OrMoreYears_Male_TertiaryEducation_AsFractionOf_Count_Person_25OrMoreYears_Male","[P1Y]","[]" +"[]","218","[]","1960","LifeExpectancy_Person_Female","[P1Y]","[Year]" +"[]","139","[]","1994","Count_Person_25OrMoreYears_Male_DoctorateDegree_AsFractionOf_Count_Person_25OrMoreYears_Male","[P1Y]","[]" +"[]","197","[]","1990","Count_CriminalActivities_MurderAndNonNegligentManslaughter_AsFractionOf_Count_Person","[P1Y]","[Per100000Persons]" +"[]","194","[]","2000","Amount_EconomicActivity_ExpenditureActivity_HealthcareExpenditure_AsFractionOf_Count_Person","[P1Y]","[InternationalDollar, USDollar]" +"[]","202","[100]","1980","Amount_EconomicActivity_ExpenditureActivity_EducationExpenditure_Government_AsFractionOf_Amount_EconomicActivity_ExpenditureActivity_Government","[P1Y]","[Percent]" +"[]","188","[]","1970","Count_Person_25OrMoreYears_Male_BachelorsDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Male","[P1Y]","[]" 
+"[]","218","[]","1960","FertilityRate_Person_Female","[]","[]" +"[WorldBankEstimate]","218","[]","1960","Count_Person_Rural","[P1Y]","[]" +"[]","184","[]","1970","Count_Person_25OrMoreYears_Female_TertiaryEducation_AsFractionOf_Count_Person_25OrMoreYears_Female","[P1Y]","[]" +"[WorldBankEstimate]","218","[]","1960","Count_Person_Urban","[P1Y]","[]" +"[]","165","[]","1983","Count_Person_Upto4Years_Overweight_AsFractionOf_Count_Person_Upto4Years","[P1Y]","[]" +"[]","218","[]","1960","LifeExpectancy_Person_Male","[P1Y]","[Year]" +"[]","218","[]","1960","Count_BirthEvent_LiveBirth_AsFractionOf_Count_Person","[P1Y]","[Per1000Persons]" +"[]","197","[]","1960","MortalityRate_Person_Upto4Years_AsFractionOf_Count_BirthEvent_LiveBirth","[P1Y]","[Per1000LiveBirths]" +"[]","218","[]","1960","Count_Person","[P1Y]","[]" +"[JointChildMalnutritionEstimate]","160","[100]","1986","Count_Person_Upto4Years_Male_Wasting_AsFractionOf_Count_Person_Upto4Years_Male","[P1Y]","[Percent]" +"[]","204","[100]","1970","Amount_EconomicActivity_ExpenditureActivity_EducationExpenditure_Government_AsFractionOf_Amount_EconomicActivity_GrossDomesticProduction_Nominal","[P1Y]","[Percent]" +"[]","188","[]","1970","Count_Person_25OrMoreYears_BachelorsDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears","[P1Y]","[]" +"[AgeAdjustedPrevalence]","165","[]","2000","Count_Person_15OrMoreYears_Female_Smoking_AsFractionOf_Count_Person_15OrMoreYears_Female","[P1Y]","[]" +"[AgeAdjustedPrevalence]","165","[]","2000","Count_Person_15OrMoreYears_Smoking_AsFractionOf_Count_Person_15OrMoreYears","[P1Y]","[]" +"[]","203","[]","1990","Amount_EconomicActivity_GrossNationalIncome_PurchasingPowerParity_PerCapita","[P1Y]","[InternationalDollar]" +"[]","160","[]","1986","Count_Person_Upto4Years_Male_Overweight_AsFractionOf_Count_Person_Upto4Years_Male","[P1Y]","[]" 
+"[]","195","[]","1970","Amount_EconomicActivity_ExpenditureActivity_TertiaryEducationExpenditure_Government_AsFractionOf_Amount_EconomicActivity_ExpenditureActivity_EducationExpenditure_Government","[P1Y]","[]" +"[JointChildMalnutritionEstimate]","159","[100]","1986","Count_Person_Upto4Years_Male_SevereWasting_AsFractionOf_Count_Person_Upto4Years_Male","[P1Y]","[Percent]" +"[]","151","[]","1990","Amount_Consumption_Electricity_PerCapita","[P1Y]","[KilowattHour]" +"[]","180","[]","1990","Amount_Consumption_Energy_PerCapita","[P1Y]","[KilogramOfOilEquivalent]" +"[]","186","[]","2000","Count_Death_IntentionalSelfHarm_Female_AsFractionOf_Count_Person_Female","[P1Y]","[Per100000Females]" +"[AgeAdjustedPrevalence]","165","[]","2000","Count_Person_15OrMoreYears_Male_Smoking_AsFractionOf_Count_Person_15OrMoreYears_Male","[P1Y]","[]" +"[]","149","[]","1990","Count_CriminalActivities_MurderAndNonNegligentManslaughter_Male_AsFractionOf_Count_Person_Male","[P1Y]","[Per100000Males]" +"[WorldBankEstimate]","200","[100]","1970","Amount_Remittance_InwardRemittance_AsFractionOf_Amount_EconomicActivity_GrossDomesticProduction_Nominal","[P1Y]","[Percent]" +"[]","188","[]","1990","Count_Person_15To64Years_InLaborForce_AsFractionOf_Count_Person_15To64Years","[P1Y]","[]" +"[WorldBankEstimate]","171","[]","1963","GiniIndex_EconomicActivity","[P1Y]","[]" +"[]","162","[]","1990","Count_Person_25OrMoreYears_Female_MastersDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Female","[P1Y]","[]" +"[]","170","[]","1990","Count_Person_25OrMoreYears_MastersDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears","[P1Y]","[]" +"[]","152","[]","1990","Count_CriminalActivities_MurderAndNonNegligentManslaughter_Female_AsFractionOf_Count_Person_Female","[P1Y]","[Per100000Females]" +"[]","188","[]","1990","Count_Person_15To64Years_Female_InLaborForce_AsFractionOf_Count_Person_15To64Years_Female","[P1Y]","[]" 
+"[]","104","[100]","1975","Amount_Stock_AsFractionOf_Amount_EconomicActivity_GrossDomesticProduction_Nominal","[P1Y]","[Percent]" +"[]","131","[]","1994","Count_Person_25OrMoreYears_Female_DoctorateDegree_AsFractionOf_Count_Person_25OrMoreYears_Female","[P1Y]","[]" +"[]","215","[]","1961","GrowthRate_Amount_EconomicActivity_GrossDomesticProduction","[P1Y]","[]" +"[WorldBankWeightedAverage]","218","[]","1960","Count_Death_AsAFractionOfCount_Person","[P1Y]","[Per1000Persons]" +"[]","215","[]","1960","Amount_EconomicActivity_GrossDomesticProduction_Nominal","[P1Y]","[USDollar]" +"[]","188","[]","1990","Count_Person_15To64Years_Male_InLaborForce_AsFractionOf_Count_Person_15To64Years_Male","[P1Y]","[]" +"[WorldBankEstimate]","200","[]","1970","Amount_Remittance_InwardRemittance","[P1Y]","[USDollar]" +"[JointChildMalnutritionEstimate]","161","[100]","1983","Count_Person_Upto4Years_SevereWasting_AsFractionOf_Count_Person_Upto4Years","[P1Y]","[Percent]" +"[]","188","[]","1970","Count_Person_25OrMoreYears_Female_BachelorsDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Female","[P1Y]","[]" +"[]","167","[]","1990","Count_Person_25OrMoreYears_Male_MastersDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Male","[P1Y]","[]" +"[WorldHealthOrganizationEstimates]","188","[]","2000","Amount_Consumption_Alcohol_15OrMoreYears_AsFractionOf_Count_Person_15OrMoreYears","[P1Y]","[Liter]" +"[]","188","[]","1990","Count_Person_15OrMoreYears_InLaborForce_Female_AsFractionOf_Count_Person_InLaborForce","[P1Y]","[]" +"[]","215","[]","1960","Count_Product_MobileCellularSubscription_AsFractionOf_Count_Person","[P1Y]","[]" +"[InternationalLaborOrganization]","188","[]","1990","Count_Person_InLaborForce","[P1Y]","[]" +"[]","186","[]","2000","Count_Death_IntentionalSelfHarm_AsFractionOf_Count_Person","[P1Y]","[Per100000Persons]" +"[UnitedNationsIGMEEstimate]","197","[]","1960","Count_Death_0Years_AsFractionOf_Count_BirthEvent_LiveBirth","[P1Y]","[Per1000LiveBirths]" 
+"[JointChildMalnutritionEstimate]","160","[100]","1986","Count_Person_Upto4Years_Female_Wasting_AsFractionOf_Count_Person_Upto4Years_Female","[P1Y]","[Percent]" +"[WorldBankEstimate]","203","[]","1970","Amount_Remittance_OutwardRemittance","[P1Y]","[USDollar]" +"[]","160","[]","1986","Count_Person_Upto4Years_Female_Overweight_AsFractionOf_Count_Person_Upto4Years_Female","[P1Y]","[]" +"[]","214","[100]","1990","Count_Person_IsInternetUser_PerCapita","[P1Y]","[]" +"[]","210","[]","1990","Amount_Production_ElectricityFromNuclearSources_AsFractionOf_Amount_Production_Energy","[P1Y]","[]" +"[JointChildMalnutritionEstimate]","159","[100]","1986","Count_Person_Upto4Years_Female_SevereWasting_AsFractionOf_Count_Person_Upto4Years_Female","[P1Y]","[Percent]" +"[]","185","[]","1970","Count_Person_25OrMoreYears_TertiaryEducation_AsFractionOf_Count_Person_25OrMoreYears","[P1Y]","[]" +"[]","210","[]","1990","Amount_Production_ElectricityFromOilGasOrCoalSources_AsFractionOf_Amount_Production_Energy","[P1Y]","[]" +"[]","218","[]","1961","GrowthRate_Count_Person","[P1Y]","[]" +"[]","213","[]","1990","Amount_Consumption_RenewableEnergy_AsFractionOf_Amount_Consumption_Energy","[P1Y]","[]" +"[]","104","[]","1975","Amount_Stock","[P1Y]","[USDollar]" +"[]","218","[]","1960","LifeExpectancy_Person","[]","[Year]" diff --git a/scripts/world_bank/wdi/manifest.json b/scripts/world_bank/wdi/manifest.json index bc3927141e..eb427c0472 100644 --- a/scripts/world_bank/wdi/manifest.json +++ b/scripts/world_bank/wdi/manifest.json @@ -20,7 +20,8 @@ "WorldBankCountries.csv", "schema_csvs/WorldBankIndicators_prod.csv" ], - "cron_schedule": "0 11 * * 2" + "cron_schedule": "0 11 * * 2", + "validation_config_file": "validation_config.json" } ] } \ No newline at end of file diff --git a/scripts/world_bank/wdi/validation_config.json b/scripts/world_bank/wdi/validation_config.json new file mode 100644 index 0000000000..cf2f722e1e --- /dev/null +++ b/scripts/world_bank/wdi/validation_config.json @@ -0,0 
+1,20 @@ +{ + "schema_version": "1.0", + "rules": [ + { + "rule_id": "check_deleted_records_percent", + "description": "Checks that the percentage of deleted points is within the threshold.", + "validator": "DELETED_RECORDS_PERCENT", + "params": { + "threshold": 0.1 + } + }, + { + "rule_id": "check_goldens_summary_report", + "validator": "GOLDENS_CHECK", + "params": { + "golden_files": "../../../../golden_data/golden_summary_report.csv" + } + } + ] +} \ No newline at end of file diff --git a/scripts/world_bank/wdi/worldbank.py b/scripts/world_bank/wdi/worldbank.py index d864bfeaa2..f411bd8c65 100644 --- a/scripts/world_bank/wdi/worldbank.py +++ b/scripts/world_bank/wdi/worldbank.py @@ -39,6 +39,10 @@ "indicatorSchemaFile", os.path.join(_MODULE_DIR, "schema_csvs/WorldBankIndicators_prod.csv"), "") flags.DEFINE_string('mode', '', 'Options: download or process') +flags.DEFINE_string( + 'historical_gcs_path', + 'gs://resolved_mcf/world_bank/wdi/deleted_rows_06_2026.csv', + 'GCS path to the deleted historical data CSV file') # Remaps the columns provided by World Bank API. WORLDBANK_COL_REMAP = { @@ -504,9 +508,40 @@ def output_csv_and_tmcf_by_grouping(worldbank_dataframe, df = df.replace({'StatisticalVariable': RESOLUTION_TO_EXISTING_DCID}) if saveOutput: logging.info("Writing output csv") - df.drop('IndicatorCode', axis=1).to_csv('output/WorldBank.csv', + output_file_path = 'output/WorldBank.csv' + df.drop('IndicatorCode', axis=1).to_csv(output_file_path, float_format='%.10f', index=False) + + # Read and append historical deleted data from GCS. + try: + logging.info( + f"Reading historical deleted data from GCS: {_FLAGS.historical_gcs_path}" + ) + final_df = pd.read_csv(output_file_path) + deleted_df = pd.read_csv(_FLAGS.historical_gcs_path) + + # Combine dataframes. final_df is placed first so its versions are preferred. 
+ final_df = pd.concat([final_df, deleted_df], ignore_index=True) + + # Deduplicate based on composite keys, keeping the first occurrence (from final_df) + composite_keys = [ + 'StatisticalVariable', 'ISO3166Alpha3', 'Year', + 'observationPeriod', 'unit', 'measurementMethod', + 'scalingFactor' + ] + final_df = final_df.drop_duplicates(subset=composite_keys, + keep='first') + final_df.to_csv(output_file_path, + float_format='%.10f', + index=False) + logging.info( + "Successfully merged and de-duplicated deleted historical data." + ) + except Exception as e: + logging.warning( + f"Could not read historical deleted data from GCS: {e}. Proceeding with fresh data only." + ) else: return df except Exception as e: diff --git a/tools/import_validation/runner.py b/tools/import_validation/runner.py index e82714c360..bcf83fb06e 100644 --- a/tools/import_validation/runner.py +++ b/tools/import_validation/runner.py @@ -41,6 +41,7 @@ class ValidationRunner: def __init__(self, validation_config_path: str, differ_output: str, stats_summary: str, lint_report: str, validation_output: str): + self.validation_config_path = validation_config_path self.config = ValidationConfig(validation_config_path) self.validation_output = validation_output self.validator = Validator() @@ -247,6 +248,12 @@ def run_validations(self) -> tuple[bool, list[ValidationResult]]: if output_dir: rule_params.setdefault('output_path', output_dir) + # Inject the directory containing the validation config file as config_dir. + # This allows individual validators to resolve relative paths relative to the config file's directory. 
def _resolve_paths(path: str, config_dir: str) -> str:
    """Resolves relative local paths against the validation config directory.

    Only relative local filesystem paths are rewritten. Absolute paths and
    URL-style paths with a scheme (for example 'gs://bucket/file.csv') are
    returned untouched, because os.path.isabs() reports such URLs as
    relative and joining them onto config_dir would corrupt them.

    Args:
      path: A file path, or several paths joined with commas. Non-string
        values (for example already-loaded nodes passed as a dict) and empty
        strings are returned unchanged.
        NOTE(review): comma is assumed to be the multi-file separator used by
        the file loaders in this module -- confirm against file_util.
      config_dir: Directory containing the validation config file. When it is
        empty or None no resolution is attempted.

    Returns:
      The path string with every relative local entry prefixed by config_dir,
      or the original value when nothing needs resolving.
    """
    if not isinstance(path, str) or not path or not config_dir:
        return path

    resolved_entries = []
    for entry in path.split(','):
        candidate = entry.strip()
        # Leave blanks, remote URLs (scheme://...) and absolute paths as is.
        if not candidate or '://' in candidate or os.path.isabs(candidate):
            resolved_entries.append(entry)
            continue
        resolved = os.path.join(config_dir, candidate)
        logging.info("Resolved relative path '%s' to '%s'", candidate,
                     resolved)
        resolved_entries.append(resolved)
    return ','.join(resolved_entries)
if isinstance(inputs, dict): input_nodes = inputs