Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 45 additions & 14 deletions src/midst_toolkit/attacks/tartan_federer/tartan_federer_attack.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import csv
import os
from collections.abc import Generator
from dataclasses import replace as dc_replace
Comment thread
bzamanlooy marked this conversation as resolved.
from logging import INFO
from pathlib import Path
from typing import Any
Expand Down Expand Up @@ -98,6 +99,7 @@ def mixed_loss(


# TODO: Unify this with the Dataset.from_df function.
# TODO: Noise scale is always called with a value of 0 for the attack.
Comment thread
bzamanlooy marked this conversation as resolved.
def make_dataset_from_df_with_loaded(
data: pd.DataFrame,
transformation: Transformations,
Expand All @@ -108,27 +110,26 @@ def make_dataset_from_df_with_loaded(
noise_scale: float = 0,
) -> Dataset:
"""
Create a dataset using artifacts.
Makes a dataset from a dataframe with loaded transformations.

Args:
data: Raw data to be used for creating the dataset.
transformation: Transformations that one might apply to the dataset, including NaN policies etc.
is_target_conditioned: Enum indicating how, if at all, the model uses a target for generation conditioning.
table_metadata: Meta data about the table or tables.
label_encoders: Encoders that were used to encode the categorical data.
numerical_transform: Transformations that should be applied to the numerical data. Defaults to None.
noise_scale: The scale of the noise to add to the categorical features. Noise is drawn from a normal
distribution with standard deviation of ``noise_scale``. Defaults to 0.
data: The dataframe to make the dataset from.
transformation: The transformations to apply to the data.
is_target_conditioned: Enum indicating how, if at all, the model uses a target for generation conditioning.
Comment thread
bzamanlooy marked this conversation as resolved.
table_metadata: The metadata for the table.
label_encoders: The label encoders for the categorical columns.
numerical_transform: The numerical transform to apply to the data.
noise_scale: The scale of the noise to add to the data.
Comment thread
bzamanlooy marked this conversation as resolved.

Returns:
A full dataset constructed of the various pieces.
A dataset object.
"""
categorical_column_names, numerical_column_names = get_categorical_and_numerical_column_names(
table_metadata,
is_target_conditioned,
)
numerical_features = {DataSplit.TRAIN.value: data[numerical_column_names].values.astype(np.float32)}
categorical_features = {DataSplit.TRAIN.value: data[categorical_column_names].to_numpy(dtype=np.str_)}
categorical_features = {DataSplit.TRAIN.value: data[categorical_column_names].to_numpy()}
targets = {DataSplit.TRAIN.value: data[[table_metadata.target_column_name]].values.astype(np.float32)}

if len(categorical_column_names) > 0:
Expand All @@ -153,6 +154,13 @@ def make_dataset_from_df_with_loaded(
numerical_features = categorical_features

target_info = TargetInfo(policy=None, mean=None, std=None)

# Apply the model's pre-fitted numerical transform directly instead of re-fitting a new one.
# Calling transform_dataset() would fit a brand new QuantileTransformer on the MIA data,
# which produces a different normalization than the model saw during training, destroying signal.
if numerical_transform is not None:
numerical_features = {k: numerical_transform.transform(v) for k, v in numerical_features.items()}
Comment thread
bzamanlooy marked this conversation as resolved.

dataset = Dataset(
numerical_features=numerical_features,
categorical_features=None,
Expand All @@ -163,7 +171,9 @@ def make_dataset_from_df_with_loaded(
categorical_transform=None,
numerical_transform=numerical_transform,
)
return transform_dataset(dataset, transformation, None)
# Use a no-normalization transformation since we've already applied the model's scaler above.
transformation_no_norm = dc_replace(transformation, normalization=None)
return transform_dataset(dataset, transformation_no_norm, None)


def get_dataset(
Expand Down Expand Up @@ -394,7 +404,7 @@ def prepare_dataframe(
return filter_dataframe(merged_data, df_data, columns_for_deduplication)


def train_tartan_federer_attack_classifier(
def train_tartan_federer_attack_classifier( # noqa: PLR0915, PLR0912
train_indices: list[int],
val_indices: list[int] | None,
timesteps: list[int],
Expand Down Expand Up @@ -448,7 +458,28 @@ def train_tartan_federer_attack_classifier(
population_df_for_validation = pd.read_csv(population_data_dir / "population_dataset_for_validating_attack.csv")
log(INFO, "Population datasets for validating loaded.")

noise_dimension = len([col for col in population_df_for_training.columns if "_id" not in col])
# Fix 1: derive noise dimension from the actual diffusion model's num_numerical_features rather
Comment thread
bzamanlooy marked this conversation as resolved.
# than from the population dataframe column count. The mixed_loss function slices
# x[:, :diffusion.num_numerical_features], so the noise vectors must have exactly that length.
# We load the first available model to read this value, then discard it.
first_model_number = (train_indices + (val_indices or []))[0]
Comment thread
bzamanlooy marked this conversation as resolved.
first_model_dir = model_data_dir / f"{model_type}_{first_model_number}"
first_model_path = first_model_dir / target_model_subdir

if model_type != "tabddpm":
raise ValueError(
f"Unsupported model_type {model_type}. Tartan Federer Attack is only supported for ClavaDDPM-single-table models."
)
# TODO: We should read this from the metadata instead.
_relation_order = [("None", "trans")]
Comment thread
bzamanlooy marked this conversation as resolved.
for _parent, _child in _relation_order:
_ckpt_path = first_model_path / f"{_parent}_{_child}_ckpt.pkl"
with open(_ckpt_path, "rb") as _f:
_probe_model = CustomUnpickler(_f).load()
noise_dimension = _probe_model.diffusion.num_numerical_features
Comment thread
emersodb marked this conversation as resolved.
log(INFO, f"Noise dimension read from diffusion model: {noise_dimension}")
break

input_noise = [np.random.normal(size=noise_dimension).tolist() for _ in range(num_noise_per_time_step)]
input_dimension = len(input_noise) * len(timesteps) * len(additional_timesteps)

Expand Down
7 changes: 7 additions & 0 deletions src/midst_toolkit/models/clavaddpm/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ def from_df(
table_metadata: TableMetadata,
data_split_percentages: list[float] | None = None,
noise_scale: float = 0,
label_encoders_path: str | None = None,
# TODO: Find places in code that have this kind of hardcoded random default and remove (with TESTING)
data_split_random_state: int = 42,
) -> tuple[Dataset, dict[int, LabelEncoder], list[str]]:
Expand Down Expand Up @@ -314,6 +315,8 @@ def from_df(
data_split_percentages: The percentages of the dataset to go into train, val, and test splits. The sum of
the percentages must amount to 1 (within a tolerance of 0.01). Optional, default is [0.7, 0.2, 0.1].
noise_scale: The scale of the noise to add to the categorical features. Optional, default is 0.
label_encoders_path: The path to the label encoders pkl file. If provided, already fitted label encoders
will be loaded from the pkl file; otherwise, they will be fitted on the current data.
data_split_random_state: The random state to use for the data split. Will be passed down to the
``train_test_split`` function from sklearn. Optional, default is 42.

Expand Down Expand Up @@ -377,10 +380,14 @@ def from_df(
column_orders = numerical_column_names + categorical_column_names

# Encode the categorical features and merge them with the numerical features
# Look for pre-fitted label encoders in the parent directories of the data

features, label_encoders = encode_and_merge_features(
categorical_features,
numerical_features,
noise_scale,
categorical_column_names=categorical_column_names if len(categorical_column_names) > 0 else None,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd say don't do this here. Just send the categorical_column_names as is and handle empty lists in the function itself.

label_encoders_path=label_encoders_path,
)

assert isinstance(table_metadata.n_classes, int)
Expand Down
44 changes: 41 additions & 3 deletions src/midst_toolkit/models/clavaddpm/dataset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ def encode_and_merge_features(
categorical_features: ArrayDict | None,
numerical_features: ArrayDict | None,
noise_scale: float,
categorical_column_names: list[str] | None = None,
label_encoders_path: str | None = None,
) -> tuple[ArrayDict, dict[int, LabelEncoder]]:
"""
Merge the categorical with the numerical features for train, validation, and test datasets. Numerical features
Expand All @@ -75,6 +77,9 @@ def encode_and_merge_features(
keys are "train", "val", "test" from the DataSplit enumeration
noise_scale: The scale of the noise to add to the categorical features. Noise is drawn from a normal
distribution with standard deviation of ``noise_scale``.
categorical_column_names: The names of the categorical columns.
label_encoders_path: The path to the label encoders pkl file. If provided, already fitted label encoders
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would add a comment as to why we might do this.

will be loaded from the pkl file, otherwise they will be fitted on the current data.

Returns:
The merged features for train, validation, and test datasets and the label encoders used to do so. The label
Expand All @@ -95,14 +100,47 @@ def encode_and_merge_features(
)
)

# Load pre-fitted label encoders from pkl if provided, otherwise fit on current data
preloaded_encoders: dict[str, LabelEncoder] | None = None
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will note that I was very confused here. Your label encoder dictionary here is indexed by strings (column names), but the label_encoders that are to be returned are indexed by column indices. You do end up taking care of this later on. However, I would suggest you add a comment here to explain what's happening, because the label encoder you're preloading is definitely not of the same "kind" as the ones we are constructing here (that is, it must be constructed somewhere else; we're not reusing an artifact formed by this process).

if label_encoders_path is not None:
_pkl_path = Path(label_encoders_path)
Comment thread
bzamanlooy marked this conversation as resolved.

if not _pkl_path.exists():
raise FileNotFoundError(f"label_encoders_path does not exist: {_pkl_path}")
with open(_pkl_path, "rb") as _f:
preloaded_encoders = pickle.load(_f)

if preloaded_encoders is not None:
assert categorical_column_names is not None, (
"categorical_column_names must be provided when using label_encoders_path."
)

expected_cols = set(categorical_column_names)
available_cols = set(preloaded_encoders.keys())

missing_cols = expected_cols - available_cols
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the categorical_column_names is empty and that is unexpected, you'll throw here anyway. So I don't think you need the

categorical_column_names if len(categorical_column_names) > 0 else None

from above anyway?


if missing_cols:
raise ValueError(
"label_encoders_path is missing encoders for categorical columns: "
f"{sorted(missing_cols)}. "
"Refusing to mix preloaded encoders with freshly fit encoders."
Comment thread
bzamanlooy marked this conversation as resolved.
)

categorical_data_encoded = []
label_encoders = {}
for column in range(all_categorical_data.shape[1]):
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
if preloaded_encoders is not None:
Comment thread
bzamanlooy marked this conversation as resolved.
assert categorical_column_names is not None
label_encoder = preloaded_encoders[categorical_column_names[column]]
encoded_labels = label_encoder.transform(all_categorical_data[:, column]).astype(float)
else:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
Comment on lines +103 to +139
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Don't silently mix cached and freshly fit encoders.

If label_encoders_path is stale or points at the wrong file, missing columns fall through to fit_transform() and you end up with a mixed encoder set that no longer matches the checkpoint you meant to reuse. Validate the full categorical_column_names set up front and fail fast instead of partially re-fitting.

🔧 Suggested guard
     if label_encoders_path is not None:
         _pkl_path = Path(label_encoders_path)
         if _pkl_path.exists():
             with open(_pkl_path, "rb") as _f:
                 preloaded_encoders = pickle.load(_f)
+            if categorical_column_names is not None:
+                missing = set(categorical_column_names) - set(preloaded_encoders)
+                if missing:
+                    raise ValueError(
+                        f"Missing label encoders for categorical columns: {sorted(missing)}"
+                    )
@@
-        else:
+        elif preloaded_encoders is None:
             # Fallback: fit on current data
             label_encoder = LabelEncoder()
             encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
+        else:
+            raise KeyError(f"No cached encoder found for categorical column: {col_name}")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Load pre-fitted label encoders from pkl if provided, otherwise fit on current data
preloaded_encoders: dict[str, LabelEncoder] | None = None
if label_encoders_path is not None:
_pkl_path = Path(label_encoders_path)
if _pkl_path.exists():
with open(_pkl_path, "rb") as _f:
preloaded_encoders = pickle.load(_f)
categorical_data_encoded = []
label_encoders = {}
for column in range(all_categorical_data.shape[1]):
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
col_name = categorical_column_names[column] if categorical_column_names is not None else None
if preloaded_encoders is not None and col_name is not None and col_name in preloaded_encoders:
# Use pre-fitted encoder from full dataset (e.g. 101K rows)
label_encoder = preloaded_encoders[col_name]
encoded_labels = label_encoder.transform(all_categorical_data[:, column]).astype(float)
else:
# Fallback: fit on current data
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
# Load pre-fitted label encoders from pkl if provided, otherwise fit on current data
preloaded_encoders: dict[str, LabelEncoder] | None = None
if label_encoders_path is not None:
_pkl_path = Path(label_encoders_path)
if _pkl_path.exists():
with open(_pkl_path, "rb") as _f:
preloaded_encoders = pickle.load(_f)
if categorical_column_names is not None:
missing = set(categorical_column_names) - set(preloaded_encoders)
if missing:
raise ValueError(
f"Missing label encoders for categorical columns: {sorted(missing)}"
)
categorical_data_encoded = []
label_encoders = {}
for column in range(all_categorical_data.shape[1]):
col_name = categorical_column_names[column] if categorical_column_names is not None else None
if preloaded_encoders is not None and col_name is not None and col_name in preloaded_encoders:
# Use pre-fitted encoder from full dataset (e.g. 101K rows)
label_encoder = preloaded_encoders[col_name]
encoded_labels = label_encoder.transform(all_categorical_data[:, column]).astype(float)
elif preloaded_encoders is None:
# Fallback: fit on current data
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
else:
raise KeyError(f"No cached encoder found for categorical column: {col_name}")
🧰 Tools
🪛 OpenGrep (1.22.0)

[ERROR] 109-109: pickle.load/loads deserializes arbitrary Python objects and can execute arbitrary code. Use a safe format like JSON instead.

(coderabbit.deserialization.python-pickle)

🪛 Ruff (0.15.14)

[error] 109-109: pickle and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue

(S301)

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/midst_toolkit/models/clavaddpm/dataset_utils.py` around lines 103 - 122,
When a label_encoders_path is supplied you must fail fast instead of mixing
preloaded and newly-fitted encoders: after loading preloaded_encoders (from
label_encoders_path) validate that categorical_column_names is not None and that
every name in categorical_column_names exists as a key in preloaded_encoders; if
any are missing, raise a clear error (or return/raise ValueError) rather than
falling back to fitting per-column. Update the loop that currently checks
preloaded_encoders and conditionally fits (the block using preloaded_encoders,
label_encoder, encoded_labels and the fallback LabelEncoder()) to assume
encoders are present when label_encoders_path was provided and only fit new
encoders when no path was provided; include the check up front so you never mix
cached and freshly-fit encoders.


if noise_scale > 0:
# add noise
encoded_labels += np.random.normal(0, noise_scale, encoded_labels.shape)

categorical_data_encoded.append(encoded_labels)
label_encoders[column] = label_encoder

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit():
"model_data_dir": base_path,
"target_model_subdir": Path("."),
"model_type": "tabddpm",
"classifier_hidden_dim": 20,
"classifier_num_epochs": 200,
"classifier_hidden_dim": 100,
"classifier_num_epochs": 20,
"samples_per_train_model": 3000,
"samples_per_val_model": 10,
"num_noise_per_time_step": 30,
"timesteps": [5, 10, 15],
"timesteps": [5, 7, 9],
"additional_timesteps": [0],
"predictions_file_name": "challenge_label_predictions",
# TODO: Make results path a temp directory
Expand All @@ -52,14 +52,14 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit():
roc_auc_test = mia_performance_test["roc_auc"]
tpr_at_fpr_test = mia_performance_test["max_tpr"]

assert roc_auc_train == pytest.approx(0.4469875, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.08, abs=1e-8)
assert roc_auc_train == pytest.approx(0.6315875, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.165, abs=1e-8)

assert roc_auc_val == pytest.approx(0.5054624999999999, abs=1e-8)
assert tpr_at_fpr_val == pytest.approx(0.125, abs=1e-8)
assert roc_auc_val == pytest.approx(0.6732, abs=1e-8)
assert tpr_at_fpr_val == pytest.approx(0.28, abs=1e-8)

assert roc_auc_test == pytest.approx(0.4937875, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.115, abs=1e-8)
assert roc_auc_test == pytest.approx(0.6607, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.19, abs=1e-8)

unset_all_random_seeds()
os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)
Expand Down Expand Up @@ -107,14 +107,14 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit_single_model():
roc_auc_test = mia_performance_test["roc_auc"]
tpr_at_fpr_test = mia_performance_test["max_tpr"]

assert roc_auc_train == pytest.approx(0.5046999999999999, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.09, abs=1e-8)
assert roc_auc_train == pytest.approx(0.6985000000000001, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.33, abs=1e-8)

assert roc_auc_val == pytest.approx(0.47159999999999996, abs=1e-8)
assert tpr_at_fpr_val == pytest.approx(0.12, abs=1e-8)
assert roc_auc_val == pytest.approx(0.7075, abs=1e-8)
assert tpr_at_fpr_val == pytest.approx(0.32, abs=1e-8)

assert roc_auc_test == pytest.approx(0.46390000000000003, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.16, abs=1e-8)
assert roc_auc_test == pytest.approx(0.8042, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.56, abs=1e-8)

unset_all_random_seeds()
os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)
Expand Down Expand Up @@ -162,11 +162,11 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit_no_validation():

assert mia_performance_val is None

assert roc_auc_train == pytest.approx(0.4996999999999999, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.07, abs=1e-8)
assert roc_auc_train == pytest.approx(0.6980999999999999, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.33, abs=1e-8)

assert roc_auc_test == pytest.approx(0.5174, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.13, abs=1e-8)
assert roc_auc_test == pytest.approx(0.7075000000000001, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.32, abs=1e-8)

unset_all_random_seeds()
os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)