Skip to content

Commit ff00e89

Browse files
authored
fix: issues found in comprehensive tests: cache capacity, watsonx
model name, and alora training on macs (#560)
1 parent 0cf5d37 commit ff00e89

4 files changed

Lines changed: 24 additions & 4 deletions

File tree

cli/alora/train.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,13 @@ def train_model(
133133
model_base = AutoModelForCausalLM.from_pretrained(
134134
base_model, device_map=device_map, use_cache=False
135135
)
136+
137+
# `fp16=True` enables CUDA-specific mixed precision via GradScaler, which doesn't function properly on cpu or mps.
138+
# Check all the model's parameters to ensure it's okay to use.
139+
use_fp16 = all(
140+
param.device.type != "cpu" and param.device.type != "mps"
141+
for param in model_base.parameters()
142+
)
136143
except NotImplementedError as e:
137144
if "meta tensor" in str(e):
138145
raise RuntimeError(
@@ -176,7 +183,7 @@ def train_model(
176183
max_seq_length=max_length,
177184
per_device_train_batch_size=batch_size,
178185
gradient_accumulation_steps=grad_accum,
179-
fp16=True,
186+
fp16=use_fp16,
180187
)
181188

182189
trainer = SafeSaveTrainer(
@@ -210,7 +217,7 @@ def train_model(
210217
max_seq_length=max_length,
211218
per_device_train_batch_size=batch_size,
212219
gradient_accumulation_steps=grad_accum,
213-
fp16=True,
220+
fp16=use_fp16,
214221
)
215222

216223
trainer = SafeSaveTrainer(

mellea/backends/cache.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ def get(self, key: str | int) -> Any | None:
6060

6161
def put(self, key: str | int, value: Any):
6262
"""Put a value into the cache."""
63+
if self.capacity == 0:
64+
return
65+
6366
if key in self.cache:
6467
# If the key exists, move it to the end (most recent)
6568
self.cache.pop(key)

mellea/backends/model_ids.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ class ModelIdentifier:
7979
IBM_GRANITE_4_MICRO_3B = ModelIdentifier(
8080
hf_model_name="ibm-granite/granite-4.0-micro",
8181
ollama_name="granite4:micro",
82-
watsonx_name="ibm/granite-4-small",
82+
watsonx_name="ibm/granite-4-h-small", # Keeping hybrid version here for backwards compatibility.
8383
)
8484

8585
# Granite 3.3 Vision Model (2B)

test/cli/test_alora_train.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ def test_alora_config_creation():
2828
mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer
2929

3030
mock_model = Mock()
31+
mock_param = Mock()
32+
mock_param.device.type = "cuda"
33+
mock_model.parameters.return_value = [mock_param]
3134
mock_model_class.from_pretrained.return_value = mock_model
3235

3336
mock_peft_model = Mock()
@@ -102,6 +105,9 @@ def test_lora_config_creation():
102105
mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer
103106

104107
mock_model = Mock()
108+
mock_param = Mock()
109+
mock_param.device.type = "cuda"
110+
mock_model.parameters.return_value = [mock_param]
105111
mock_model_class.from_pretrained.return_value = mock_model
106112

107113
mock_peft_model = Mock()
@@ -175,7 +181,11 @@ def test_invocation_prompt_tokenization():
175181
mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer
176182

177183
# Setup other mocks
178-
mock_model_class.from_pretrained.return_value = Mock()
184+
mock_model = Mock()
185+
mock_param = Mock()
186+
mock_param.device.type = "cuda"
187+
mock_model.parameters.return_value = [mock_param]
188+
mock_model_class.from_pretrained.return_value = mock_model
179189
mock_get_peft_model.return_value = Mock()
180190

181191
mock_ds = MagicMock()

0 commit comments

Comments (0)