Fix HunyuanVideo 1.5 I2V by preprocessing image at pixel resolution instead of latent resolution (#13440)

akshan-main · web-flow · commit 87beae7771f8 · 2026-04-10T09:54:36.000-10:00
Fix HunyuanVideo 1.5 I2V by preprocessing image at pixel resolution instead of latent resolution
diff --git a/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py b/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py
@@ -611,7 +611,7 @@ def prepare_cond_latents_and_mask(
             tuple: (cond_latents_concat, mask_concat) - both are zero tensors for t2v
         """
 
-        batch, channels, frames, height, width = latents.shape
+        batch, channels, frames, latent_height, latent_width = latents.shape
 
         image_latents = self._get_image_latents(
             vae=self.vae,
@@ -626,7 +626,7 @@ def prepare_cond_latents_and_mask(
         latent_condition[:, :, 1:, :, :] = 0
         latent_condition = latent_condition.to(device=device, dtype=dtype)
 
-        latent_mask = torch.zeros(batch, 1, frames, height, width, dtype=dtype, device=device)
+        latent_mask = torch.zeros(batch, 1, frames, latent_height, latent_width, dtype=dtype, device=device)
         latent_mask[:, :, 0, :, :] = 1.0
 
         return latent_condition, latent_mask