From ee8f693e539841ed7cd2e45b668b0e86a67f00e2 Mon Sep 17 00:00:00 2001 From: Akshan Krithick Date: Wed, 17 Jun 2026 15:13:05 -0700 Subject: [PATCH] fix(bria_fibo): fix guidance_embeds, prompt_embeds, tensor-image and multi-image crashes --- .../models/transformers/transformer_bria_fibo.py | 4 ++-- .../pipelines/bria_fibo/pipeline_bria_fibo.py | 14 ++++++++++---- .../bria_fibo/pipeline_bria_fibo_edit.py | 16 +++++++++++----- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_bria_fibo.py b/src/diffusers/models/transformers/transformer_bria_fibo.py index 7ddbccfa47c5..eb7e6cf45720 100644 --- a/src/diffusers/models/transformers/transformer_bria_fibo.py +++ b/src/diffusers/models/transformers/transformer_bria_fibo.py @@ -470,7 +470,7 @@ def __init__( self.time_embed = BriaFiboTimestepProjEmbeddings(embedding_dim=self.inner_dim, time_theta=time_theta) if guidance_embeds: - self.guidance_embed = BriaFiboTimestepProjEmbeddings(embedding_dim=self.inner_dim) + self.guidance_embed = BriaFiboTimestepProjEmbeddings(embedding_dim=self.inner_dim, time_theta=time_theta) self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.inner_dim) self.x_embedder = torch.nn.Linear(self.config.in_channels, self.inner_dim) @@ -555,7 +555,7 @@ def forward( temb = self.time_embed(timestep, dtype=hidden_states.dtype) - if guidance: + if guidance is not None: temb += self.guidance_embed(guidance, dtype=hidden_states.dtype) encoder_hidden_states = self.context_embedder(encoder_hidden_states) diff --git a/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo.py b/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo.py index 1f178066b17d..5738c0044e92 100644 --- a/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo.py +++ b/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo.py @@ -260,6 +260,11 @@ def encode_prompt( ) prompt_embeds = prompt_embeds.to(dtype=self.transformer.dtype) prompt_layers = [tensor.to(dtype=self.transformer.dtype) for tensor in prompt_layers] + else: + raise ValueError( + "`prompt_embeds` cannot be passed on its own; this pipeline also needs the per-layer embeddings " + "computed from `prompt`. Please pass `prompt` instead." + ) if guidance_scale > 1: if isinstance(negative_prompt, list) and negative_prompt[0] is None: @@ -773,10 +778,11 @@ def __call__( for scaled_latent in latents_scaled: curr_image = self.vae.decode(scaled_latent.unsqueeze(0), return_dict=False)[0] curr_image = self.image_processor.postprocess(curr_image.squeeze(dim=2), output_type=output_type) - image.append(curr_image) - if len(image) == 1: - image = image[0] - else: + if output_type == "np": + image.append(curr_image[0]) + else: + image.extend(curr_image) + if output_type == "np": image = np.stack(image, axis=0) # Offload all models diff --git a/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo_edit.py b/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo_edit.py index c2327bbce1c7..0e6bb7ae583b 100644 --- a/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo_edit.py +++ b/src/diffusers/pipelines/bria_fibo/pipeline_bria_fibo_edit.py @@ -418,6 +418,11 @@ def encode_prompt( ) prompt_embeds = prompt_embeds.to(dtype=self.transformer.dtype) prompt_layers = [tensor.to(dtype=self.transformer.dtype) for tensor in prompt_layers] + else: + raise ValueError( + "`prompt_embeds` cannot be passed on its own; this pipeline also needs the per-layer embeddings " + "computed from `prompt`. Please pass `prompt` instead." + ) if guidance_scale > 1: if isinstance(negative_prompt, list) and negative_prompt[0] is None: @@ -807,7 +812,7 @@ def __call__( prompt_layers = prompt_layers + [prompt_layers[-1]] * (total_num_layers_transformer - len(prompt_layers)) # Preprocess image - if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels): + if image is not None: image = self.image_processor.resize(image, height, width) image = self.image_processor.preprocess(image, height, width) @@ -996,10 +1001,11 @@ def __call__( for scaled_latent in latents_scaled: curr_image = self.vae.decode(scaled_latent.unsqueeze(0), return_dict=False)[0] curr_image = self.image_processor.postprocess(curr_image.squeeze(dim=2), output_type=output_type) - image.append(curr_image) - if len(image) == 1: - image = image[0] - else: + if output_type == "np": + image.append(curr_image[0]) + else: + image.extend(curr_image) + if output_type == "np": image = np.stack(image, axis=0) # Offload all models