diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp index d0326a192..ee4b413c5 100644 --- a/src/core/ggml_extend.hpp +++ b/src/core/ggml_extend.hpp @@ -1346,10 +1346,18 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_attention_ext(ggml_context* ctx, v_in = ggml_cast(ctx, v_in, GGML_TYPE_F16); if (mask_in != nullptr) { - mask_in = ggml_transpose(ctx, mask_in); - } - - if (mask_in != nullptr) { + // ggml_flash_attn_ext expects the mask as a contiguous F16 tensor shaped + // [n_kv, n_q, (heads), (batch)] (ne0 = key length, ne1 = query length) and, + // unlike the manual-attention path, does not broadcast the query dimension. + // Some callers (e.g. Chroma/T5) pass a per-key padding mask broadcast over + // queries ([n_kv, 1, ...]); materialize the query dimension to L_q so the + // kernel indexes it correctly. (A bare ggml_transpose here produced a + // [1, n_kv, ...] mask that the kernel silently misreads, yielding NaN/blank + // output for masked flash attention.) + if (mask_in->ne[1] != L_q) { + mask_in = ggml_repeat(ctx, mask_in, + ggml_new_tensor_4d(ctx, mask_in->type, mask_in->ne[0], L_q, mask_in->ne[2], mask_in->ne[3])); + } mask_in = ggml_cast(ctx, mask_in, GGML_TYPE_F16); } diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 8ba4a463a..e98a0327e 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -575,15 +575,6 @@ class StableDiffusionGGML { } } if (is_chroma) { - if ((sd_ctx_params->flash_attn || sd_ctx_params->diffusion_flash_attn) && sd_ctx_params->chroma_use_dit_mask) { - LOG_WARN( - "!!!It looks like you are using Chroma with flash attention. " - "This is currently unsupported. " - "If you find that the generated images are broken, " - "try either disabling flash attention or specifying " - "--chroma-disable-dit-mask as a workaround."); - } - cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), params_backend_for(SDBackendModule::TE), tensor_storage_map,