From 232e05b088508a7bdddfcf7a959818e97b822040 Mon Sep 17 00:00:00 2001 From: Mark Caldwell Date: Tue, 9 Jun 2026 11:18:42 -0700 Subject: [PATCH 1/2] fix: correct mask shape for masked flash attention The flash-attention branch in ggml_ext_attention_ext passed the attention mask to ggml_flash_attn_ext after a ggml_transpose, turning a [n_kv, n_q] mask into [n_q, n_kv] (or a query-broadcast [n_kv, 1] mask into [1, n_kv]). ggml_flash_attn_ext expects the mask as a contiguous F16 tensor shaped [n_kv, n_q, ...] and does not broadcast the query dimension, so the transposed mask was misindexed by the kernel and produced NaN / all-blank output. ggml's mask shape assertion (ggml_can_repeat_rows) is currently disabled, so this ran silently instead of erroring. Models that need a real attention mask with flash attention enabled (e.g. Chroma, which passes a T5 padding mask of shape [n_kv, 1] broadcast over queries) therefore rendered a blank image whenever --diffusion-fa was set. Drop the transpose and, for query-broadcast masks, materialize the query dimension to L_q with ggml_repeat before the F16 cast. The change is guarded by mask_in != nullptr, so the common no-mask flash-attention path is unchanged. Verified on Chroma1-HD at 1024x1024: masked flash attention now produces a correct image matching the non-flash-attention reference on both the CUDA and Vulkan backends. 
--- src/core/ggml_extend.hpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp index d0326a192..ee4b413c5 100644 --- a/src/core/ggml_extend.hpp +++ b/src/core/ggml_extend.hpp @@ -1346,10 +1346,18 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_attention_ext(ggml_context* ctx, v_in = ggml_cast(ctx, v_in, GGML_TYPE_F16); if (mask_in != nullptr) { - mask_in = ggml_transpose(ctx, mask_in); - } - - if (mask_in != nullptr) { + // ggml_flash_attn_ext expects the mask as a contiguous F16 tensor shaped + // [n_kv, n_q, (heads), (batch)] (ne0 = key length, ne1 = query length) and, + // unlike the manual-attention path, does not broadcast the query dimension. + // Some callers (e.g. Chroma/T5) pass a per-key padding mask broadcast over + // queries ([n_kv, 1, ...]); materialize the query dimension to L_q so the + // kernel indexes it correctly. (A bare ggml_transpose here produced a + // [1, n_kv, ...] mask that the kernel silently misreads, yielding NaN/blank + // output for masked flash attention.) + if (mask_in->ne[1] != L_q) { + mask_in = ggml_repeat(ctx, mask_in, + ggml_new_tensor_4d(ctx, mask_in->type, mask_in->ne[0], L_q, mask_in->ne[2], mask_in->ne[3])); + } mask_in = ggml_cast(ctx, mask_in, GGML_TYPE_F16); } From 18cd3e88da063f745ad1479190ef6170f3def9d8 Mon Sep 17 00:00:00 2001 From: Mark Caldwell Date: Tue, 9 Jun 2026 18:00:39 -0700 Subject: [PATCH 2/2] fix: remove stale Chroma + flash-attention 'unsupported' warning The masked-flash-attention fix on this branch makes Chroma + flash attention render correctly, so the warning telling users it is unsupported and to disable flash attention now gives the wrong advice. Drop it. (Spotted by @wbruna in review.) 
--- src/stable-diffusion.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 8ba4a463a..e98a0327e 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -575,15 +575,6 @@ class StableDiffusionGGML { } } if (is_chroma) { - if ((sd_ctx_params->flash_attn || sd_ctx_params->diffusion_flash_attn) && sd_ctx_params->chroma_use_dit_mask) { - LOG_WARN( - "!!!It looks like you are using Chroma with flash attention. " - "This is currently unsupported. " - "If you find that the generated images are broken, " - "try either disabling flash attention or specifying " - "--chroma-disable-dit-mask as a workaround."); - } - cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), params_backend_for(SDBackendModule::TE), tensor_storage_map,