
Commit dea3c6a

Fareed Sheriff and Hackable Diffusion Authors authored and committed
Refactor Hackable Diffusion for peak XLA performance and Flash Attention support.
PiperOrigin-RevId: 908804155
1 parent 8ab544c commit dea3c6a

3 files changed: 35 additions & 22 deletions


hackable_diffusion/lib/architecture/attention.py

Lines changed: 24 additions & 17 deletions
@@ -128,7 +128,7 @@ def _dot_product_attention(
     *,
     mask: Bool["batch sequence_key"] | None = None,
 ) -> Float["batch sequence_query head*dim"]:
-  """Performs dot product attention.
+  """Performs dot product attention using Flash Attention where possible.
 
   Args:
     q: Query tensor.
@@ -142,22 +142,29 @@ def _dot_product_attention(
     The output tensor.
   """
 
-  b, _, t, _ = q.shape
-
-  # Attention scores
-  attn_logits = jnp.einsum("bhtd,bhsd->bhts", q, k) * rescale
-
-  # We apply the mask to the logits before softmax so that the softmax is zero
-  # for masked tokens.
-  if mask is not None:
-    bcast_mask = jnp.expand_dims(mask, axis=(1, 2))
-    attn_logits = jnp.where(bcast_mask, attn_logits, MASK_LOGITS_VALUE)
-
-  # Softmax and attention weights
-  attn_weights = _stable_softmax(logits=attn_logits)
-
-  # Calculate attention output
-  attn_output = jnp.einsum("bhts,bhsd->bhtd", attn_weights, v)
+  b, _, t, head_d = q.shape
+
+  # Use jax.nn.dot_product_attention for optimized execution.
+  # We broadcast our (B, K) mask to (B, 1, 1, K) to match the required shape.
+  attn_mask = mask[:, jnp.newaxis, jnp.newaxis, :] if mask is not None else None
+
+  # jax.nn.dot_product_attention uses 1/sqrt(d) scaling by default.
+  # We adjust Q to achieve the desired 'rescale' factor.
+  q_scaled = q * (rescale * jnp.sqrt(head_d))
+
+  try:
+    attn_output = jax.nn.dot_product_attention(
+        q_scaled, k, v,
+        mask=attn_mask,
+    )
+  except (AttributeError, TypeError):
+    # Fallback to manual implementation if optimized kernel is unavailable or fails.
+    attn_logits = jnp.einsum("bhtd,bhsd->bhts", q, k) * rescale
+    if mask is not None:
+      bcast_mask = jnp.expand_dims(mask, axis=(1, 2))
+      attn_logits = jnp.where(bcast_mask, attn_logits, MASK_LOGITS_VALUE)
+    attn_weights = _stable_softmax(logits=attn_logits)
+    attn_output = jnp.einsum("bhts,bhsd->bhtd", attn_weights, v)
 
   # Merge heads and project to output dimension
   attn_output = attn_output.transpose(0, 2, 1, 3).reshape(b, t, -1)
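
The two non-obvious pieces of the new fast path are the mask broadcast and the scaling compensation: jax.nn.dot_product_attention scales the logits by 1/sqrt(head_dim) by default, so pre-multiplying q by rescale * sqrt(head_dim) makes the effective factor exactly rescale. The standalone sketch below is not the library code: the shapes, the rescale value, and the padding mask are invented for illustration, and it uses JAX's expected (batch, seq, heads, head_dim) layout. It checks that the fused path matches a manual einsum reference.

import jax
import jax.numpy as jnp

B, T, S, N, H = 2, 8, 8, 4, 16   # batch, query len, key len, heads, head dim
rescale = 0.5                    # illustrative stand-in for the module's factor

kq, kk, kv = jax.random.split(jax.random.PRNGKey(0), 3)
q = jax.random.normal(kq, (B, T, N, H))
k = jax.random.normal(kk, (B, S, N, H))
v = jax.random.normal(kv, (B, S, N, H))
mask = jnp.arange(S)[None, :] < jnp.array([[5], [7]])  # (B, K) key padding mask

# Fused path: broadcast (B, K) -> (B, 1, 1, K) and pre-scale q so the
# kernel's default 1/sqrt(H) factor nets out to exactly `rescale`.
q_scaled = q * (rescale * jnp.sqrt(H))
out_fused = jax.nn.dot_product_attention(q_scaled, k, v, mask=mask[:, None, None, :])

# Reference path: explicit logits, masking, and softmax.
logits = jnp.einsum("btnh,bsnh->bnts", q, k) * rescale
logits = jnp.where(mask[:, None, None, :], logits, -1e9)
weights = jax.nn.softmax(logits, axis=-1)
out_ref = jnp.einsum("bnts,bsnh->btnh", weights, v)

print(jnp.max(jnp.abs(out_fused - out_ref)))  # ~1e-7 in float32

Recent JAX releases also expose a scale= keyword on jax.nn.dot_product_attention; passing scale=rescale directly would avoid pre-scaling q, at the cost of requiring the newer API.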

hackable_diffusion/lib/architecture/dit_blocks.py

Lines changed: 10 additions & 5 deletions
@@ -158,27 +158,30 @@ def __call__(
       The output tensor.
     """
 
+    # Precompute activation for conditioning
+    cond_act = nn.silu(cond)
+
     # Attention Branch
-    x_attn_modulated = self.conditional_norm(x, c=nn.silu(cond))
+    x_attn_modulated = self.conditional_norm(x, c=cond_act)
     attn_out = self.attn(x_attn_modulated, c=None, mask=mask)
     # Optional dropout
     if self.dropout_rate > 0.0:
       attn_out = nn.Dropout(rate=self.dropout_rate)(
           attn_out, deterministic=not is_training
       )
-    gate_msa = self.gate_msa(nn.silu(cond))
+    gate_msa = self.gate_msa(cond_act)
     # Add a sequence dimension [...,None,:] to broadcast to [*batch,seq,dim].
     x = x + gate_msa[..., None, :] * attn_out
 
     # MLP Branch
-    x_mlp_modulated = self.conditional_norm(x, c=nn.silu(cond))
+    x_mlp_modulated = self.conditional_norm(x, c=cond_act)
     mlp_out = self.mlp(x_mlp_modulated, is_training=is_training)
     # Optional dropout
     if self.dropout_rate > 0.0:
       mlp_out = nn.Dropout(rate=self.dropout_rate)(
           mlp_out, deterministic=not is_training
       )
-    gate_mlp = self.gate_mlp(nn.silu(cond))
+    gate_mlp = self.gate_mlp(cond_act)
     # Add a sequence dimension [...,None,:] to broadcast to [*batch,seq,dim].
     x = x + gate_mlp[..., None, :] * mlp_out
     return x
@@ -267,7 +270,9 @@ def __call__(
     hn = h // hp
     wn = w // wp
 
-    x = self.conditional_norm(x, c=nn.silu(cond))
+    # Optimization: compute silu(cond) once
+    cond_act = nn.silu(cond)
+    x = self.conditional_norm(x, c=cond_act)
     x = nn.Dense(
         features=hp * wp * c,
         name="Dense_Out",

hackable_diffusion/lib/architecture/unet_blocks.py

Lines changed: 1 addition & 0 deletions
@@ -192,6 +192,7 @@ def __call__(
         dtype=self.dtype,
     )(x)
 
+    # Optimization: Pre-activate conditioning embedding
     x = self.conditional_norm(x, self.activation_fn(adaptive_norm_emb))
     x = self.activation_fn(x)
     x = nn.Dropout(rate=self.dropout_rate, deterministic=not is_training)(x)
