burn_mamba/mamba2/ssd/
ssd_path.rs

1//! # SSD algorithm selection and input bundle (Mamba-2)
2//!
3//! [`Mamba2SsdPath`] chooses which of the three exact SSD reformulations
4//! ([`super::minimal`] / [`super::serial`] / [`super::serial_recalculated`])
5//! runs, and at what chunk length.  [`Mamba2SsdInput`] bundles the pre-processed
6//! tensors the scan consumes (B/C already GQA-expanded to per-head); its
7//! [`Mamba2SsdInput::run`] dispatches to the path-selected algorithm.
8
9use crate::mamba2::prelude::*;
10use burn::backend::Backend;
11use burn::prelude::*;
12
/// Algorithm selection for the Mamba-2 chunkwise SSD.
///
/// Each variant names one SSD reformulation and carries an optional chunk
/// length. Larger chunk lengths increase the intra-chunk GEMM work and shorten
/// the inter-chunk scan; the optimal value is approximately
/// `√(state_rank · per_head_dim)` (see [`Self::optimal_chunk_len`]). `None`
/// falls back to that optimal value.
///
/// Two paths compare equal only when both the variant and the carried chunk
/// length match, so `Serial(None)` and `Serial(Some(64))` are different values
/// even when `64` happens to be the optimal length for a given block.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Mamba2SsdPath {
    /// Minimal SSD: mostly batched matmuls; backward via autodiff.
    ///
    /// See [`Mamba2SsdInput::ssd_minimal`]. For training, prefer
    /// [`Self::SerialRecalculated`].
    ///
    /// Based on `/mamba_ssm/modules/ssd_minimal.py` from the `state-spaces/mamba`
    /// github reference.
    Minimal(Option<usize>),

    /// (Hybrid) serial SSD: a serial loop over the chunks plus batched matmuls;
    /// backward via autodiff.
    ///
    /// See [`Mamba2SsdInput::ssd_serial`]. For a memory-saving custom backward,
    /// see [`Self::SerialRecalculated`].
    ///
    /// Based on 5 kernels under `/mamba_ssm/ops/triton/` from the
    /// `state-spaces/mamba` github reference:
    /// - `ssd_chunk_state.py` (K1, K3).
    /// - `ssd_bmm.py` (K2).
    /// - `ssd_state_passing.py` (K4).
    /// - `ssd_chunk_scan.py` (K5).
    Serial(Option<usize>),

    /// (Hybrid) serial SSD with a custom, memory-efficient backward that
    /// recomputes the forward intermediates instead of storing them.
    ///
    /// See [`Mamba2SsdInput::ssd_serial_recalculated`]. For a plain autodiff
    /// backward, see [`Self::Serial`].
    ///
    /// Based on the combined kernel `/mamba_ssm/ops/triton/ssd_combined.py` from
    /// the `state-spaces/mamba` github reference.
    SerialRecalculated(Option<usize>),
}
54
55/// SSD input.
56///
57/// All tensors are pre-processed: B/C are already GQA-expanded to per-head.
58pub struct Mamba2SsdInput {
59    /// # Shape
60    /// - `[batch, nchunks, chunk_len, nheads, per_head_dim]`
61    pub x_bnlhp: Tensor<5>,
62    /// # Shape
63    /// - `[batch, nchunks, chunk_len, nheads]`
64    pub dt_bnlh: Tensor<4>,
65    /// # Shape
66    /// - `[nheads]`
67    pub a_decay_h: Tensor<1>,
68    /// B tensor, expanded to per-head.
69    ///
70    /// # Shape
71    /// - `[batch, nchunks, chunk_len, nheads, state_rank]`
72    pub b_bnlhr: Tensor<5>,
73    /// C tensor, expanded to per-head.
74    ///
75    /// # Shape
76    /// - `[batch, nchunks, chunk_len, nheads, state_rank]`
77    pub c_bnlhr: Tensor<5>,
78    /// # Shape
79    /// - `[nheads]`
80    pub d_h: Tensor<1>,
81    /// # Shape
82    /// - `[batch, nheads, per_head_dim, state_rank]`
83    pub initial_state_bhpr: Tensor<4>,
84    /// # Shape
85    /// - `[nheads, per_head_dim, state_rank]`
86    pub init_state_hpr: Option<Tensor<3>>,
87}
88
89impl Mamba2SsdInput {
90    /// Run the [`NaN`/`Inf` guards](crate::utils::sanity) on every input tensor.
91    pub fn sanity(&self) {
92        use crate::modules::sanity as san;
93        san(&self.x_bnlhp);
94        san(&self.dt_bnlh);
95        san(&self.a_decay_h);
96        san(&self.b_bnlhr);
97        san(&self.c_bnlhr);
98        san(&self.d_h);
99        san(&self.initial_state_bhpr);
100        if let Some(ref init_state_hpr) = self.init_state_hpr {
101            san(init_state_hpr);
102        }
103    }
104}
105
106impl Mamba2SsdPath {
107    /// Optimal chunk length, approximately `√(state_rank · per_head_dim)`,
108    /// rounded up to a multiple of 32 and capped at 512.
109    pub fn optimal_chunk_len(state_rank: usize, per_head_dim: usize) -> usize {
110        (state_rank * per_head_dim)
111            .isqrt()
112            .next_multiple_of(32) // rule-of-thumb: common plane dimension.
113            .min(512) // rule-of-thumb: ceiling at 512.
114    }
115
116    /// The chunk length carried by this variant, if any.
117    pub fn chunk_len(&self) -> Option<usize> {
118        match self {
119            Self::Minimal(chunk_len)
120            | Self::Serial(chunk_len)
121            | Self::SerialRecalculated(chunk_len) => *chunk_len,
122        }
123    }
124
125    /// The chunk length carried by this variant, or [`Self::optimal_chunk_len`]
126    /// when unset.
127    pub fn chunk_len_or_optimal(&self, state_rank: usize, per_head_dim: usize) -> usize {
128        self.chunk_len()
129            .unwrap_or_else(|| Self::optimal_chunk_len(state_rank, per_head_dim))
130    }
131
132    /// The recommended default path for a given block: [`Self::SerialRecalculated`]
133    /// with [`Self::optimal_chunk_len`] for the block's dimensions.
134    pub fn default_optimal_from_block<B: Backend>(block: &Mamba2) -> Self {
135        let chunk_len = Self::optimal_chunk_len(block.state_rank, block.per_head_dim());
136        Self::SerialRecalculated(Some(chunk_len))
137    }
138}
139
140impl Mamba2SsdInput {
141    /// Run the selected SSD algorithm on this input.
142    ///
143    /// Dispatches by [`Mamba2SsdPath`] variant to `ssd_minimal`, `ssd_serial`,
144    /// or `ssd_serial_recalculated`.
145    ///
146    /// # Returns
147    /// - `y_bnlhp`: `[batch, nchunks, chunk_len, nheads, per_head_dim]`
148    /// - `final_state_bhpr`: `[batch, nheads, per_head_dim, state_rank]`
149    pub fn run(self, path: &Mamba2SsdPath) -> (Tensor<5>, Tensor<4>) {
150        match path {
151            Mamba2SsdPath::Minimal(_) => self.ssd_minimal(),
152            Mamba2SsdPath::Serial(_) => self.ssd_serial(),
153            Mamba2SsdPath::SerialRecalculated(_) => self.ssd_serial_recalculated(),
154        }
155    }
156}
157
158impl Default for Mamba2SsdPath {
159    fn default() -> Mamba2SsdPath {
160        // Defaults to the SerialRecalculated algorithm with the optimal chunk length.
161        Mamba2SsdPath::SerialRecalculated(None)
162    }
163}
164
165// ---------------------------------------------------------------------------
166// Tests
167// ---------------------------------------------------------------------------
168
169#[cfg(all(test, feature = "_dev-test"))]
170mod tests;
burn_mamba/mamba2/ssd/ssd_path.rs

burn_mamba/mamba2/ssd/
ssd_path.rs