burn_mamba/mamba2/
cache.rs

1//! # Mamba-2 Inference Caches
2//!
3//! This module defines the state that must be preserved between calls during
4//! autoregressive (token-by-token) generation.  During *training* or *prefill*
5//! the full sequence is available at once and the chunked SSD algorithm is used
6//! (see [`Mamba2::forward`]).  During *decoding* the model
7//! processes one token per step and the SSM operates in its pure recurrent
8//! form (see [`Mamba2::step`]):
9//!
10//! ```text
11//!   hₜ = Āₜ hₜ₋₁ + B̄ₜ xₜ        (state update)
12//!   yₜ = Cₜᵀ hₜ + D xₜ            (output)
13//! ```
14//!
15//! Two pieces of state are required per layer:
16//!
17//! 1. **Convolution cache** — the last `conv_kernel` inputs to the depthwise
18//!    Conv1d, kept so that every decoding step can apply the causal filter
19//!    without re-processing previous tokens.
20//!
21//! 2. **SSM hidden state** — the matrix `hₜ ∈ ℝ^{per_head_dim×state_rank}` (per head), which
22//!    compresses the entire past context into a fixed-size representation
23//!    regardless of how many tokens have been generated.  This is the key
24//!    memory-efficiency advantage of SSMs over attention: the KV-cache of a
25//!    Transformer grows as O(sequence·state_rank) with sequence length, whereas the SSM state
26//!    is always O(per_head_dim·state_rank).
27
28use crate::mamba2::prelude::*;
29use crate::modules::sanity as san;
30use burn::module::Module;
31use burn::prelude::*;
32
33// ---------------------------------------------------------------------------
34// Mamba2Caches  (one cache entry per layer)
35// ---------------------------------------------------------------------------
36
37/// A collection of per-layer caches for a complete Mamba-2 network.
38///
39/// During autoregressive decoding, a [`Mamba2Caches`] instance is threaded
40/// through every layer-stack `step` call (the family-generic
41/// [`crate::generic::Layers`]).  Each element of `caches` corresponds to one
42/// (virtual) layer in the network.
43#[derive(Module, Debug)]
44pub struct Mamba2Caches {
45    /// Per-layer caches.
46    ///
47    /// Length: `n_real_caches` (the number of *virtual* layers, which may
48    /// exceed the number of *real* weight layers when weight-sharing / layer
49    /// scheduling is in use).
50    pub caches: Vec<Mamba2Cache>,
51}
52
53/// Configuration / factory for [`Mamba2Caches`].
54#[derive(Config, Debug)]
55pub struct Mamba2CachesConfig {
56    /// Number of cache slots.  Equals the number of virtual layers in the
57    /// network (one cache per layer, even when layers share weights).
58    pub n_real_caches: usize,
59
60    /// Shared configuration that determines the shape of each individual
61    /// cache tensor.
62    pub cache: Mamba2CacheConfig,
63}
64
65impl Mamba2CachesConfig {
66    /// Convenience constructor that derives cache shapes directly from a
67    /// [`Mamba2Config`] block configuration.
68    pub fn new_from_block_config(
69        n_real_caches: usize,
70        batch: usize,
71        block_config: Mamba2Config,
72    ) -> Self {
73        Self {
74            n_real_caches,
75            cache: Mamba2CacheConfig::new_from_block_config(batch, block_config),
76        }
77    }
78
79    /// Allocate all cache tensors (zero-initialised) on `device`.
80    pub fn init(&self, device: &Device) -> Mamba2Caches {
81        let caches = (0..self.n_real_caches)
82            .map(|_| self.cache.clone().init(device))
83            .collect();
84        Mamba2Caches { caches }
85    }
86}
87
88// ---------------------------------------------------------------------------
89// Mamba2Cache  (state for a single layer)
90// ---------------------------------------------------------------------------
91
92/// The mutable state carried between decoding steps for a **single** Mamba-2
93/// layer.
94///
95/// Both tensors are updated in-place (via Burn's functional clone) at every
96/// call to [`Mamba2::step`].
97#[derive(Module, Debug)]
98pub struct Mamba2Cache {
99    /// **Convolution rolling window.**
100    ///
101    /// Stores the last `conv_kernel` pre-activation feature vectors fed into
102    /// the depthwise Conv1d.  At each step, the oldest column is discarded and
103    /// the new token's projection is appended (a left-shift followed by an
104    /// insert into the rightmost column), maintaining strict causality.
105    ///
106    /// Shape: `[batch, conv_dim, conv_kernel]`
107    ///   - `conv_dim  = d_inner + 2 · ngroups · state_rank`
108    ///   - `conv_kernel` is typically 4
109    pub conv_bvk: Tensor<3>,
110
111    /// **SSM hidden state** `hₜ`.
112    ///
113    /// This is the O(per_head_dim·state_rank) compressed summary of all tokens seen so far.
114    /// Updated via `hₜ = Āₜ hₜ₋₁ + B̄ₜ xₜ` at each decoding step.
115    ///
116    /// The tensor is indexed as `[batch, nheads, per_head_dim, state_rank]`
117    /// (i.e. `[batch, nheads, per_head_dim, state_rank]` in the paper's notation), which is the transpose
118    /// of the mathematical `hₜ ∈ ℝ^{state_rank×per_head_dim}` but equivalent in content.
119    ///
120    /// Shape: `[batch, nheads, per_head_dim, state_rank]`
121    pub ssm_bhpr: Tensor<4>,
122}
123
124impl Mamba2Cache {
125    /// Run the [`NaN`/`Inf` guards](crate::utils::sanity) on every cached tensor.
126    pub fn sanity(&self) {
127        san(&self.conv_bvk);
128        san(&self.ssm_bhpr);
129    }
130}
131
132/// Configuration / factory for a single [`Mamba2Cache`].
133#[derive(Config, Debug)]
134pub struct Mamba2CacheConfig {
135    /// Batch size.
136    pub batch: usize,
137
138    /// `state_rank` — the number of latent dimensions in the SSM hidden
139    /// state.  Corresponds to `state_rank` in [`Mamba2Config`].
140    #[config(default = 128)]
141    pub state_rank: usize,
142
143    /// Causal convolution window length.  Corresponds to `conv_kernel` in
144    /// [`Mamba2Config`].
145    #[config(default = 4)]
146    pub conv_kernel: usize,
147
148    /// Number of channels entering (and leaving) the depthwise convolution.
149    /// Equal to `d_inner + 2 · ngroups · state_rank`.
150    pub conv_dim: usize,
151
152    /// Head dimension `per_head_dim`.  Corresponds to `per_head_dim` in [`Mamba2Config`].
153    #[config(default = 64)]
154    pub per_head_dim: usize,
155
156    /// Number of SSM heads `nheads`.
157    pub nheads: usize,
158}
159
160impl Mamba2CacheConfig {
161    /// Derive cache shapes from a Mamba-2 block configuration plus a batch
162    /// size.
163    pub fn new_from_block_config(batch: usize, block_config: Mamba2Config) -> Self {
164        Self {
165            batch,
166            state_rank: block_config.state_rank,
167            conv_kernel: block_config.conv_kernel,
168            conv_dim: block_config.conv_dim(),
169            per_head_dim: block_config.per_head_dim,
170            nheads: block_config.nheads(),
171        }
172    }
173
174    /// Allocate zero-initialised cache tensors on `device`.
175    ///
176    /// Zero initialisation is correct because:
177    /// - The convolution cache represents "no previous tokens" (identity padding).
178    /// - The SSM state represents `h₀ = 0` (zero initial condition), which is
179    ///   the standard default.  Learnable initial state (if configured) are
180    ///   added on top of this inside [`Mamba2::forward`] /
181    ///   [`Mamba2::step`].
182    pub fn init(&self, device: &Device) -> Mamba2Cache {
183        let conv_bvk = Tensor::zeros(
184            Shape::new([self.batch, self.conv_dim, self.conv_kernel]),
185            device,
186        );
187        let ssm_bhpr = Tensor::zeros(
188            Shape::new([self.batch, self.nheads, self.per_head_dim, self.state_rank]),
189            device,
190        );
191        Mamba2Cache { conv_bvk, ssm_bhpr }
192    }
193}
194
195impl Mamba2Caches {
196    /// Number of per-layer caches.
197    pub fn caches_len(&self) -> usize {
198        self.caches.len()
199    }
200
201    /// Wrap a vector of per-layer caches.
202    pub fn from_vec(vec: Vec<Mamba2Cache>) -> Self {
203        Self { caches: vec }
204    }
205
206    /// Wrap each per-layer cache in `Some` so the layer loop can `take` it
207    /// without cloning (Burn tensors are reference-counted).
208    pub fn into_options(self) -> Vec<Option<Mamba2Cache>> {
209        self.caches.into_iter().map(Some).collect()
210    }
211
212    /// Inverse of [`Self::into_options`]: unwrap each slot and re-bundle.
213    pub fn from_options(options: Vec<Option<Mamba2Cache>>) -> Self {
214        let caches = options.into_iter().map(Option::unwrap).collect();
215        Self::from_vec(caches)
216    }
217}
burn_mamba/mamba2/cache.rs

burn_mamba/mamba2/
cache.rs