burn_mamba/mamba2/ssd/serial_recalculated/
backward.rs

//! # Custom autodiff node for the Mamba-2 recompute backward
//!
//! Implements [`Mamba2BackendExt`] for `Autodiff<B, C>` by registering a single
//! Burn [`Backward`] node.  The forward stores only its (small) leaf inputs;
//! during backprop those are replayed through the K1–K5 kernels and the
//! analytic gradient math in [`combined_backward`], so the large intermediate
//! tensors never have to be kept alive — the ~⅓ training-memory saving of the
//! `SerialRecalculated` path.
//!
//! The two forward outputs (`y` and `final_state`) are flattened into one
//! tracked 1-D tensor (via [`crate::utils::combined_grad`]) so that a single
//! `Backward<B, 7>` node — with one parent per differentiable input, 7 in
//! total — covers both.
13
14#![allow(non_snake_case)]
15
16use crate::mamba2::ssd::serial_recalculated::{
17    Mamba2BackendExt,
18    combined_backward::{self, CombinedGrads},
19};
20use burn::backend::autodiff::{
21    Autodiff,
22    checkpoint::{base::Checkpointer, strategy::CheckpointStrategy},
23    grads::Gradients,
24    ops::{Backward, Ops, OpsKind},
25};
26use burn::backend::tensor::FloatTensor;
27use burn::backend::{Backend, BackendTypes};
28
29impl<B: Backend + Mamba2BackendExt, C: CheckpointStrategy> Mamba2BackendExt for Autodiff<B, C> {
30    /// Memory-efficient combined forward+backward.
31    ///
32    /// The two output tensors are concatenated into a single 1-dimensional tracked tensor
33    /// so that one `Backward<B, 7>` node covers both outputs.  The caller
34    /// receives split+reshaped slices of that combined tensor; burn's autodiff
35    /// accumulates their upstream gradients back into a single gradient vector
36    /// before firing this backward.
37    fn ssd_serial_recalculated(
38        x_bnlhp: FloatTensor<Self>,
39        dt_discretized_bhnl: FloatTensor<Self>,
40        b_bnlhr: FloatTensor<Self>,
41        c_bnlhr: FloatTensor<Self>,
42        d_h: FloatTensor<Self>,
43        initial_state_bhpr: FloatTensor<Self>,
44        a_decay_h: FloatTensor<Self>,
45    ) -> (FloatTensor<Self>, FloatTensor<Self>) {
46        // ── Backward struct ──────────────────────────────────────────────────
47        #[derive(Debug)]
48        struct CombinedKernelsBackward;
49
50        #[derive(Clone, Debug)]
51        struct State<B: Backend> {
52            x_bnlhp: <B as BackendTypes>::FloatTensorPrimitive,
53            dt_discretized_bhnl: <B as BackendTypes>::FloatTensorPrimitive,
54            b_bnlhr: <B as BackendTypes>::FloatTensorPrimitive,
55            c_bnlhr: <B as BackendTypes>::FloatTensorPrimitive,
56            d_h: <B as BackendTypes>::FloatTensorPrimitive,
57            initial_state_bhpr: <B as BackendTypes>::FloatTensorPrimitive,
58            a_decay_h: <B as BackendTypes>::FloatTensorPrimitive,
59            // flat byte-sizes for splitting the combined gradient vector
60            flat_len_y_BNLHP: usize,
61            flat_len_final_state_BHPR: usize,
62            // shapes needed to reconstruct tensors in the right ranks
63            shape_x_bnlhp: [usize; 5],
64            shape_dt_discretized_bhnl: [usize; 4],
65            shape_b_bnlhr: [usize; 5],
66            shape_c_bnlhr: [usize; 5],
67            shape_d_h: [usize; 1],
68            shape_initial_state_bhpr: [usize; 4],
69            shape_a_decay_h: [usize; 1],
70            shape_y_bnlhp: [usize; 5],          // (output 1)
71            shape_final_state_bhpr: [usize; 4], // (output 2)
72        }
73
74        /// State carried across the forward→backward boundary.
75        ///
76        /// Only the 7 original inputs are saved; all intermediates (cb, intra
77        /// state, chunk_input_state) are recomputed during `backward`.
78        #[allow(clippy::type_complexity)]
79        impl<B: Backend + Mamba2BackendExt> Backward<B, 7> for CombinedKernelsBackward {
80            type State = State<B>;
81
82            fn backward(
83                self,
84                ops: Ops<Self::State, 7>,
85                grads: &mut Gradients,
86                _checkpointer: &mut Checkpointer,
87            ) {
88                let [
89                    node_x_bnlhp,
90                    node_dt_discretized_bhnl,
91                    node_b_bnlhr,
92                    node_c_bnlhr,
93                    node_d_h,
94                    node_initial_state_bhpr,
95                    node_a_decay_h,
96                ] = ops.parents;
97
98                let d_combined: <B as BackendTypes>::FloatTensorPrimitive =
99                    grads.consume::<B>(&ops.node);
100
101                let State {
102                    x_bnlhp,
103                    dt_discretized_bhnl,
104                    b_bnlhr,
105                    c_bnlhr,
106                    d_h,
107                    initial_state_bhpr,
108                    a_decay_h,
109                    //
110                    flat_len_y_BNLHP,
111                    flat_len_final_state_BHPR,
112                    //
113                    shape_x_bnlhp,
114                    shape_dt_discretized_bhnl,
115                    shape_b_bnlhr,
116                    shape_c_bnlhr,
117                    shape_d_h,
118                    shape_initial_state_bhpr,
119                    shape_a_decay_h,
120                    //
121                    shape_y_bnlhp,
122                    shape_final_state_bhpr,
123                } = ops.state;
124
125                // ── Reconstruct saved tensors as rank-tagged primitives ────
126                use crate::utils::fprim::F;
127
128                let x_bnlhp = F::<B, 5>::new(x_bnlhp).reshape(shape_x_bnlhp);
129                let dt_discretized_bhnl =
130                    F::<B, 4>::new(dt_discretized_bhnl).reshape(shape_dt_discretized_bhnl);
131                let b_bnlhr = F::<B, 5>::new(b_bnlhr).reshape(shape_b_bnlhr);
132                let c_bnlhr = F::<B, 5>::new(c_bnlhr).reshape(shape_c_bnlhr);
133                let d_h = F::<B, 1>::new(d_h).reshape(shape_d_h);
134                let initial_state_bhpr =
135                    F::<B, 4>::new(initial_state_bhpr).reshape(shape_initial_state_bhpr);
136                let a_decay_h = F::<B, 1>::new(a_decay_h).reshape(shape_a_decay_h);
137
138                // ── Split incoming combined gradient ───────────────────────
139                let (d_y_bnlhp, d_final_state_bhpr) =
140                    crate::utils::combined_grad::unflatten_pair::<B, 5, 4>(
141                        d_combined,
142                        flat_len_y_BNLHP,
143                        flat_len_final_state_BHPR,
144                        shape_y_bnlhp,
145                        shape_final_state_bhpr,
146                    );
147
148                // ── Core gradient computation ──────────────────────────────
149                let CombinedGrads {
150                    d_x_bnlhp,
151                    d_dt_discretized_bhnl,
152                    d_b_bnlhr,
153                    d_c_bnlhr,
154                    d_d_h,
155                    d_initial_state_bhpr,
156                    d_a_decay_h,
157                    ..
158                } = combined_backward::combined_backward(
159                    F::<B, 5>::new(d_y_bnlhp),
160                    F::<B, 4>::new(d_final_state_bhpr),
161                    //
162                    x_bnlhp,
163                    dt_discretized_bhnl,
164                    b_bnlhr,
165                    c_bnlhr,
166                    d_h,
167                    initial_state_bhpr,
168                    a_decay_h,
169                );
170
171                // ── Register gradients ─────────────────────────────────────
172                if let Some(n) = node_x_bnlhp {
173                    grads.register::<B>(n.id, d_x_bnlhp.inner());
174                }
175                if let Some(n) = node_dt_discretized_bhnl {
176                    grads.register::<B>(n.id, d_dt_discretized_bhnl.inner());
177                }
178                if let Some(n) = node_b_bnlhr {
179                    grads.register::<B>(n.id, d_b_bnlhr.inner());
180                }
181                if let Some(n) = node_c_bnlhr {
182                    grads.register::<B>(n.id, d_c_bnlhr.inner());
183                }
184                if let Some(n) = node_d_h {
185                    grads.register::<B>(n.id, d_d_h.inner());
186                }
187                if let Some(n) = node_initial_state_bhpr {
188                    grads.register::<B>(n.id, d_initial_state_bhpr.inner());
189                }
190                if let Some(n) = node_a_decay_h {
191                    grads.register::<B>(n.id, d_a_decay_h.inner());
192                }
193            }
194        } // end impl Backward
195
196        // ── Shape extraction helpers ───────────────────────────────────────
197        // Accessed via the AutodiffTensor wrappers (which own both .node
198        // and .primitive).
199        use burn::backend::TensorMetadata;
200        let [batch, nchunks, chunk_len, nheads, per_head_dim] = x_bnlhp.primitive.shape().dims();
201        let [_, _, _, _nheads_b, state_rank] = b_bnlhr.primitive.shape().dims();
202
203        let flat_len_y_BNLHP = batch * nchunks * chunk_len * nheads * per_head_dim;
204        let flat_len_final_state_BHPR = batch * nheads * per_head_dim * state_rank;
205
206        let shape_x_bnlhp: [usize; 5] = [batch, nchunks, chunk_len, nheads, per_head_dim];
207        let shape_dt_discretized_bhnl: [usize; 4] = [batch, nheads, nchunks, chunk_len];
208        let shape_b_bnlhr: [usize; 5] = [batch, nchunks, chunk_len, nheads, state_rank];
209        let shape_c_bnlhr: [usize; 5] = [batch, nchunks, chunk_len, nheads, state_rank];
210        let shape_d_h: [usize; 1] = [nheads];
211        let shape_initial_state_bhpr: [usize; 4] = [batch, nheads, per_head_dim, state_rank];
212        let shape_a_decay_h: [usize; 1] = [nheads];
213        let shape_y_bnlhp: [usize; 5] = [batch, nchunks, chunk_len, nheads, per_head_dim];
214        let shape_final_state_bhpr: [usize; 4] = [batch, nheads, per_head_dim, state_rank];
215
216        // ── Register backward / run forward ───────────────────────────────
217        match CombinedKernelsBackward
218            .prepare::<C>([
219                x_bnlhp.node.clone(),
220                dt_discretized_bhnl.node.clone(),
221                b_bnlhr.node.clone(),
222                c_bnlhr.node.clone(),
223                d_h.node.clone(),
224                initial_state_bhpr.node.clone(),
225                a_decay_h.node.clone(),
226            ])
227            .compute_bound()
228            .stateful() // requires compute_bound
229        {
230            OpsKind::Tracked(prep) => {
231                // Run the inner (non-autodiff) forward pass.
232                let (prim_y_bnlhp, prim_final_state_bhpr) = B::ssd_serial_recalculated(
233                    x_bnlhp.primitive.clone(),
234                    dt_discretized_bhnl.primitive.clone(),
235                    b_bnlhr.primitive.clone(),
236                    c_bnlhr.primitive.clone(),
237                    d_h.primitive.clone(),
238                    initial_state_bhpr.primitive.clone(),
239                    a_decay_h.primitive.clone(),
240                );
241
242                // prep.finish takes a single tensor, so pack both outputs into a
243                // single 1-D tensor; one Backward node then covers both.
244                let (prim_combined, _, _) = crate::utils::combined_grad::flatten_pair::<B>(
245                    prim_y_bnlhp,
246                    prim_final_state_bhpr,
247                );
248
249                let state = State {
250                    x_bnlhp: x_bnlhp.primitive.clone(),
251                    dt_discretized_bhnl: dt_discretized_bhnl.primitive.clone(),
252                    b_bnlhr: b_bnlhr.primitive.clone(),
253                    c_bnlhr: c_bnlhr.primitive.clone(),
254                    d_h: d_h.primitive.clone(),
255                    initial_state_bhpr: initial_state_bhpr.primitive.clone(),
256                    a_decay_h: a_decay_h.primitive.clone(),
257                    //
258                    flat_len_y_BNLHP,
259                    flat_len_final_state_BHPR,
260                    //
261                    shape_x_bnlhp, shape_dt_discretized_bhnl, shape_b_bnlhr, shape_c_bnlhr, shape_d_h, shape_initial_state_bhpr, shape_a_decay_h,
262                    shape_y_bnlhp, shape_final_state_bhpr,
263                };
264                let tracked_combined: FloatTensor<Autodiff<B, C>> =
265                    prep.finish(state, prim_combined);
266
267                // Split the tracked combined tensor back into the two outputs.
268                // The narrow/reshape ops are thin autodiff pass-throughs whose
269                // backwards accumulate into the combined gradient vector that
270                // `backward` above consumes.
271                let (tracked_y_bnlhp, tracked_final_state_bhpr) =
272                    crate::utils::combined_grad::autodiff_unflatten_pair::<B, C, 5, 4>(
273                    tracked_combined,
274                    flat_len_y_BNLHP,
275                    flat_len_final_state_BHPR,
276                    shape_y_bnlhp,
277                    shape_final_state_bhpr,
278                );
279
280                (
281                    tracked_y_bnlhp,
282                    tracked_final_state_bhpr,
283                )
284            }
285
286            OpsKind::UnTracked(prep) => {
287                // No gradient tracking — just run the bare forward.
288                let (prim_y_bnlhp, prim_final_state_bhpr) = B::ssd_serial_recalculated(
289                    x_bnlhp.primitive,
290                    dt_discretized_bhnl.primitive,
291                    b_bnlhr.primitive,
292                    c_bnlhr.primitive,
293                    d_h.primitive,
294                    initial_state_bhpr.primitive,
295                    a_decay_h.primitive,
296                );
297
298                let (combined, _, _) = crate::utils::combined_grad::flatten_pair::<B>(
299                    prim_y_bnlhp,
300                    prim_final_state_bhpr,
301                );
302
303                let tracked_combined: FloatTensor<Autodiff<B, C>> =
304                    prep.finish(combined);
305
306                let (tracked_y_bnlhp, tracked_final_state_bhpr) = crate::utils::combined_grad::autodiff_unflatten_pair::<B, C, 5, 4>(
307                    tracked_combined,
308                    flat_len_y_BNLHP,
309                    flat_len_final_state_BHPR,
310                    shape_y_bnlhp,
311                    shape_final_state_bhpr,
312                );
313
314                (
315                    tracked_y_bnlhp,
316                    tracked_final_state_bhpr,
317                )
318            }
319        } // end match
320    } // end fn ssd_serial_recalculated on Autodiff<B, C>
321} // end impl Mamba2BackendExt for Autodiff<B, C>
burn_mamba/mamba2/ssd/serial_recalculated/backward.rs

burn_mamba/mamba2/ssd/serial_recalculated/
backward.rs