Struct Mamba3

Source

pub struct Mamba3<B: Backend> {
    pub in_proj: Linear<B>,
    pub dt_bias_h: Param<Tensor<B, 1>>,
    pub dt_limit: (f64, f64),
    pub a_floor: f64,
    pub d_h: Param<Tensor<B, 1>>,
    pub b_norm: RmsNorm<B>,
    pub c_norm: RmsNorm<B>,
    pub b_bias_hrn: Param<Tensor<B, 3>>,
    pub c_bias_hrn: Param<Tensor<B, 3>>,
    pub mimo_x: Option<Param<Tensor<B, 3>>>,
    pub mimo_z: Option<Param<Tensor<B, 3>>>,
    pub mimo_o: Option<Param<Tensor<B, 3>>>,
    pub out_norm: Option<RmsNormGated<B>>,
    pub out_proj: Linear<B>,
    pub init_state_hpr: Option<Param<Tensor<B, 3>>>,
    pub state_rank: usize,
    pub ngroups: usize,
    pub num_rope_angles: usize,
    pub rope_dim: usize,
    pub mimo_rank: usize,
}

Expand description

The Mamba-3 SSM block.

Implements the full Mamba-3 layer with exponential-trapezoidal discretisation and data-dependent RoPE. Supports both SISO (mimo_rank = 1) and MIMO (mimo_rank > 1) operation, and provides two execution modes:

- Self::forward — chunkwise two-SSD algorithm for training / prefill
- Self::step — recurrent form for token-by-token decoding

Fields§

§in_proj: Linear

Input projection.

For SISO (R=1): maps d_model → 2·d_inner + 2·ngroups·state_rank + 3·nheads + num_rope_angles. For MIMO (R>1): maps d_model → 2·d_inner + 2·ngroups·state_rank·R + 3·nheads + num_rope_angles.

§dt_bias_h: Param<Tensor<B, 1>>

Per-head bias for the discretisation step size Δ. Shape: [nheads]

§dt_limit: (f64, f64)

Hard clamp applied to Δ after softplus.

§a_floor: f64

Minimum absolute value of A: A ∈ (−∞, −a_floor].

§d_h: Param<Tensor<B, 1>>

Per-head skip (D) coefficient. Shape: [nheads]; initialised to ones.

§b_norm: RmsNorm

RMSNorm applied to the B projection (QK-Norm, no gating). Normalises over the state_rank dimension.

§c_norm: RmsNorm

RMSNorm applied to the C projection (QK-Norm, no gating). Normalises over the state_rank dimension.

§b_bias_hrn: Param<Tensor<B, 3>>

Learnable per-head, per-rank bias for B, added after QK-norm. Shape: [nheads, mimo_rank, state_rank]; initialised to ones.

For SISO (mimo_rank=1) this has shape [nheads, 1, state_rank].

§c_bias_hrn: Param<Tensor<B, 3>>

Learnable per-head, per-rank bias for C, added after QK-norm. Shape: [nheads, mimo_rank, state_rank]; initialised to ones.

§mimo_x: Option<Param<Tensor<B, 3>>>

MIMO up-projection for x (values). Shape: [nheads, mimo_rank, per_head_dim]. Only present when mimo_rank > 1. When SISO, this is None.

§mimo_z: Option<Param<Tensor<B, 3>>>

MIMO up-projection for z (gate). Shape: [nheads, mimo_rank, per_head_dim]. Only present when mimo_rank > 1.

§mimo_o: Option<Param<Tensor<B, 3>>>

MIMO down-projection for the output. Shape: [nheads, mimo_rank, per_head_dim]. Only present when mimo_rank > 1.

§out_norm: Option<RmsNormGated>

Optional gated RMSNorm applied before the output projection.

When Some, the SiLU gate at the block tail is replaced by RmsNormGated(y, z), which normalises y over per_head_dim and gates with SiLU(z). Created when has_outproj_norm = true.

§out_proj: Linear

Output projection: maps d_inner → d_model.

§init_state_hpr: Option<Param<Tensor<B, 3>>>

Optional learnable initial hidden state h₀. Shape: [nheads, per_head_dim, state_rank]

§state_rank: usize

State rank N.

§ngroups: usize

Number of B/C groups G. Must divide nheads.

§num_rope_angles: usize

Number of RoPE angle pairs (rope_dim / 2).

§rope_dim: usize

Effective RoPE dimension (= 2 · num_rope_angles). Always even and ≤ state_rank. Only the first rope_dim entries of B/C are rotated.

§mimo_rank: usize

MIMO rank R. 1 = SISO (standard Mamba-3).

Struct Mamba3

Fields§

Implementations§

impl<B: Backend> Mamba3<B>

pub fn step( &self, input_bm: Tensor<B, 2>, cache: Option<Mamba3Cache<B>>, ) -> (Tensor<B, 2>, Mamba3Cache<B>)

§Shapes

impl<B: Backend> Mamba3<B>

pub fn d_inner(&self) -> usize

pub fn nheads(&self) -> usize

pub fn per_head_dim(&self) -> usize

impl<B: Backend + Mamba3BackendExt> Mamba3<B>

pub fn forward( &self, input_bsm: Tensor<B, 3>, cache: Option<Mamba3Cache<B>>, ssd_path: Mamba3SsdPath, ) -> (Tensor<B, 3>, Mamba3Cache<B>)

§Shapes

impl<B: Backend> Mamba3<B>

pub fn ssd_minimal(input: Mamba3SsdInput<B>) -> (Tensor<B, 6>, Tensor<B, 4>)

§Shapes

impl<B: Backend> Mamba3<B>

pub fn ssd_serial(input: Mamba3SsdInput<B>) -> (Tensor<B, 6>, Tensor<B, 4>)

§Returns

impl<B: Backend + Mamba3BackendExt> Mamba3<B>

pub fn ssd_serial_recalculated( input: Mamba3SsdInput<B>, ) -> (Tensor<B, 6>, Tensor<B, 4>)

§Returns

Trait Implementations§

impl<B> AutodiffModule<B> for Mamba3<B> where B: AutodiffBackend + Backend, <B as AutodiffBackend>::InnerBackend: Backend,

type InnerModule = Mamba3<<B as AutodiffBackend>::InnerBackend>

fn valid(&self) -> Self::InnerModule

fn from_inner(module: Self::InnerModule) -> Self

impl<B: Backend> Clone for Mamba3<B>

fn clone(&self) -> Self

fn clone_from(&mut self, source: &Self)

impl<B: Debug + Backend> Debug for Mamba3<B>

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl<B: Backend> Display for Mamba3<B>

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl<B> HasAutodiffModule<B> for Mamba3<B::InnerBackend> where B: AutodiffBackend + Backend, <B as AutodiffBackend>::InnerBackend: Backend,

type TrainModule = Mamba3<B>

impl<B: Backend> Module<B> for Mamba3<B>

type Record = Mamba3Record<B>

fn load_record(self, record: Self::Record) -> Self

fn into_record(self) -> Self::Record

fn num_params(&self) -> usize

fn visit<Visitor: ModuleVisitor<B>>(&self, visitor: &mut Visitor)

fn map<Mapper: ModuleMapper<B>>(self, mapper: &mut Mapper) -> Self

fn collect_devices(&self, devices: Devices<B>) -> Devices<B>

fn to_device(self, device: &B::Device) -> Self

fn fork(self, device: &B::Device) -> Self

fn devices(&self) -> Vec<<B as BackendTypes>::Device>

fn no_grad(self) -> Self

fn train<AB>(self) -> Self::TrainModule where AB: AutodiffBackend<InnerBackend = B>, Self: HasAutodiffModule<AB>,

fn quantize_weights(self, quantizer: &mut Quantizer) -> Self

impl<B: Backend> ModuleDisplay for Mamba3<B>

fn format(&self, passed_settings: DisplaySettings) -> String

fn custom_settings(&self) -> Option<DisplaySettings>

fn custom_content(&self, _content: Content) -> Option<Content>

impl<B: Backend> ModuleDisplayDefault for Mamba3<B>

fn content(&self, content: Content) -> Option<Content>

fn num_params(&self) -> usize

Auto Trait Implementations§

impl<B> !Freeze for Mamba3<B>

impl<B> !RefUnwindSafe for Mamba3<B>

impl<B> Send for Mamba3<B>

impl<B> Sync for Mamba3<B>

impl<B> Unpin for Mamba3<B> where <B as BackendTypes>::Device: Unpin, <B as BackendTypes>::FloatTensorPrimitive: Unpin, <B as BackendTypes>::QuantizedTensorPrimitive: Unpin,

impl<B> UnsafeUnpin for Mamba3<B> where <B as BackendTypes>::Device: UnsafeUnpin, <B as BackendTypes>::FloatTensorPrimitive: UnsafeUnpin, <B as BackendTypes>::QuantizedTensorPrimitive: UnsafeUnpin,

impl<B> !UnwindSafe for Mamba3<B>

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> CloneToUninit for T where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> ToOwned for T where T: Clone,

type Owned = T

Struct Mamba3

impl<B> AutodiffModule<B> for Mamba3<B>
where B: AutodiffBackend + Backend, <B as AutodiffBackend>::InnerBackend: Backend,

impl<B> HasAutodiffModule<B> for Mamba3<B::InnerBackend>
where B: AutodiffBackend + Backend, <B as AutodiffBackend>::InnerBackend: Backend,

fn train<AB>(self) -> Self::TrainModule
where AB: AutodiffBackend<InnerBackend = B>, Self: HasAutodiffModule<AB>,

impl<B> Unpin for Mamba3<B>
where <B as BackendTypes>::Device: Unpin, <B as BackendTypes>::FloatTensorPrimitive: Unpin, <B as BackendTypes>::QuantizedTensorPrimitive: Unpin,

impl<B> UnsafeUnpin for Mamba3<B>
where <B as BackendTypes>::Device: UnsafeUnpin, <B as BackendTypes>::FloatTensorPrimitive: UnsafeUnpin, <B as BackendTypes>::QuantizedTensorPrimitive: UnsafeUnpin,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T> CloneToUninit for T
where T: Clone,

impl<T, U> Into<U> for T
where U: From<T>,

impl<T> ToOwned for T
where T: Clone,

impl<T> ToString for T
where T: Display + ?Sized,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,