Implementation:Deepspeedai DeepSpeed Normalize Layer

Knowledge Sources	DeepSpeed
Domains	Normalization, Transformer, Training, CUDA_Kernels
Last Updated	2026-02-09 00:00 GMT

Overview

C++ wrapper class for fused layer normalization operations supporting both training and inference with optional mean/variance saving for backward passes.

Description

The Normalize_Layer template class provides a high-level interface to DeepSpeed's optimized layer normalization CUDA kernels. It manages state for training (variance and optional mean storage), supports both standard and checkpoint-friendly forward passes, and orchestrates fused backward computations including residual gradient addition. The implementation offers configuration flexibility through its Config structure controlling batch size, sequence length, hidden dimension, epsilon for numerical stability, training mode, and whether to use mean-based or variance-only formulations. The class distinguishes between ForwardCheckpoint (saves statistics for exact backward) and Forward (recomputes if needed) modes, and provides separate Backward functions for different gradient flow scenarios including fused addition of multiple gradient streams.

Usage

Use this class when implementing transformer layers that require efficient layer normalization. The checkpoint mode is valuable for memory-constrained training where some recomputation is acceptable, while the invertible normalization option enables reversible architectures that can reconstruct activations during backward passes.

Code Reference

Source Location

Repository: DeepSpeed
File: csrc/includes/normalize_layer.h

Signature

template <typename T>
class Normalize_Layer {
public:
    struct Config {
        uint32_t batchSize, seqLength, hiddenDim;
        float epsilon;
        bool training, useMean;
        Config(uint32_t batch, uint32_t seq, uint32_t h,
               float epsilon = 1e-12, bool training = true,
               bool useMean = true);
    };

    Normalize_Layer(Config config);

    // Forward with statistics saving
    void ForwardCheckpoint(int bsz, T* vals, const T* residual,
                          const T* gamma, const T* betta,
                          cudaStream_t& stream, bool preLayerNorm = false);

    // Forward without saving (may recompute)
    void Forward(int bsz, T* vals, const T* residual,
                const T* gamma, const T* betta,
                cudaStream_t& stream, bool preLayerNorm = false);

    // Backward pass
    void Backward(int bsz, const T* out_grad, const T* gamma,
                 T* gamma_grad, T* betta_grad, cudaStream_t stream[2],
                 T* inp_grad_out, const T* norm_in = nullptr);

    // Backward with fused gradient addition
    void BackwardFusedAdd(int bsz, const T* out_grad1, const T* out_grad2,
                         const T* gamma, T* gamma_grad, T* betta_grad,
                         cudaStream_t stream[2], T* inp_grad_out,
                         const T* norm_in = nullptr);

    inline bool UseMean() const;
    inline void SetVar(T* variance);
    inline void SetMean(T* mean);
};

Import

#include "csrc/includes/normalize_layer.h"

I/O Contract

Input	Type	Description
vals	T*	Input activations (also output buffer)
residual	const T*	Residual connection to add
gamma	const T*	Scale parameters [hidden_dim]
betta	const T*	Bias parameters [hidden_dim]
bsz	int	Batch size × sequence length
stream	cudaStream_t	CUDA stream for execution
preLayerNorm	bool	If true, apply norm before residual add

Output	Type	Description
vals	T*	Normalized output (in-place)
gamma_grad	T*	Gradient w.r.t. gamma (backward only)
betta_grad	T*	Gradient w.r.t. beta (backward only)
inp_grad_out	T*	Gradient w.r.t. input (backward only)

Usage Examples

Basic Layer Normalization:

#include "normalize_layer.h"

// Setup
int batch = 32, seq_len = 512, hidden = 768;
float epsilon = 1e-5;

Normalize_Layer<__half>::Config config(
    batch, seq_len, hidden, epsilon, true, true);
Normalize_Layer<__half> layer_norm(config);

// Allocate buffers
__half *input, *residual, *gamma, *beta, *output;
__half *variance, *mean;
// ... allocate ...

layer_norm.SetVar(variance);
layer_norm.SetMean(mean);

// Forward pass
cudaStream_t stream;
cudaStreamCreate(&stream);

layer_norm.ForwardCheckpoint(
    batch * seq_len, input, residual, gamma, beta, stream, false);

Pre-Layer Norm Configuration:

// Pre-LN: Normalize before sublayer (GPT-style)
class PreLNTransformerBlock {
    Normalize_Layer<__half> attn_norm;
    Normalize_Layer<__half> ffn_norm;

public:
    PreLNTransformerBlock(int batch, int seq, int hidden)
        : attn_norm({batch, seq, hidden, 1e-5, true, true}),
          ffn_norm({batch, seq, hidden, 1e-5, true, true}) {
    }

    void forward(__half* x, __half* attn_out, __half* ffn_out,
                __half* gamma1, __half* beta1,
                __half* gamma2, __half* beta2,
                cudaStream_t stream) {
        int bsz = batch * seq;

        // Pre-LN for attention
        attn_norm.ForwardCheckpoint(bsz, x, x, gamma1, beta1, stream, true);
        // ... attention computation into attn_out ...

        // Residual connection
        add_residual<<<...>>>(attn_out, x, bsz * hidden);

        // Pre-LN for FFN
        ffn_norm.ForwardCheckpoint(bsz, attn_out, attn_out,
                                   gamma2, beta2, stream, true);
        // ... FFN computation into ffn_out ...

        // Final residual
        add_residual<<<...>>>(ffn_out, attn_out, bsz * hidden);
    }
};

Memory-Efficient Checkpointing:

// Use Forward (no checkpoint) for memory-constrained scenarios
Normalize_Layer<__half>::Config config(32, 512, 768, 1e-5, true, false);
Normalize_Layer<__half> norm(config);  // useMean=false saves memory

// Forward without saving mean (only variance)
__half *x, *residual, *gamma, *beta;
norm.Forward(batch * seq, x, residual, gamma, beta, stream, false);

// Backward will recompute mean from saved variance
__half *grad_out, *grad_in, *grad_gamma, *grad_beta;
cudaStream_t streams[2];
norm.Backward(batch * seq, grad_out, gamma, grad_gamma, grad_beta,
             streams, grad_in, x);  // Pass input for recomputation

Fused Backward with Multiple Gradients:

// When gradients flow from multiple paths (e.g., attention + residual)
void backward_with_multiple_grads(
    Normalize_Layer<__half>& norm,
    const __half* grad_from_next_layer,
    const __half* grad_from_skip_connection,
    const __half* gamma,
    __half* gamma_grad, __half* beta_grad,
    __half* input_grad,
    const __half* normalized_input,
    int batch_seq_size) {

    cudaStream_t streams[2];
    cudaStreamCreate(&streams[0]);
    cudaStreamCreate(&streams[1]);

    // Fuses: grad_total = grad_from_next + grad_from_skip
    // Then computes norm backward
    norm.BackwardFusedAdd(
        batch_seq_size,
        grad_from_next_layer,
        grad_from_skip_connection,
        gamma,
        gamma_grad, beta_grad,
        streams,
        input_grad,
        normalized_input);

    cudaStreamSynchronize(streams[0]);
    cudaStreamSynchronize(streams[1]);
}

Inference Mode (No Statistics):

// For inference, no need to save statistics
Normalize_Layer<__half>::Config infer_config(
    1, 512, 768, 1e-5, false, false);  // training=false
Normalize_Layer<__half> infer_norm(infer_config);

// Forward only (no backward support needed)
infer_norm.Forward(seq_len, activations, nullptr,
                   gamma, beta, stream, false);

Post-Layer Norm (BERT-style):

// Post-LN: Normalize after residual addition
void post_ln_sublayer(__half* x, __half* sublayer_out,
                      __half* gamma, __half* beta,
                      Normalize_Layer<__half>& norm,
                      cudaStream_t stream, int bsz) {
    // Sublayer computes into sublayer_out
    // ... sublayer computation ...

    // Add residual and normalize
    // norm.Forward handles: x = norm(x + sublayer_out)
    norm.ForwardCheckpoint(bsz, sublayer_out, x, gamma, beta,
                          stream, false);  // preLayerNorm=false
}

Related Pages

Custom CUDA Layers - Underlying kernel implementations
Transformer CUDA - Uses normalization in transformer layers
Reduction Utils - Reduction operations for mean/variance

Page Connections

Double-click a node to navigate. Hold to expand connections.

Principle

Implementation

Heuristic

Environment