Implementation:Deepspeedai DeepSpeed Normalize Layer
| Knowledge Sources | |
|---|---|
| Domains | Normalization, Transformer, Training, CUDA_Kernels |
| Last Updated | 2026-02-09 00:00 GMT |
Overview
C++ wrapper class for fused layer normalization operations supporting both training and inference with optional mean/variance saving for backward passes.
Description
The Normalize_Layer template class provides a high-level interface to DeepSpeed's optimized layer normalization CUDA kernels. It manages state for training (variance and optional mean storage), supports both standard and checkpoint-friendly forward passes, and orchestrates fused backward computations including residual gradient addition. The implementation offers configuration flexibility through its Config structure controlling batch size, sequence length, hidden dimension, epsilon for numerical stability, training mode, and whether to use mean-based or variance-only formulations. The class distinguishes between ForwardCheckpoint (saves statistics for exact backward) and Forward (recomputes if needed) modes, and provides separate Backward functions for different gradient flow scenarios including fused addition of multiple gradient streams.
Usage
Use this class when implementing transformer layers that require efficient layer normalization. The checkpoint mode is valuable for memory-constrained training where some recomputation is acceptable, while the invertible normalization option enables reversible architectures that can reconstruct activations during backward passes.
Code Reference
Source Location
- Repository: DeepSpeed
- File: csrc/includes/normalize_layer.h
Signature
template <typename T>
class Normalize_Layer {
public:
struct Config {
uint32_t batchSize, seqLength, hiddenDim;
float epsilon;
bool training, useMean;
Config(uint32_t batch, uint32_t seq, uint32_t h,
float epsilon = 1e-12, bool training = true,
bool useMean = true);
};
Normalize_Layer(Config config);
// Forward with statistics saving
void ForwardCheckpoint(int bsz, T* vals, const T* residual,
const T* gamma, const T* betta,
cudaStream_t& stream, bool preLayerNorm = false);
// Forward without saving (may recompute)
void Forward(int bsz, T* vals, const T* residual,
const T* gamma, const T* betta,
cudaStream_t& stream, bool preLayerNorm = false);
// Backward pass
void Backward(int bsz, const T* out_grad, const T* gamma,
T* gamma_grad, T* betta_grad, cudaStream_t stream[2],
T* inp_grad_out, const T* norm_in = nullptr);
// Backward with fused gradient addition
void BackwardFusedAdd(int bsz, const T* out_grad1, const T* out_grad2,
const T* gamma, T* gamma_grad, T* betta_grad,
cudaStream_t stream[2], T* inp_grad_out,
const T* norm_in = nullptr);
inline bool UseMean() const;
inline void SetVar(T* variance);
inline void SetMean(T* mean);
};
Import
#include "csrc/includes/normalize_layer.h"
I/O Contract
| Input | Type | Description |
|---|---|---|
| vals | T* | Input activations (also output buffer) |
| residual | const T* | Residual connection to add |
| gamma | const T* | Scale parameters [hidden_dim] |
| betta | const T* | Bias parameters [hidden_dim] |
| bsz | int | Batch size × sequence length |
| stream | cudaStream_t | CUDA stream for execution |
| preLayerNorm | bool | If true, apply norm before residual add |
| Output | Type | Description |
|---|---|---|
| vals | T* | Normalized output (in-place) |
| gamma_grad | T* | Gradient w.r.t. gamma (backward only) |
| betta_grad | T* | Gradient w.r.t. beta (backward only) |
| inp_grad_out | T* | Gradient w.r.t. input (backward only) |
Usage Examples
Basic Layer Normalization:
#include "normalize_layer.h"
// Setup
int batch = 32, seq_len = 512, hidden = 768;
float epsilon = 1e-5;
Normalize_Layer<__half>::Config config(
batch, seq_len, hidden, epsilon, true, true);
Normalize_Layer<__half> layer_norm(config);
// Allocate buffers
__half *input, *residual, *gamma, *beta, *output;
__half *variance, *mean;
// ... allocate ...
layer_norm.SetVar(variance);
layer_norm.SetMean(mean);
// Forward pass
cudaStream_t stream;
cudaStreamCreate(&stream);
layer_norm.ForwardCheckpoint(
batch * seq_len, input, residual, gamma, beta, stream, false);
Pre-Layer Norm Configuration:
// Pre-LN: Normalize before sublayer (GPT-style)
class PreLNTransformerBlock {
Normalize_Layer<__half> attn_norm;
Normalize_Layer<__half> ffn_norm;
public:
PreLNTransformerBlock(int batch, int seq, int hidden)
: attn_norm({batch, seq, hidden, 1e-5, true, true}),
ffn_norm({batch, seq, hidden, 1e-5, true, true}) {
}
void forward(__half* x, __half* attn_out, __half* ffn_out,
__half* gamma1, __half* beta1,
__half* gamma2, __half* beta2,
cudaStream_t stream) {
int bsz = batch * seq;
// Pre-LN for attention
attn_norm.ForwardCheckpoint(bsz, x, x, gamma1, beta1, stream, true);
// ... attention computation into attn_out ...
// Residual connection
add_residual<<<...>>>(attn_out, x, bsz * hidden);
// Pre-LN for FFN
ffn_norm.ForwardCheckpoint(bsz, attn_out, attn_out,
gamma2, beta2, stream, true);
// ... FFN computation into ffn_out ...
// Final residual
add_residual<<<...>>>(ffn_out, attn_out, bsz * hidden);
}
};
Memory-Efficient Checkpointing:
// Use Forward (no checkpoint) for memory-constrained scenarios
Normalize_Layer<__half>::Config config(32, 512, 768, 1e-5, true, false);
Normalize_Layer<__half> norm(config); // useMean=false saves memory
// Forward without saving mean (only variance)
__half *x, *residual, *gamma, *beta;
norm.Forward(batch * seq, x, residual, gamma, beta, stream, false);
// Backward will recompute mean from saved variance
__half *grad_out, *grad_in, *grad_gamma, *grad_beta;
cudaStream_t streams[2];
norm.Backward(batch * seq, grad_out, gamma, grad_gamma, grad_beta,
streams, grad_in, x); // Pass input for recomputation
Fused Backward with Multiple Gradients:
// When gradients flow from multiple paths (e.g., attention + residual)
void backward_with_multiple_grads(
Normalize_Layer<__half>& norm,
const __half* grad_from_next_layer,
const __half* grad_from_skip_connection,
const __half* gamma,
__half* gamma_grad, __half* beta_grad,
__half* input_grad,
const __half* normalized_input,
int batch_seq_size) {
cudaStream_t streams[2];
cudaStreamCreate(&streams[0]);
cudaStreamCreate(&streams[1]);
// Fuses: grad_total = grad_from_next + grad_from_skip
// Then computes norm backward
norm.BackwardFusedAdd(
batch_seq_size,
grad_from_next_layer,
grad_from_skip_connection,
gamma,
gamma_grad, beta_grad,
streams,
input_grad,
normalized_input);
cudaStreamSynchronize(streams[0]);
cudaStreamSynchronize(streams[1]);
}
Inference Mode (No Statistics):
// For inference, no need to save statistics
Normalize_Layer<__half>::Config infer_config(
1, 512, 768, 1e-5, false, false); // training=false
Normalize_Layer<__half> infer_norm(infer_config);
// Forward only (no backward support needed)
infer_norm.Forward(seq_len, activations, nullptr,
gamma, beta, stream, false);
Post-Layer Norm (BERT-style):
// Post-LN: Normalize after residual addition
void post_ln_sublayer(__half* x, __half* sublayer_out,
__half* gamma, __half* beta,
Normalize_Layer<__half>& norm,
cudaStream_t stream, int bsz) {
// Sublayer computes into sublayer_out
// ... sublayer computation ...
// Add residual and normalize
// norm.Forward handles: x = norm(x + sublayer_out)
norm.ForwardCheckpoint(bsz, sublayer_out, x, gamma, beta,
stream, false); // preLayerNorm=false
}
Related Pages
- Custom CUDA Layers - Underlying kernel implementations
- Transformer CUDA - Uses normalization in transformer layers
- Reduction Utils - Reduction operations for mean/variance