Implementation:Deepspeedai DeepSpeed Reduction Utils

Knowledge Sources	DeepSpeed
Domains	CUDA_Kernels, Parallel_Algorithms, Performance_Optimization
Last Updated	2026-02-09 00:00 GMT

Overview

Templated CUDA reduction primitives providing warp-level and block-level reductions for sum, min, and max operations across multiple data types.

Description

This header implements a comprehensive reduction framework using cooperative groups for efficient parallel reductions in CUDA kernels. It provides three levels of reduction abstractions: element-wise operations for serial loops, warp-level reductions using shuffle intrinsics, and block-level reductions with shared memory synchronization. The template-based design supports multiple reduction operations (Add, Max, Min) identified by the ROpType enum, and works across various data types including float, double, __half, __half2, __nv_bfloat16, and integer types. The framework includes specialized block reduction functions that can process up to 4 independent reductions simultaneously, and partitioned_block reductions for scenarios where a thread block is divided into multiple independent reduction groups. All implementations optimize for warp-level parallelism and minimize shared memory synchronization overhead.

Usage

Use these utilities whenever implementing kernels that require parallel aggregation operations like computing statistics (mean, variance), finding extrema, or accumulating values across threads. The API automatically handles the complexity of warp shuffles and shared memory coordination.

Code Reference

Source Location

Repository: DeepSpeed
File: csrc/includes/reduction_utils.h

Signature

namespace reduce {
    enum class ROpType { Add, Max, Min };

    // Element-wise reduction (for serial loops)
    template <ROpType Op, typename T>
    DS_D_INLINE T element(const T lhs, const T rhs);

    // Initialize accumulator
    template <ROpType OType, typename T = float>
    DS_D_INLINE T init();

    // Full block reduction (single value)
    template <ROpType Op, int warp_bound = max_warps>
    DS_D_INLINE void block(cg::thread_block& tb,
                          cg::thread_block_tile<hw_warp_size>& warp,
                          float& val);

    // Multiple simultaneous reductions
    template <ROpType Op1, ROpType Op2, int warp_bound = max_warps>
    DS_D_INLINE void block(cg::thread_block& tb,
                          cg::thread_block_tile<hw_warp_size>& warp,
                          float& val1, float& val2);

    // Partitioned block reduction (multiple independent groups)
    template <ROpType Op, int num_threads>
    DS_D_INLINE void partitioned_block(cg::thread_block& tb,
                                      cg::thread_block_tile<hw_warp_size>& warp,
                                      float& val);
}

Import

#include "csrc/includes/reduction_utils.h"

I/O Contract

Input	Type	Description
val	float&	Value to reduce (input/output parameter)
Op	ROpType (template)	Reduction operation: Add, Max, or Min
tb	cg::thread_block&	Thread block cooperative group
warp	cg::thread_block_tile<32>&	Warp-level tile
num_threads	int (template)	Threads per partition (for partitioned)

Output	Type	Description
val	float&	Reduced value (same in all threads after reduction)

Usage Examples

Block-Wide Sum Reduction:

__global__ void sum_array(const float* input, float* output, int n) {
    cg::thread_block tb = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(tb);

    float thread_sum = 0.0f;
    for (int i = tb.thread_rank(); i < n; i += tb.size()) {
        thread_sum += input[i];
    }

    // Reduce across all threads in block
    reduce::block<reduce::ROpType::Add>(tb, warp, thread_sum);

    if (tb.thread_rank() == 0) {
        output[tb.group_index().x] = thread_sum;
    }
}

Computing Mean and Variance:

__global__ void compute_statistics(const __half* data, float* mean, float* var, int n) {
    cg::thread_block tb = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(tb);

    float sum = 0.0f, sum_sq = 0.0f;

    for (int i = tb.thread_rank(); i < n; i += tb.size()) {
        float val = __half2float(data[i]);
        sum += val;
        sum_sq += val * val;
    }

    // Simultaneous reduction of both sums
    reduce::block<reduce::ROpType::Add, reduce::ROpType::Add>(
        tb, warp, sum, sum_sq);

    if (tb.thread_rank() == 0) {
        *mean = sum / n;
        *var = (sum_sq / n) - (*mean * *mean);
    }
}

Finding Min/Max for Normalization:

__global__ void normalize_kernel(__half* data, int n) {
    cg::thread_block tb = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(tb);

    // Initialize to sentinel values
    float local_min = reduce::init<reduce::ROpType::Min, float>();
    float local_max = reduce::init<reduce::ROpType::Max, float>();

    // Find local extrema
    for (int i = tb.thread_rank(); i < n; i += tb.size()) {
        float val = __half2float(data[i]);
        local_min = reduce::element<reduce::ROpType::Min>(local_min, val);
        local_max = reduce::element<reduce::ROpType::Max>(local_max, val);
    }

    // Global min/max across block
    reduce::block<reduce::ROpType::Min, reduce::ROpType::Max>(
        tb, warp, local_min, local_max);

    // Normalize in [0, 1]
    float range = local_max - local_min;
    for (int i = tb.thread_rank(); i < n; i += tb.size()) {
        float val = __half2float(data[i]);
        val = (val - local_min) / range;
        data[i] = __float2half(val);
    }
}

Partitioned Reduction for Multi-Head Attention:

__global__ void attention_softmax_partitioned(__half* scores,
                                             int batch, int heads,
                                             int seq_len) {
    cg::thread_block tb = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(tb);

    // Each head processed by 32 threads (warp)
    constexpr int threads_per_head = 32;
    int head_id = tb.thread_rank() / threads_per_head;
    int local_tid = tb.thread_rank() % threads_per_head;

    // Find max for numerical stability (partitioned by head)
    float max_val = reduce::init<reduce::ROpType::Max, float>();
    for (int i = local_tid; i < seq_len; i += threads_per_head) {
        float val = __half2float(scores[head_id * seq_len + i]);
        max_val = reduce::element<reduce::ROpType::Max>(max_val, val);
    }

    // Reduce within partition (only within same head)
    reduce::partitioned_block<reduce::ROpType::Max, threads_per_head>(
        tb, warp, max_val);

    // Softmax: exp and sum
    float sum = 0.0f;
    for (int i = local_tid; i < seq_len; i += threads_per_head) {
        int idx = head_id * seq_len + i;
        float val = __half2float(scores[idx]);
        val = expf(val - max_val);
        scores[idx] = __float2half(val);
        sum += val;
    }

    reduce::partitioned_block<reduce::ROpType::Add, threads_per_head>(
        tb, warp, sum);

    // Normalize
    for (int i = local_tid; i < seq_len; i += threads_per_head) {
        int idx = head_id * seq_len + i;
        float val = __half2float(scores[idx]);
        scores[idx] = __float2half(val / sum);
    }
}

Half2 Vector Reduction:

__device__ float reduce_half2_array(__half2* data, int n) {
    cg::thread_block tb = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(tb);

    __half2 sum2 = __float2half2_rn(0.0f);

    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        sum2 = reduce::element<reduce::ROpType::Add>(sum2, data[i]);
    }

    // Convert to scalar and reduce
    float2 sum_f2 = __half22float2(sum2);
    float sum = sum_f2.x + sum_f2.y;
    reduce::block<reduce::ROpType::Add>(tb, warp, sum);

    return sum;
}

Related Pages

Quantization Utils - Uses reductions for parameter computation
Custom CUDA Layers - Applies reductions in normalization
Normalize Layer - Mean/variance computation via reductions

Page Connections

Double-click a node to navigate. Hold to expand connections.

Principle

Implementation

Heuristic

Environment