Implementation:Deepspeedai DeepSpeed Reduction Utils
| Knowledge Sources | |
|---|---|
| Domains | CUDA_Kernels, Parallel_Algorithms, Performance_Optimization |
| Last Updated | 2026-02-09 00:00 GMT |
Overview
Templated CUDA reduction primitives providing warp-level and block-level reductions for sum, min, and max operations across multiple data types.
Description
This header implements a comprehensive reduction framework that uses cooperative groups for efficient parallel reductions in CUDA kernels. It provides three levels of reduction abstraction: element-wise operations for serial loops, warp-level reductions using shuffle intrinsics, and block-level reductions with shared memory synchronization. The template-based design supports multiple reduction operations (Add, Max, Min), selected through the ROpType enum, and works across various data types, including float, double, __half, __half2, __nv_bfloat16, and integer types. The framework includes specialized block reduction functions that can process up to 4 independent reductions simultaneously, and partitioned_block reductions for scenarios where a thread block is divided into multiple independent reduction groups. All implementations are optimized for warp-level parallelism and minimize shared memory synchronization overhead.
Usage
Use these utilities whenever implementing kernels that require parallel aggregation operations like computing statistics (mean, variance), finding extrema, or accumulating values across threads. The API automatically handles the complexity of warp shuffles and shared memory coordination.
Code Reference
Source Location
- Repository: DeepSpeed
- File: csrc/includes/reduction_utils.h
Signature
// Public entry points of the reduction helpers, all inside namespace `reduce`.
// NOTE(review): DS_D_INLINE, cg, hw_warp_size and max_warps are declared
// elsewhere (not visible in this listing); `cg` is presumably an alias for
// cooperative_groups -- confirm against the header.
namespace reduce {
// Selects the reduction operator: sum (Add), maximum (Max) or minimum (Min).
enum class ROpType { Add, Max, Min };
// Element-wise reduction (for serial loops)
// Combines `lhs` and `rhs` with operator `Op` and returns the result; used to
// fold values into a per-thread accumulator before a warp/block reduction.
template <ROpType Op, typename T>
DS_D_INLINE T element(const T lhs, const T rhs);
// Initialize accumulator
// Returns the starting accumulator value for operator `OType`; `T` defaults
// to float when not specified.
template <ROpType OType, typename T = float>
DS_D_INLINE T init();
// Full block reduction (single value)
// `val` is both input (this thread's partial) and output (the reduced value).
// `tb` is the thread block group and `warp` its hw_warp_size-wide tile.
// NOTE(review): `warp_bound` presumably caps the number of warps taking part
// (default max_warps) -- confirm against the implementation.
template <ROpType Op, int warp_bound = max_warps>
DS_D_INLINE void block(cg::thread_block& tb,
cg::thread_block_tile<hw_warp_size>& warp,
float& val);
// Multiple simultaneous reductions
// Reduces two values in one call; `Op1` is paired with `val1` and `Op2` with
// `val2`. Both references are updated in place.
template <ROpType Op1, ROpType Op2, int warp_bound = max_warps>
DS_D_INLINE void block(cg::thread_block& tb,
cg::thread_block_tile<hw_warp_size>& warp,
float& val1, float& val2);
// Partitioned block reduction (multiple independent groups)
// `num_threads` is the number of threads per partition; `val` is reduced in
// place within the partition the calling thread belongs to.
template <ROpType Op, int num_threads>
DS_D_INLINE void partitioned_block(cg::thread_block& tb,
cg::thread_block_tile<hw_warp_size>& warp,
float& val);
}
Import
#include "csrc/includes/reduction_utils.h"
I/O Contract
| Input | Type | Description |
|---|---|---|
| val | float& | Value to reduce (input/output parameter) |
| Op | ROpType (template) | Reduction operation: Add, Max, or Min |
| tb | cg::thread_block& | Thread block cooperative group |
| warp | cg::thread_block_tile<32>& | Warp-level tile |
| num_threads | int (template) | Threads per partition (for partitioned_block only) |
| Output | Type | Description |
|---|---|---|
| val | float& | Reduced value, written back in place (same in all threads of the block after block; same in all threads of a partition after partitioned_block) |
Usage Examples
Block-Wide Sum Reduction:
// Sums input[0..n) within one thread block and stores the total in
// output[<block index x>].
//
// Each thread first folds a strided slice of the input (start = its rank in
// the block, step = block size) into a private partial; reduce::block then
// combines the partials. Only the rank-0 thread of the block stores the result.
//
// NOTE(review): the traversal does not depend on the block index, so in a
// multi-block launch every block walks the whole input and writes the same
// total into its own output slot -- confirm that this is the intended launch
// contract.
__global__ void sum_array(const float* input, float* output, int n) {
    cg::thread_block tb = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(tb);

    // Per-thread partial over elements rank, rank + size, rank + 2*size, ...
    float partial = 0.0f;
    int idx = tb.thread_rank();
    while (idx < n) {
        partial += input[idx];
        idx += tb.size();
    }

    // Collective call: every thread of the block must reach it.
    reduce::block<reduce::ROpType::Add>(tb, warp, partial);

    // Everything after the reduction is single-thread work.
    if (tb.thread_rank() != 0) {
        return;
    }
    output[tb.group_index().x] = partial;
}
Computing Mean and Variance:
// Computes the mean and (population) variance of data[0..n) with one fused
// block reduction.
//
// Parameters:
//   data - n half-precision inputs; converted to float before accumulation.
//   mean - receives sum(x) / n.
//   var  - receives sum(x^2) / n - mean^2, clamped to be non-negative.
//   n    - element count. For n <= 0 both outputs are set to 0 instead of the
//          NaN that 0 / 0 would produce.
//
// Each thread accumulates a strided slice (start = rank in block, step = block
// size); the rank-0 thread writes the results.
__global__ void compute_statistics(const __half* data, float* mean, float* var, int n) {
    cg::thread_block tb = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(tb);

    // Empty input: `n` is a kernel argument, hence identical for every thread,
    // so the whole block leaves together and no thread is left waiting inside
    // the collective reduction below.
    if (n <= 0) {
        if (tb.thread_rank() == 0) {
            *mean = 0.0f;
            *var = 0.0f;
        }
        return;
    }

    float sum = 0.0f, sum_sq = 0.0f;
    for (int i = tb.thread_rank(); i < n; i += tb.size()) {
        float val = __half2float(data[i]);
        sum += val;
        sum_sq += val * val;
    }

    // Simultaneous reduction of both sums
    reduce::block<reduce::ROpType::Add, reduce::ROpType::Add>(
        tb, warp, sum, sum_sq);

    if (tb.thread_rank() == 0) {
        const float m = sum / n;
        const float v = (sum_sq / n) - (m * m);
        *mean = m;
        // E[x^2] - E[x]^2 can round to a tiny negative number for
        // near-constant data. Clamp with a comparison (not fmaxf) so that a
        // NaN variance still propagates instead of being replaced by 0.
        *var = (v < 0.0f) ? 0.0f : v;
    }
}
Finding Min/Max for Normalization:
// Min-max normalizes data[0..n) in place to the interval [0, 1].
//
// Pass 1: each thread scans a strided slice (start = rank in block, step =
//         block size) for its local minimum and maximum.
// Pass 2: after a fused Min/Max block reduction, each thread rewrites the same
//         slice as (x - min) / (max - min).
//
// When all elements are equal (max == min) the division would be 0 / 0; such
// inputs are mapped to 0 instead of NaN.
//
// NOTE(review): pass 2 relies on the reduced min/max being available in every
// thread after reduce::block, as stated in this page's I/O contract.
__global__ void normalize_kernel(__half* data, int n) {
    cg::thread_block tb = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(tb);

    // Initialize to sentinel values
    float local_min = reduce::init<reduce::ROpType::Min, float>();
    float local_max = reduce::init<reduce::ROpType::Max, float>();

    // Find local extrema
    for (int i = tb.thread_rank(); i < n; i += tb.size()) {
        float val = __half2float(data[i]);
        local_min = reduce::element<reduce::ROpType::Min>(local_min, val);
        local_max = reduce::element<reduce::ROpType::Max>(local_max, val);
    }

    // Global min/max across block
    reduce::block<reduce::ROpType::Min, reduce::ROpType::Max>(
        tb, warp, local_min, local_max);

    // Normalize in [0, 1]
    const float range = local_max - local_min;
    // Exact comparison on purpose: only a true zero range divides 0 by 0, and
    // a NaN range still propagates as NaN exactly as before.
    const bool degenerate = (range == 0.0f);
    for (int i = tb.thread_rank(); i < n; i += tb.size()) {
        float val = __half2float(data[i]);
        val = degenerate ? 0.0f : (val - local_min) / range;
        data[i] = __float2half(val);
    }
}
Partitioned Reduction for Multi-Head Attention:
// In-place softmax over rows of `scores`, one row ("head") per 32-thread
// partition of the block.
//
// Parameters:
//   scores  - rows of seq_len half values; the row of head h starts at
//             h * seq_len. Overwritten with the softmax of each row.
//   batch   - not referenced by this kernel. NOTE(review): no block-index
//             offset is applied to `scores`, so the kernel as written
//             addresses a single [heads, seq_len] slab -- confirm the
//             intended launch layout.
//   heads   - number of valid rows. Partitions whose head index is >= heads
//             perform no memory access.
//   seq_len - elements per row.
//
// Steps per partition: row max (for numerical stability), exp(x - max) with a
// running sum, then division by the reduced sum.
__global__ void attention_softmax_partitioned(__half* scores,
                                              int batch, int heads,
                                              int seq_len) {
    cg::thread_block tb = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(tb);

    // Each head processed by 32 threads (warp)
    constexpr int threads_per_head = 32;
    const int head_id = tb.thread_rank() / threads_per_head;
    const int local_tid = tb.thread_rank() % threads_per_head;

    // Partitions beyond the last head get an empty row instead of returning
    // early: they still have to take part in the collective reductions below,
    // but must never touch `scores` (that would be out of bounds).
    const int row_len = (head_id < heads) ? seq_len : 0;
    const int row_base = head_id * seq_len;

    // Find max for numerical stability (partitioned by head)
    float max_val = reduce::init<reduce::ROpType::Max, float>();
    for (int i = local_tid; i < row_len; i += threads_per_head) {
        float val = __half2float(scores[row_base + i]);
        max_val = reduce::element<reduce::ROpType::Max>(max_val, val);
    }

    // Reduce within partition (only within same head)
    reduce::partitioned_block<reduce::ROpType::Max, threads_per_head>(
        tb, warp, max_val);

    // Softmax: exp and sum
    float sum = 0.0f;
    for (int i = local_tid; i < row_len; i += threads_per_head) {
        int idx = row_base + i;
        float val = __half2float(scores[idx]);
        val = expf(val - max_val);
        scores[idx] = __float2half(val);
        sum += val;
    }

    reduce::partitioned_block<reduce::ROpType::Add, threads_per_head>(
        tb, warp, sum);

    // Normalize. Each thread re-reads only the elements it wrote itself in
    // the previous loop (same local_tid stride).
    for (int i = local_tid; i < row_len; i += threads_per_head) {
        int idx = row_base + i;
        float val = __half2float(scores[idx]);
        scores[idx] = __float2half(val / sum);
    }
}
Half2 Vector Reduction:
// Block-wide sum of all 2*n half values stored in data[0..n) as __half2 pairs.
//
// Each thread accumulates a strided slice with packed __half2 adds, collapses
// its two lanes into one float, and the per-thread floats are combined with
// reduce::block. Returns the reduced sum.
//
// Must be called by all threads of the block (it contains a collective
// reduction).
//
// NOTE(review): the per-thread running sum is kept in __half2, so long slices
// can lose precision or overflow the half range before the float conversion;
// accumulate in float if that matters for the caller.
__device__ float reduce_half2_array(__half2* data, int n) {
    cg::thread_block tb = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(tb);

    __half2 sum2 = __float2half2_rn(0.0f);
    // Stride by the linear rank/size of the whole block, not by
    // threadIdx.x / blockDim.x: the reduction below spans every thread of
    // `tb`, so in a 2D/3D block an x-only stride would make each y/z slice
    // add the same elements again.
    for (int i = tb.thread_rank(); i < n; i += tb.size()) {
        sum2 = reduce::element<reduce::ROpType::Add>(sum2, data[i]);
    }

    // Convert to scalar and reduce
    float2 sum_f2 = __half22float2(sum2);
    float sum = sum_f2.x + sum_f2.y;
    reduce::block<reduce::ROpType::Add>(tb, warp, sum);
    return sum;
}
Related Pages
- Quantization Utils - Uses reductions for parameter computation
- Custom CUDA Layers - Applies reductions in normalization
- Normalize Layer - Mean/variance computation via reductions