Jump to content

Connect SuperML | Leeroopedia MCP: Equip your AI agents with best practices, code verification, and debugging knowledge. Powered by Leeroo — building Organizational Superintelligence. Contact us at founders@leeroo.com.

Implementation:Rapidsai Cuml GLM Preprocess MG

From Leeroopedia


Knowledge Sources
Domains Machine_Learning, Linear_Models
Last Updated 2026-02-08 12:00 GMT

Overview

Provides multi-node multi-GPU (MNMG) pre-processing and post-processing functions for distributed GLM (Generalized Linear Model) operations, including mean centering and intercept computation.

Description

This header defines two pairs of functions in the ML::GLM::opg namespace for distributed data pre-processing and post-processing in the context of linear model fitting:

  • preProcessData: Centers the input feature matrix and labels by subtracting their column-wise means when fit_intercept is true. The computed means are stored in mu_input and mu_labels for later use in post-processing. This centering step is essential for correctly fitting an intercept in distributed OLS and Ridge regression.
  • postProcessData: Reverses the centering applied during pre-processing and computes the intercept from the coefficients fitted on the centered data and the stored means. It adds the stored means back so that the input data and labels return to their original, uncentered values, and writes the final intercept value.

Both functions have float and double overloads and operate on distributed data described by MLCommon::Matrix::PartDescriptor. They accept multiple CUDA streams for concurrent execution across GPU partitions.

Usage

Use preProcessData before fitting a distributed linear model (OLS/Ridge) to center the data, then use postProcessData after fitting to compute the intercept and restore the original data. These are typically called internally by the multi-GPU OLS/Ridge fitting pipeline.

Code Reference

Source Location

  • Repository: Rapidsai_Cuml
  • File: cpp/include/cuml/linear_model/preprocess_mg.hpp

Signature

namespace ML {
namespace GLM {
namespace opg {

/**
 * @brief Centers distributed input features and labels before fitting a linear model
 *        (single-precision overload).
 *
 * When fit_intercept is true, the column-wise means are subtracted from the input
 * partitions and the labels in place, and the means are stored in mu_input and
 * mu_labels so that postProcessData can use them later.
 *
 * @param[in]     handle         handle used for GPU resource management
 * @param[in,out] input_data     distributed input feature matrix partitions on device
 *                               (modified in place)
 * @param[in]     input_desc     MNMG descriptor for the input partitions
 * @param[in,out] labels         distributed label partitions on device (modified in place)
 * @param[out]    mu_input       device pointer receiving the column-wise means of the
 *                               input features
 * @param[out]    mu_labels      device pointer receiving the mean of the labels
 * @param[in]     fit_intercept  whether to center the data for intercept fitting
 * @param[in]     streams        array of CUDA streams for concurrent execution
 * @param[in]     n_streams      number of CUDA streams in streams
 * @param[in]     verbose        whether to print verbose output
 */
void preProcessData(raft::handle_t& handle,
                    std::vector<MLCommon::Matrix::Data<float>*>& input_data,
                    MLCommon::Matrix::PartDescriptor& input_desc,
                    std::vector<MLCommon::Matrix::Data<float>*>& labels,
                    float* mu_input,
                    float* mu_labels,
                    bool fit_intercept,
                    cudaStream_t* streams,
                    int n_streams,
                    bool verbose);

/**
 * @brief Centers distributed input features and labels before fitting a linear model
 *        (double-precision overload).
 *
 * When fit_intercept is true, the column-wise means are subtracted from the input
 * partitions and the labels in place, and the means are stored in mu_input and
 * mu_labels so that postProcessData can use them later.
 *
 * @param[in]     handle         handle used for GPU resource management
 * @param[in,out] input_data     distributed input feature matrix partitions on device
 *                               (modified in place)
 * @param[in]     input_desc     MNMG descriptor for the input partitions
 * @param[in,out] labels         distributed label partitions on device (modified in place)
 * @param[out]    mu_input       device pointer receiving the column-wise means of the
 *                               input features
 * @param[out]    mu_labels      device pointer receiving the mean of the labels
 * @param[in]     fit_intercept  whether to center the data for intercept fitting
 * @param[in]     streams        array of CUDA streams for concurrent execution
 * @param[in]     n_streams      number of CUDA streams in streams
 * @param[in]     verbose        whether to print verbose output
 */
void preProcessData(raft::handle_t& handle,
                    std::vector<MLCommon::Matrix::Data<double>*>& input_data,
                    MLCommon::Matrix::PartDescriptor& input_desc,
                    std::vector<MLCommon::Matrix::Data<double>*>& labels,
                    double* mu_input,
                    double* mu_labels,
                    bool fit_intercept,
                    cudaStream_t* streams,
                    int n_streams,
                    bool verbose);

/**
 * @brief Reverses the centering done by preProcessData and computes the intercept
 *        (single-precision overload).
 *
 * Uses the fitted coefficients together with the means stored by preProcessData to
 * write the intercept, and restores the input partitions and labels in place.
 *
 * @param[in]     handle         handle used for GPU resource management
 * @param[in,out] input_data     distributed input feature matrix partitions on device
 *                               (restored in place)
 * @param[in]     input_desc     MNMG descriptor for the input partitions
 * @param[in,out] labels         distributed label partitions on device (restored in place)
 * @param[in]     coef           device pointer to the fitted coefficients
 * @param[out]    intercept      receives the computed intercept value
 * @param[in]     mu_input       device pointer to the stored column means of the input
 *                               features
 * @param[in]     mu_labels      device pointer to the stored mean of the labels
 * @param[in]     fit_intercept  whether the data was centered for intercept fitting
 * @param[in]     streams        array of CUDA streams for concurrent execution
 * @param[in]     n_streams      number of CUDA streams in streams
 * @param[in]     verbose        whether to print verbose output
 */
void postProcessData(raft::handle_t& handle,
                     std::vector<MLCommon::Matrix::Data<float>*>& input_data,
                     MLCommon::Matrix::PartDescriptor& input_desc,
                     std::vector<MLCommon::Matrix::Data<float>*>& labels,
                     float* coef,
                     float* intercept,
                     float* mu_input,
                     float* mu_labels,
                     bool fit_intercept,
                     cudaStream_t* streams,
                     int n_streams,
                     bool verbose);

/**
 * @brief Reverses the centering done by preProcessData and computes the intercept
 *        (double-precision overload).
 *
 * Uses the fitted coefficients together with the means stored by preProcessData to
 * write the intercept, and restores the input partitions and labels in place.
 *
 * @param[in]     handle         handle used for GPU resource management
 * @param[in,out] input_data     distributed input feature matrix partitions on device
 *                               (restored in place)
 * @param[in]     input_desc     MNMG descriptor for the input partitions
 * @param[in,out] labels         distributed label partitions on device (restored in place)
 * @param[in]     coef           device pointer to the fitted coefficients
 * @param[out]    intercept      receives the computed intercept value
 * @param[in]     mu_input       device pointer to the stored column means of the input
 *                               features
 * @param[in]     mu_labels      device pointer to the stored mean of the labels
 * @param[in]     fit_intercept  whether the data was centered for intercept fitting
 * @param[in]     streams        array of CUDA streams for concurrent execution
 * @param[in]     n_streams      number of CUDA streams in streams
 * @param[in]     verbose        whether to print verbose output
 */
void postProcessData(raft::handle_t& handle,
                     std::vector<MLCommon::Matrix::Data<double>*>& input_data,
                     MLCommon::Matrix::PartDescriptor& input_desc,
                     std::vector<MLCommon::Matrix::Data<double>*>& labels,
                     double* coef,
                     double* intercept,
                     double* mu_input,
                     double* mu_labels,
                     bool fit_intercept,
                     cudaStream_t* streams,
                     int n_streams,
                     bool verbose);

};  // end namespace opg
};  // namespace GLM
};  // end namespace ML

Import

#include <cuml/linear_model/preprocess_mg.hpp>

I/O Contract

Inputs (preProcessData)

Name Type Required Description
handle raft::handle_t& Yes RAFT handle (as used by cuML) for GPU resource management
input_data std::vector<MLCommon::Matrix::Data<T>*>& Yes Distributed input feature matrix partitions on device (modified in-place)
input_desc MLCommon::Matrix::PartDescriptor& Yes MNMG descriptor for the input partitions
labels std::vector<MLCommon::Matrix::Data<T>*>& Yes Distributed label data partitions on device (modified in-place)
fit_intercept bool Yes Whether to center the data for intercept fitting
streams cudaStream_t* Yes Array of CUDA streams for concurrent execution
n_streams int Yes Number of CUDA streams
verbose bool Yes Whether to print verbose output

Inputs (postProcessData, additional)

Name Type Required Description
coef float*/double* Yes Device pointer to the fitted coefficients
mu_input float*/double* Yes Device pointer to the stored column means of input features
mu_labels float*/double* Yes Device pointer to the stored mean of labels

Outputs

Name Type Description
input_data std::vector<MLCommon::Matrix::Data<T>*>& Modified in-place: centered (pre) or restored (post)
labels std::vector<MLCommon::Matrix::Data<T>*>& Modified in-place: centered (pre) or restored (post)
mu_input float*/double* Column-wise means of the input features (from preProcessData)
mu_labels float*/double* Mean of the labels (from preProcessData)
intercept float*/double* Computed intercept value (from postProcessData)

Usage Examples

#include <cuml/linear_model/preprocess_mg.hpp>
#include <cuml/prims/opg/matrix/data.hpp>
#include <cuml/prims/opg/matrix/part_descriptor.hpp>
#include <raft/core/handle.hpp>

#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

/**
 * Example pipeline: center distributed data, fit a linear model, then restore the
 * data and compute the intercept.
 *
 * Every CUDA runtime call is checked, and the temporary device buffers and streams
 * are owned by RAII guards, so nothing leaks if a step throws part-way through.
 *
 * @param handle      handle used for GPU resource management
 * @param input_data  distributed input feature partitions (centered, then restored)
 * @param input_desc  MNMG descriptor for the input partitions
 * @param labels      distributed label partitions (centered, then restored)
 * @param coef        device pointer to the coefficients produced by the fitting step
 * @param intercept   receives the intercept computed by postProcessData
 * @throws std::runtime_error if a CUDA allocation, stream creation, or stream
 *         synchronization fails
 */
void distributed_ols_pipeline(
    raft::handle_t& handle,
    std::vector<MLCommon::Matrix::Data<float>*>& input_data,
    MLCommon::Matrix::PartDescriptor& input_desc,
    std::vector<MLCommon::Matrix::Data<float>*>& labels,
    float* coef,
    float* intercept) {

    // Turn a failed CUDA runtime call into an exception instead of ignoring it.
    const auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(status));
        }
    };

    // Owning handle for a cudaMalloc'd float buffer; frees it on scope exit.
    struct DeviceFree {
        void operator()(float* ptr) const noexcept { cudaFree(ptr); }
    };
    using DeviceBuffer = std::unique_ptr<float, DeviceFree>;
    const auto device_alloc = [&check](size_t count, const char* what) {
        float* raw = nullptr;
        check(cudaMalloc(&raw, count * sizeof(float)), what);
        return DeviceBuffer(raw);
    };

    // Owns the created streams; on scope exit waits for each one (best effort)
    // and then destroys it.
    struct StreamPool {
        std::vector<cudaStream_t> handles;
        ~StreamPool() {
            for (cudaStream_t stream : handles) {
                cudaStreamSynchronize(stream);
                cudaStreamDestroy(stream);
            }
        }
    };

    // Allocate mean storage
    const size_t n_cols = input_desc.N;  // number of features
    DeviceBuffer mu_input  = device_alloc(n_cols, "cudaMalloc(mu_input)");
    DeviceBuffer mu_labels = device_alloc(1, "cudaMalloc(mu_labels)");

    // Create CUDA streams. The pool is declared after the buffers so that it is
    // destroyed first: streams are drained before the buffers they use are freed.
    const int n_streams = 2;
    StreamPool pool;
    pool.handles.reserve(n_streams);
    for (int i = 0; i < n_streams; i++) {
        cudaStream_t stream{};
        check(cudaStreamCreate(&stream), "cudaStreamCreate");
        pool.handles.push_back(stream);
    }

    // Step 1: Center the data
    ML::GLM::opg::preProcessData(handle, input_data, input_desc, labels,
                                  mu_input.get(), mu_labels.get(),
                                  true,  // fit_intercept
                                  pool.handles.data(), n_streams,
                                  false); // verbose

    // Step 2: Fit the model on centered data (OLS/Ridge)...
    // ... produces coef ...

    // Step 3: Restore data and compute intercept
    ML::GLM::opg::postProcessData(handle, input_data, input_desc, labels,
                                   coef, intercept,
                                   mu_input.get(), mu_labels.get(),
                                   true,  // fit_intercept
                                   pool.handles.data(), n_streams,
                                   false); // verbose

    // Wait for all queued work and surface any asynchronous failure to the caller.
    // Stream destruction and buffer release then happen in the guards' destructors.
    for (cudaStream_t stream : pool.handles) {
        check(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
    }
}

Related Pages

Page Connections

Double-click a node to navigate. Hold to expand connections.
Principle
Implementation
Heuristic
Environment