Implementation:Online ml River Dummy Estimators
| Knowledge Sources | |
|---|---|
| Domains | Online_Learning, Baseline_Models, Testing, Benchmarking |
| Last Updated | 2026-02-08 16:00 GMT |
Overview
Simple baseline estimators for testing and establishing performance baselines in online learning scenarios.
Description
The dummy module provides three naive estimators that serve as baselines for comparison and testing: NoChangeClassifier, PriorClassifier, and StatisticRegressor. These models implement the simplest possible prediction strategies without learning complex patterns.
NoChangeClassifier predicts the most recently seen class label. Its predict_one returns the last class while predict_proba_one returns probability 1 for the last class and 0 for all others. This is useful as a baseline for time series classification where the next label is often the same as the current one.
PriorClassifier uses the empirical class distribution from all seen data. It predicts the most common class and returns normalized class frequencies as probabilities. This represents the prior distribution and is a standard baseline for classification tasks, showing what accuracy you'd get by always predicting the majority class.
StatisticRegressor wraps any univariate statistic (such as Mean, or Quantile(0.5) for the median) and always predicts that statistic's current value, regardless of the input features. This provides a simple regression baseline that shows what performance you would get by ignoring the features entirely.
All three models are primarily used for testing purposes, establishing baselines, and sanity checking more complex models.
Usage
Use dummy estimators to establish baseline performance metrics before training complex models. They're particularly useful in unit tests, for sanity checking pipelines, and for comparing whether a sophisticated model actually outperforms naive strategies. In some cases, these simple baselines can be surprisingly competitive.
Code Reference
Source Location
- Repository: Online_ml_River
- File: river/dummy.py
Signature
class NoChangeClassifier(base.Classifier):
    """Baseline classifier that predicts the most recently seen class label."""

    def __init__(self):
        ...


class PriorClassifier(base.Classifier):
    """Baseline classifier that predicts from the empirical class distribution."""

    def __init__(self):
        ...


class StatisticRegressor(base.Regressor):
    """Baseline regressor that always predicts the wrapped statistic's value."""

    def __init__(self, statistic: stats.base.Univariate):
        ...
Import
from river import dummy
I/O Contract
| Parameter | Type | Description |
|---|---|---|
| x | dict | Feature dictionary (ignored) |
| y | any | Class label to remember |
| Parameter | Type | Description |
|---|---|---|
| x | dict | Feature dictionary (ignored) |
| y | any | Class label to count |
| Parameter | Type | Description |
|---|---|---|
| statistic | stats.base.Univariate | Statistic to track (Mean, Quantile, etc.) |
| x | dict | Feature dictionary (ignored) |
| y | float | Target value to update statistic |
| Method | Return Type | Description |
|---|---|---|
| predict_one(x) | class or float | Predicted class or value |
| predict_proba_one(x) | dict | Class probabilities (classifiers only) |
| learn_one(x, y) | None | Updates internal state |
| Attribute | Type | Description |
|---|---|---|
| last_class | any | Most recently seen class |
| classes | set | Set of all seen classes |
| Attribute | Type | Description |
|---|---|---|
| counts | Counter | Count of each class |
| n | int | Total number of samples seen |
| Attribute | Type | Description |
|---|---|---|
| statistic | stats.base.Univariate | The wrapped statistic |
Usage Examples
from river import dummy
from river import stats
from river import metrics
from river import datasets
# Example 1: NoChangeClassifier
sentences = [
    ('glad happy glad', '+'),
    ('glad glad joyful', '+'),
    ('glad pleasant', '+'),
    ('miserable sad glad', '−'),
]

model = dummy.NoChangeClassifier()

# The dummy estimators ignore the features, so a raw string can stand in for x.
for sentence, label in sentences:
    model.learn_one(sentence, label)

new_sentence = 'glad sad miserable pleasant glad'

print(model.predict_one(new_sentence))
# '−' (last seen class)

print(model.predict_proba_one(new_sentence))
# {'+': 0, '−': 1}
# Example 2: PriorClassifier
model = dummy.PriorClassifier()

# Reuses `sentences` and `new_sentence` from Example 1.
for sentence, label in sentences:
    model.learn_one(sentence, label)

print(model.predict_one(new_sentence))
# '+' (most common class: 3 out of 4)

print(model.predict_proba_one(new_sentence))
# {'+': 0.75, '−': 0.25}
# Example 3: StatisticRegressor with Mean
sentences = [
    ('glad happy glad', 3),
    ('glad glad joyful', 3),
    ('glad pleasant', 2),
    ('miserable sad glad', -3),
]

model = dummy.StatisticRegressor(stats.Mean())

# Only the target values feed the statistic; the sentence itself is ignored.
for sentence, score in sentences:
    model.learn_one(sentence, score)

print(model.predict_one(new_sentence))
# 1.25 (mean of 3, 3, 2, -3)
# Example 4: Comparing with real model
from river import linear_model
from river import preprocessing
from river import evaluate

dataset = datasets.Phishing()

# Baseline: prior distribution
baseline = dummy.PriorClassifier()

# Real model
model = (
    preprocessing.StandardScaler() |
    linear_model.LogisticRegression()
)

metric_baseline = metrics.Accuracy()
metric_model = metrics.Accuracy()

# Progressive validation: each estimator predicts a sample before learning
# from it, so every prediction is made on data it has not seen yet.
for x, y in dataset:
    # Baseline predictions
    y_pred_baseline = baseline.predict_one(x)
    metric_baseline.update(y, y_pred_baseline)
    baseline.learn_one(x, y)

    # Real model predictions
    y_pred_model = model.predict_one(x)
    metric_model.update(y, y_pred_model)
    model.learn_one(x, y)

print(f"Baseline accuracy: {metric_baseline.get():.3f}")
print(f"Model accuracy: {metric_model.get():.3f}")
print(f"Improvement: {(metric_model.get() - metric_baseline.get()):.3f}")
# Example 5: StatisticRegressor with different statistics
from river import datasets
from river import utils

dataset = datasets.TrumpApproval()

# Mean baseline
model_mean = dummy.StatisticRegressor(stats.Mean())

# Median baseline (more robust)
model_median = dummy.StatisticRegressor(stats.Quantile(0.5))

# Rolling mean (adapts to drift)
# NOTE: recent River releases no longer ship stats.RollingMean; a windowed
# statistic is built by wrapping a revertible statistic in utils.Rolling,
# which forwards update() and get() to the wrapped statistic.
model_rolling = dummy.StatisticRegressor(
    utils.Rolling(stats.Mean(), window_size=50)
)

metric_mean = metrics.MAE()
metric_median = metrics.MAE()
metric_rolling = metrics.MAE()

for x, y in dataset:
    # Mean predictions
    y_pred = model_mean.predict_one(x)
    metric_mean.update(y, y_pred)
    model_mean.learn_one(x, y)

    # Median predictions
    y_pred = model_median.predict_one(x)
    metric_median.update(y, y_pred)
    model_median.learn_one(x, y)

    # Rolling mean predictions
    y_pred = model_rolling.predict_one(x)
    metric_rolling.update(y, y_pred)
    model_rolling.learn_one(x, y)

print(f"Mean MAE: {metric_mean.get():.3f}")
print(f"Median MAE: {metric_median.get():.3f}")
print(f"Rolling Mean MAE: {metric_rolling.get():.3f}")
# Example 6: NoChangeClassifier for time series
# Useful when consecutive labels are correlated
dataset = datasets.Insects()

model_no_change = dummy.NoChangeClassifier()
model_prior = dummy.PriorClassifier()

metric_no_change = metrics.Accuracy()
metric_prior = metrics.Accuracy()

for x, y in dataset.take(1000):
    # NoChange: predicts last seen (skipped until a first label is known)
    if model_no_change.last_class is not None:
        y_pred = model_no_change.predict_one(x)
        metric_no_change.update(y, y_pred)

    # Prior: predicts most common (skipped until a first label is counted)
    if model_prior.n > 0:
        y_pred = model_prior.predict_one(x)
        metric_prior.update(y, y_pred)

    model_no_change.learn_one(x, y)
    model_prior.learn_one(x, y)

print(f"NoChange accuracy: {metric_no_change.get():.3f}")
print(f"Prior accuracy: {metric_prior.get():.3f}")
# Example 7: Using in pipeline (unusual but possible)
from river import compose

# A dummy estimator can close a pipeline even though it ignores features.
# This spells out the pipeline that the `|` operator would build.
scaler = preprocessing.StandardScaler()  # This transforms features
prior = dummy.PriorClassifier()  # But this ignores them!
pipeline = compose.Pipeline(scaler, prior)
# Example 8: Custom statistic regressor
class Last(stats.base.Univariate):
    """Univariate statistic that simply remembers the last value it was given.

    Defined at module level (rather than inside the regressor's ``__init__``)
    so the class is created once and instances can be pickled.
    """

    def __init__(self):
        # Value reported by get() until update() is called for the first time.
        self.last = 0

    def update(self, x):
        self.last = x

    def get(self):
        return self.last


class LastValueRegressor(dummy.StatisticRegressor):
    """Always predicts the last seen value"""

    def __init__(self):
        # Wrap a statistic that just stores the last value.
        super().__init__(Last())


model = LastValueRegressor()
# Example 9: Sanity check for model implementation
def sanity_check(model, dataset):
    """Check whether a model beats the majority-class (prior) baseline.

    The model and a ``dummy.PriorClassifier`` are both evaluated
    progressively (predict first, then learn) and scored with accuracy on
    exactly the same samples, so the two numbers are directly comparable.

    Args:
        model: Estimator exposing ``predict_one`` and ``learn_one``. It is
            only scored if it also exposes ``predict_proba_one``.
        dataset: Iterable yielding ``(x, y)`` pairs.

    Returns:
        True if the model's accuracy is strictly greater than the baseline's.
    """
    baseline = dummy.PriorClassifier()
    metric_model = metrics.Accuracy()
    metric_baseline = metrics.Accuracy()

    # The attribute check does not change during the loop, so do it once.
    scores_model = hasattr(model, 'predict_proba_one')

    for x, y in dataset:
        # Skip scoring until the baseline has seen at least one label, and
        # apply the same rule to the model so both metrics cover the same
        # samples.
        if baseline.n > 0:
            if scores_model:
                metric_model.update(y, model.predict_one(x))
            metric_baseline.update(y, baseline.predict_one(x))

        model.learn_one(x, y)
        baseline.learn_one(x, y)

    improvement = metric_model.get() - metric_baseline.get()
    print(f"Model: {metric_model.get():.3f}")
    print(f"Baseline: {metric_baseline.get():.3f}")
    print(f"Improvement: {improvement:.3f}")
    return improvement > 0
# Use it: run the sanity check for a Hoeffding tree on the Phishing dataset
from river import tree
model = tree.HoeffdingTreeClassifier()
dataset = datasets.Phishing()
# True when the tree's accuracy is higher than the PriorClassifier baseline's
passes_sanity = sanity_check(model, dataset)