Implementation:Triton inference server Server Ensemble Test Validation
| Field | Value |
|---|---|
| Implementation Name | Ensemble_Test_Validation |
| Implements | Principle:Triton_inference_server_Server_Ensemble_Validation |
| Domains | Quality_Assurance, Model_Serving, Testing |
| Status | Active |
| Last Updated | 2026-02-13 17:00 GMT |
Overview
Concrete test validation pattern for ensemble models using unittest, numpy, and tritonclient. This implementation covers mathematical correctness testing, inference statistics verification, partial output validation, and sequence flag propagation checks.
Description
The ensemble test validation implementation provides a structured approach to verifying ensemble pipeline correctness. It uses Python's unittest framework combined with tritonclient for inference and numpy for numerical comparison.
Key validation patterns:
- `infer_exact` — A utility that tests known input/output relationships by generating deterministic inputs and comparing outputs against expected mathematical results
- `np.allclose` — Numerical comparison with configurable tolerance for floating-point outputs
- Inference statistics — Server-side statistics verify that the correct composing models and versions were invoked
- Partial output testing — Validates that requesting a subset of outputs works correctly
Usage
This implementation is used when:
- Writing test suites for ensemble models
- Setting up CI/CD validation for ensemble pipeline deployments
- Debugging tensor routing issues in ensemble configurations
- Verifying that model version updates do not break ensemble correctness
Code Reference
Source Location
- `qa/L0_simple_ensemble/ensemble_test.py:L73-219` — `EnsembleTest` class with test methods
- `qa/L0_simple_ensemble/test.sh:L30-148` — Shell test runner for ensemble tests
Signature
import unittest
import numpy as np
import tritonclient.grpc as grpcclient
class EnsembleTest(unittest.TestCase):
    """Condensed sketch of the ensemble validation tests.

    NOTE(review): ``iu``, ``batch_size``, ``client``, ``input0``, ``input1``,
    ``output0``, ``expected_output`` and ``model_name`` are not defined in
    this snippet; they are assumed to be provided by the surrounding test
    module.
    """

    def test_ensemble_add_sub(self):
        """Validate ensemble outputs match expected mathematical relationship."""
        # iu.infer_exact tests known input->output relationship
        iu.infer_exact(
            self,
            model_name="ensemble_add_sub",
            tensor_shape=(batch_size, 16),
            batch_size=batch_size,
            input_dtype=np.float32,
            output0_dtype=np.float32,
            output1_dtype=np.float32,
        )

    def test_ensemble_partial_output(self):
        """Validate partial output requests and version routing."""
        result = client.infer(
            model_name="ensemble_add_sub",
            inputs=[input0, input1],
            outputs=[output0],  # Request only one output
        )
        # np.allclose only returns a bool; without an assertion the
        # comparison could never fail the test.
        self.assertTrue(
            np.allclose(result.as_numpy("OUTPUT0"), expected_output)
        )

        # Verify inference statistics for version routing.
        # NOTE(review): assumes `client` is the gRPC client imported above,
        # whose statistics response is a protobuf message read via
        # attributes rather than dict keys - confirm.
        stats = client.get_inference_statistics(model_name)
        self.assertGreater(stats.model_stats[0].inference_count, 0)
Import
import unittest
import numpy as np
import tritonclient.grpc as grpcclient
Key Assertions
| Assertion | Purpose | Usage |
|---|---|---|
| `np.allclose(actual, expected)` | Numerical correctness | Compare floating-point outputs with tolerance (default: atol=1e-8, rtol=1e-5) |
| `inference_count > 0` | Routing verification | Confirm that composing models were actually invoked |
| `assertEqual(version, expected_version)` | Version routing | Verify the correct model version was used |
| Sequence flag checks | State propagation | Verify start/end/ready flags propagate through stateful ensembles |
I/O Contract
Inputs
| Input | Type | Description |
|---|---|---|
| Running server with ensemble | Triton instance | A running Triton Inference Server with the ensemble model and all composing models loaded |
| Known input data | numpy arrays | Deterministic input tensors with mathematically derivable expected outputs |
| Expected relationships | mathematical functions | The expected mathematical relationship between inputs and outputs (e.g., add, subtract) |
Outputs
| Output | Type | Description |
|---|---|---|
| Test pass/fail | boolean | Whether all assertions passed |
| Assertion results | test report | Detailed information about which assertions passed or failed |
| Inference statistics | JSON | Server-side statistics confirming routing and version selection |
Usage Examples
Complete ensemble test class:
import unittest
import numpy as np
import tritonclient.grpc as grpcclient
class EnsembleTest(unittest.TestCase):
    """End-to-end checks for the ``ensemble_add_sub`` ensemble over gRPC.

    Requires a Triton server listening on ``localhost:8001`` with the
    ensemble and its composing model loaded.
    """

    @classmethod
    def setUpClass(cls):
        """Create one gRPC client shared by every test in the class."""
        cls.client = grpcclient.InferenceServerClient(url="localhost:8001")

    @classmethod
    def tearDownClass(cls):
        """Close the shared client so its gRPC channel is released."""
        cls.client.close()

    @staticmethod
    def _make_inputs(input0_data, input1_data):
        """Wrap two FP32 numpy arrays as the INPUT0/INPUT1 request tensors."""
        inputs = []
        for name, data in (("INPUT0", input0_data), ("INPUT1", input1_data)):
            tensor = grpcclient.InferInput(name, list(data.shape), "FP32")
            tensor.set_data_from_numpy(data)
            inputs.append(tensor)
        return inputs

    def test_ensemble_ready(self):
        """Verify ensemble model is loaded and ready."""
        self.assertTrue(self.client.is_model_ready("ensemble_add_sub"))

    def test_ensemble_correctness(self):
        """Validate end-to-end mathematical correctness."""
        input0_data = np.ones((1, 16), dtype=np.float32) * 3.0
        input1_data = np.ones((1, 16), dtype=np.float32) * 2.0

        result = self.client.infer(
            model_name="ensemble_add_sub",
            inputs=self._make_inputs(input0_data, input1_data),
            outputs=[
                grpcclient.InferRequestedOutput("OUTPUT0"),
                grpcclient.InferRequestedOutput("OUTPUT1"),
            ],
        )
        output0 = result.as_numpy("OUTPUT0")
        output1 = result.as_numpy("OUTPUT1")

        # Verify mathematical relationships
        expected_add = input0_data + input1_data  # 5.0
        expected_sub = input0_data - input1_data  # 1.0
        self.assertTrue(np.allclose(output0, expected_add))
        self.assertTrue(np.allclose(output1, expected_sub))

    def test_partial_output(self):
        """Validate that requesting a subset of outputs works."""
        ones = np.ones((1, 16), dtype=np.float32)

        # Request only OUTPUT0
        result = self.client.infer(
            model_name="ensemble_add_sub",
            inputs=self._make_inputs(ones, ones),
            outputs=[grpcclient.InferRequestedOutput("OUTPUT0")],
        )
        output0 = result.as_numpy("OUTPUT0")
        expected = np.ones((1, 16), dtype=np.float32) * 2.0
        self.assertTrue(np.allclose(output0, expected))

    def test_inference_statistics(self):
        """Verify routing via inference statistics."""
        ones = np.ones((1, 16), dtype=np.float32)

        # Run an inference first
        self.client.infer(
            model_name="ensemble_add_sub",
            inputs=self._make_inputs(ones, ones),
            outputs=[
                grpcclient.InferRequestedOutput("OUTPUT0"),
                grpcclient.InferRequestedOutput("OUTPUT1"),
            ],
        )

        # Check statistics for composing model
        stats = self.client.get_inference_statistics("add_sub_model")
        self.assertIsNotNone(stats)
        model_stats = stats.model_stats
        self.assertGreater(len(model_stats), 0)

        # Verify the composing model was invoked. A statistics entry
        # existing is not enough on its own, so require a non-zero
        # inference count on at least one reported version.
        self.assertTrue(
            any(entry.inference_count > 0 for entry in model_stats),
            "composing model 'add_sub_model' reports no inferences",
        )


if __name__ == "__main__":
    unittest.main()
Shell test runner pattern:
#!/bin/bash
# Start Triton, wait until it reports ready, run the ensemble tests, and
# exit with the test status. The server is stopped on every exit path.

# Start Triton server with ensemble model repository
tritonserver --model-repository=/models &
SERVER_PID=$!

# Cleanup: stop the server whenever the script exits, including early failures
trap 'kill "$SERVER_PID" 2>/dev/null; wait "$SERVER_PID" 2>/dev/null' EXIT

# Wait for server to be ready.
# -f makes curl fail on an HTTP error status, so a server that is up but
# not yet ready does not end the wait early.
SERVER_READY=0
for i in $(seq 1 30); do
    if curl -sf -o /dev/null localhost:8000/v2/health/ready; then
        SERVER_READY=1
        break
    fi
    sleep 1
done

if [ "$SERVER_READY" -ne 1 ]; then
    echo "Triton server did not become ready within 30 seconds" >&2
    exit 1
fi

# Run ensemble tests
python -m pytest ensemble_test.py -v
TEST_EXIT_CODE=$?

exit "$TEST_EXIT_CODE"
Batch size variation testing:
def test_ensemble_batch_sizes(self):
    """Test ensemble with various batch sizes.

    For each batch size, sends random FP32 inputs and checks the shape and
    values of both outputs. ``ensemble_add_sub`` is expected to return the
    element-wise sum as OUTPUT0 and the difference as OUTPUT1.
    """
    for batch_size in [1, 2, 4, 8]:
        with self.subTest(batch_size=batch_size):
            shape = (batch_size, 16)
            # Keep the generated data so the outputs can be validated.
            input0_data = np.random.rand(*shape).astype(np.float32)
            input1_data = np.random.rand(*shape).astype(np.float32)

            input0 = grpcclient.InferInput("INPUT0", [batch_size, 16], "FP32")
            input0.set_data_from_numpy(input0_data)
            input1 = grpcclient.InferInput("INPUT1", [batch_size, 16], "FP32")
            input1.set_data_from_numpy(input1_data)

            result = self.client.infer(
                model_name="ensemble_add_sub",
                inputs=[input0, input1],
                outputs=[
                    grpcclient.InferRequestedOutput("OUTPUT0"),
                    grpcclient.InferRequestedOutput("OUTPUT1"),
                ],
            )
            output0 = result.as_numpy("OUTPUT0")
            output1 = result.as_numpy("OUTPUT1")

            # Both requested outputs must keep the batch dimension.
            self.assertEqual(output0.shape, shape)
            self.assertEqual(output1.shape, shape)
            # Values must match the add/sub relationship for every row.
            self.assertTrue(np.allclose(output0, input0_data + input1_data))
            self.assertTrue(np.allclose(output1, input0_data - input1_data))