Implementation:Microsoft Autogen Studio Eval Runners
| Sources | python/packages/autogen-studio/autogenstudio/eval/runners.py |
|---|---|
| Domains | Evaluation, Agent_Systems, LLM, Task_Execution |
| Last Updated | 2026-02-11 |
Overview
Description
The Studio Eval Runners module provides an extensible framework for executing evaluation tasks against different types of executors. It includes an abstract base class (with its configuration model) and two concrete runners that execute tasks using either a single LLM model client or a team of agents.
The module defines:
- BaseEvalRunner - Abstract base class defining the runner interface with component serialization
- BaseEvalRunnerConfig - Configuration model for base runner parameters
- ModelEvalRunner - Concrete implementation that executes tasks using a single LLM model client
- ModelEvalRunnerConfig - Configuration model including model client settings
- TeamEvalRunner - Concrete implementation that executes tasks using a team of agents
- TeamEvalRunnerConfig - Configuration model including team configuration
Both runners handle task input conversion (text or multimodal), execute the task, capture timing information, and handle errors gracefully. They leverage AutoGen's component system for serialization and deserialization.
Usage
Runners are the execution layer of the evaluation pipeline. They take EvalTask objects, execute them using their configured executor (model or team), and produce EvalRunResult objects. These results are then passed to judges for quality assessment. The runner system is designed to be extensible, allowing developers to create custom runners for different execution strategies.
Code Reference
Source Location
python/packages/autogen-studio/autogenstudio/eval/runners.py
Signature
class BaseEvalRunnerConfig(BaseModel):
"""Base configuration for evaluation runners."""
name: str
description: str = ""
metadata: Dict[str, Any] = {}
class BaseEvalRunner(ABC, ComponentBase[BaseEvalRunnerConfig]):
"""Base class for evaluation runners."""
component_type = "eval_runner"
def __init__(
self,
name: str,
description: str = "",
metadata: Optional[Dict[str, Any]] = None
)
@abstractmethod
async def run(
self,
task: EvalTask,
cancellation_token: Optional[CancellationToken] = None
) -> EvalRunResult
def _to_config(self) -> BaseEvalRunnerConfig
class ModelEvalRunnerConfig(BaseEvalRunnerConfig):
"""Configuration for ModelEvalRunner."""
model_client: ComponentModel
class ModelEvalRunner(BaseEvalRunner, Component[ModelEvalRunnerConfig]):
"""Evaluation runner that uses a single LLM to process tasks."""
component_config_schema = ModelEvalRunnerConfig
component_type = "eval_runner"
component_provider_override = "autogenstudio.eval.runners.ModelEvalRunner"
def __init__(
self,
model_client: ChatCompletionClient,
name: str = "Model Runner",
description: str = "Evaluates tasks using a single LLM",
metadata: Optional[Dict[str, Any]] = None,
)
async def run(
self,
task: EvalTask,
cancellation_token: Optional[CancellationToken] = None
) -> EvalRunResult
def _to_config(self) -> ModelEvalRunnerConfig
@classmethod
def _from_config(cls, config: ModelEvalRunnerConfig) -> Self
class TeamEvalRunnerConfig(BaseEvalRunnerConfig):
"""Configuration for TeamEvalRunner."""
team: ComponentModel
class TeamEvalRunner(BaseEvalRunner, Component[TeamEvalRunnerConfig]):
"""Evaluation runner that uses a team of agents to process tasks."""
component_config_schema = TeamEvalRunnerConfig
component_type = "eval_runner"
component_provider_override = "autogenstudio.eval.runners.TeamEvalRunner"
def __init__(
self,
team: Union[Team, ComponentModel],
name: str = "Team Runner",
description: str = "Evaluates tasks using a team of agents",
metadata: Optional[Dict[str, Any]] = None,
)
async def run(
self,
task: EvalTask,
cancellation_token: Optional[CancellationToken] = None
) -> EvalRunResult
def _to_config(self) -> TeamEvalRunnerConfig
@classmethod
def _from_config(cls, config: TeamEvalRunnerConfig) -> Self
Import
from autogenstudio.eval.runners import (
BaseEvalRunner,
BaseEvalRunnerConfig,
ModelEvalRunner,
ModelEvalRunnerConfig,
TeamEvalRunner,
TeamEvalRunnerConfig
)
I/O Contract
Inputs
| Parameter | Type | Description |
|---|---|---|
| task | EvalTask | The evaluation task to execute, containing input (text or multimodal), name, description, and metadata |
| cancellation_token | Optional[CancellationToken] | Token to cancel the task execution |
| model_client (ModelEvalRunner) | ChatCompletionClient | LLM client used to execute the task |
| team (TeamEvalRunner) | Union[Team, ComponentModel] | Team of agents used to execute the task |
Outputs
| Return Type | Description |
|---|---|
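| EvalRunResult | Result object containing: `status` (whether the run succeeded), `result` (the TaskResult holding the produced messages, when available), `error` (error description when the run failed), and `start_time` / `end_time` (timestamps from which the run duration is computed) |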
Usage Examples
Creating and Using a Model Runner
from autogenstudio.eval.runners import ModelEvalRunner
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogenstudio.datamodel.eval import EvalTask
# Create model client
model_client = OpenAIChatCompletionClient(
model="gpt-4",
api_key="your-api-key"
)
# Create model runner
model_runner = ModelEvalRunner(
model_client=model_client,
name="GPT-4 Runner",
description="Executes tasks using GPT-4"
)
# Create evaluation task
task = EvalTask(
name="Math Question",
description="Test mathematical reasoning",
input="What is 15% of 240?",
expected_outputs=["36"]
)
# Run task
result = await model_runner.run(task)
# Check result
if result.status:
print(f"Task completed successfully")
print(f"Duration: {(result.end_time - result.start_time).total_seconds()}s")
if result.result:
for msg in result.result.messages:
print(f"Response: {msg.content}")
else:
print(f"Task failed: {result.error}")
Creating and Using a Team Runner
from autogenstudio.eval.runners import TeamEvalRunner
from autogen_agentchat.teams import RoundRobinGroupChat
from autogen_agentchat.agents import AssistantAgent
from autogen_ext.models.openai import OpenAIChatCompletionClient
# Create agents
model_client = OpenAIChatCompletionClient(
model="gpt-4",
api_key="your-api-key"
)
researcher = AssistantAgent(
name="researcher",
model_client=model_client,
system_message="You are a research assistant."
)
writer = AssistantAgent(
name="writer",
model_client=model_client,
system_message="You are a writing assistant."
)
# Create team
team = RoundRobinGroupChat([researcher, writer])
# Create team runner
team_runner = TeamEvalRunner(
team=team,
name="Research Team Runner",
description="Executes tasks using a research team"
)
# Create evaluation task
task = EvalTask(
name="Research Task",
description="Research and write about a topic",
input="Research and write a brief summary about quantum computing.",
metadata={"task_type": "research_and_write"}
)
# Run task
result = await team_runner.run(task)
# Access team result
if result.status and result.result:
print(f"Task completed in {(result.end_time - result.start_time).total_seconds()}s")
print(f"Number of messages: {len(result.result.messages)}")
# Display conversation
for msg in result.result.messages:
print(f"\n{msg.source}: {msg.content}")
Handling Multimodal Inputs
from autogenstudio.eval.runners import ModelEvalRunner
from autogenstudio.datamodel.eval import EvalTask
from autogen_core import Image
# Create multimodal task
task = EvalTask(
name="Image Analysis",
description="Analyze an image and describe it",
input=[
"What do you see in this image? Describe it in detail.",
Image.from_file("path/to/image.png")
],
metadata={"task_type": "vision"}
)
# Create runner with vision-capable model
from autogen_ext.models.openai import OpenAIChatCompletionClient
model_client = OpenAIChatCompletionClient(
model="gpt-4o", # Vision-capable model
api_key="your-api-key"
)
runner = ModelEvalRunner(model_client=model_client)
# Run multimodal task
result = await runner.run(task)
if result.status:
print("Image analysis result:")
print(result.result.messages[-1].content)
Serializing and Deserializing Runners
from autogenstudio.eval.runners import ModelEvalRunner, TeamEvalRunner
import json
# Create and serialize model runner
model_runner = ModelEvalRunner(
model_client=model_client,
name="Test Runner",
description="Runner for testing"
)
runner_config = model_runner.dump_component()
# Save to file
with open("runner_config.json", "w") as f:
json.dump(runner_config.model_dump(), f, indent=2)
# Load from file and deserialize
with open("runner_config.json", "r") as f:
config_dict = json.load(f)
# Reconstruct runner
from autogen_core import ComponentModel
config = ComponentModel(**config_dict)
restored_runner = ModelEvalRunner.load_component(config)
# Use restored runner
result = await restored_runner.run(task)
Creating a Custom Runner
from autogenstudio.eval.runners import BaseEvalRunner, BaseEvalRunnerConfig
from autogenstudio.datamodel.eval import EvalTask, EvalRunResult
from autogen_agentchat.base import TaskResult
from autogen_agentchat.messages import TextMessage
from datetime import datetime
from typing import Optional
from autogen_core import CancellationToken
class MockEvalRunner(BaseEvalRunner):
    """Test double that answers every task with a canned response.

    The reply text is ``"<mock_response>: <task.input>"`` and is attributed to
    the source ``"mock"``. No model or team is contacted.
    """

    def __init__(
        self,
        mock_response: str = "Mock response",
        name: str = "Mock Runner",
        description: str = "Returns mock responses"
    ):
        super().__init__(name, description)
        # Prefix placed in front of the task input in every reply.
        self.mock_response = mock_response

    async def run(
        self,
        task: EvalTask,
        cancellation_token: Optional[CancellationToken] = None
    ) -> EvalRunResult:
        """Produce a successful result wrapping a single mock message.

        Args:
            task: Task whose ``input`` is echoed back in the reply.
            cancellation_token: Accepted for interface compatibility; unused.

        Returns:
            An ``EvalRunResult`` with ``status=True`` and start/end timestamps
            taken around the simulated work.
        """
        began = datetime.now()

        # Short pause standing in for real processing time.
        import asyncio
        await asyncio.sleep(0.1)

        reply = TextMessage(
            content=f"{self.mock_response}: {task.input}",
            source="mock",
        )
        outcome = TaskResult(messages=[reply])

        return EvalRunResult(
            result=outcome,
            status=True,
            start_time=began,
            end_time=datetime.now(),
        )
# Use custom runner
mock_runner = MockEvalRunner(
mock_response="Simulated response for",
name="Test Mock Runner"
)
result = await mock_runner.run(task)
print(f"Mock result: {result.result.messages[0].content}")
Running Multiple Tasks in Parallel
from autogenstudio.eval.runners import ModelEvalRunner
import asyncio
# Create multiple tasks
tasks = [
EvalTask(name="Task 1", input="What is 2+2?"),
EvalTask(name="Task 2", input="What is the capital of France?"),
EvalTask(name="Task 3", input="Name three primary colors."),
EvalTask(name="Task 4", input="What is the largest planet?"),
]
# Run all tasks in parallel
results = await asyncio.gather(
*[model_runner.run(task) for task in tasks]
)
# Process results
for task, result in zip(tasks, results):
if result.status:
duration = (result.end_time - result.start_time).total_seconds()
print(f"{task.name}: Success ({duration:.2f}s)")
if result.result:
print(f" Response: {result.result.messages[-1].content}")
else:
print(f"{task.name}: Failed - {result.error}")
Error Handling and Retry Logic
from autogenstudio.eval.runners import ModelEvalRunner
import asyncio
async def run_with_retry(
    runner: ModelEvalRunner,
    task: EvalTask,
    max_retries: int = 3
) -> EvalRunResult:
    """Run ``task`` on ``runner``, retrying with exponential backoff.

    An attempt counts as failed when ``runner.run`` raises or returns a result
    whose ``status`` is falsy. Between attempts the helper sleeps
    ``2 ** attempt`` seconds (1, 2, 4, ...); no sleep follows the last attempt.

    Args:
        runner: Runner used to execute the task.
        task: Task to execute.
        max_retries: Maximum number of attempts.

    Returns:
        The first successful ``EvalRunResult``. Otherwise a failure result
        whose ``error`` starts with ``"Failed after {max_retries} attempts"``
        followed by the last observed error, and whose timestamps span the
        whole retry window.
    """
    from datetime import datetime

    # Taken before the first attempt so a failure result reports the real
    # elapsed time instead of a zero-length interval.
    started = datetime.now()
    last_error = None

    for attempt in range(max_retries):
        try:
            result = await runner.run(task)
        except Exception as e:
            # Retry boundary: any runner error is reported and retried.
            last_error = str(e)
            print(f"Attempt {attempt + 1} raised exception: {e}")
        else:
            if result.status:
                return result
            last_error = result.error
            print(f"Attempt {attempt + 1} failed: {result.error}")

        # Same backoff for both failure kinds; skipped after the last attempt.
        if attempt < max_retries - 1:
            wait_time = 2 ** attempt  # Exponential backoff
            print(f"Retrying in {wait_time} seconds...")
            await asyncio.sleep(wait_time)

    # Return failure result after all retries, keeping the last cause visible.
    error = f"Failed after {max_retries} attempts"
    if last_error:
        error = f"{error}: {last_error}"
    return EvalRunResult(
        status=False,
        error=error,
        start_time=started,
        end_time=datetime.now()
    )
# Use retry logic
result = await run_with_retry(model_runner, task, max_retries=3)
Related Pages
- Studio Eval Datamodel - Data models used by runners
- Studio Eval Judges - Judges that evaluate runner results
- Studio Datamodel Pydantic Types - Additional data types
- Evaluation Domain - All evaluation-related implementations
- Agent Systems Domain - Agent team implementations
- Task Execution Domain - Task execution implementations