Jump to content

Connect Leeroopedia MCP: Equip your AI agents to search best practices, build plans, verify code, diagnose failures, and look up hyperparameter defaults.

Implementation:FlagOpen FlagEmbedding MLVU Order Data

From Leeroopedia


Knowledge Sources
Domains Video Understanding, Temporal Reasoning, Sequence Understanding
Last Updated 2026-02-09 00:00 GMT

Overview

Benchmark dataset for testing temporal ordering and sequence understanding in videos.

Description

The MLVU Order dataset contains 3369 questions that test models' ability to understand and reason about the temporal order of events in videos. Questions ask models to arrange multiple events in the correct chronological sequence, testing both event recognition and temporal reasoning capabilities. Each question presents several events (typically 4) and asks the model to identify the correct ordering from multiple candidate sequences.

This dataset is particularly challenging because it requires:

  • Recognizing multiple distinct events in a video
  • Tracking temporal relationships between events
  • Understanding causality and logical sequence
  • Maintaining temporal context across video duration

The candidate orderings and the answer use arrow notation (e.g., "1->2->3->4") to represent event sequences; each number refers to one of the numbered events listed in the question.

Usage

Use this dataset for evaluating temporal reasoning in video understanding, benchmarking sequence prediction capabilities, or training models on event ordering tasks.

Code Reference

Source Location

Data Structure

{
    "video": str,              # Video filename (e.g., "order_126.mp4")
    "duration": float,         # Video duration in seconds
    "question": str,           # Question text listing the numbered events, e.g. "(1)...; (2)..."
    "candidates": List[str],   # Four candidate orderings (e.g., "1->2->3->4")
    "answer": str,             # Correct ordering, in the same arrow notation as the candidates
    "question_type": str       # Always "order"
}

Import

import json

# Load order dataset. The file is accepted either as a single JSON document
# (normally an array of records) or as JSON Lines (one record per line).
with open("research/MLVU/data/5_order.json", "r", encoding="utf-8") as f:
    raw_text = f.read()

try:
    order_data = json.loads(raw_text)
except json.JSONDecodeError:
    # Not one JSON document: parse line by line, skipping blank lines so a
    # trailing newline or empty line does not abort the load.
    order_data = [json.loads(line) for line in raw_text.split("\n") if line.strip()]
else:
    # A lone top-level object is treated as a one-record dataset.
    if not isinstance(order_data, list):
        order_data = [order_data]

I/O Contract

Inputs

Name Type Required Description
file_path str Yes Path to the order dataset JSON file

Outputs

Field Type Description
video str Video filename
duration float Video duration in seconds
question str Question about event ordering with event descriptions
candidates List[str] Four possible orderings
answer str Correct ordering
question_type str Type identifier ("order")

Usage Examples

import json
from typing import List, Dict

# Load order dataset
def load_order_data(file_path: str) -> List[Dict]:
    """Load the order dataset from ``file_path`` as a list of records.

    Two on-disk layouts are accepted:

    * a single JSON document, normally an array of records (a lone
      top-level object is returned as a one-element list);
    * JSON Lines, i.e. one JSON object per line. Blank lines are ignored.

    Args:
        file_path: Path to the dataset file, read as UTF-8 text.

    Returns:
        The parsed records. An empty or whitespace-only file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is neither a valid JSON
            document nor valid JSON Lines.
    """
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(file_path, "r", encoding="utf-8") as f:
        raw_text = f.read()

    try:
        parsed = json.loads(raw_text)
    except json.JSONDecodeError:
        # Several documents (or none at all): fall back to JSON Lines.
        # Split on "\n" only; str.splitlines() would also break on characters
        # such as U+2028 that may legally appear inside JSON strings.
        return [json.loads(line) for line in raw_text.split("\n") if line.strip()]

    return parsed if isinstance(parsed, list) else [parsed]

# Load every record; the relative path is resolved against the current
# working directory.
data = load_order_data("research/MLVU/data/5_order.json")

# Example entry: print the fields of the first record one per line.
example = data[0]  # raises IndexError if the dataset is empty
print(f"Video: {example['video']}")
print(f"Duration: {example['duration']:.2f}s")  # duration shown with two decimals
print(f"Question: {example['question']}")
print(f"Candidates: {example['candidates']}")
print(f"Answer: {example['answer']}")

# Output:
# Video: order_126.mp4
# Duration: 665.34s
# Question: Arrange the following events from the video in the correct
#           chronological order: (1)Woman tapes her hands with white tape;
#           (2)Woman starts boxing in the ring with a guy;
#           (3)Woman does sit ups on a towel on the beach;
#           (4)Pictures of woman in her bikini are shown.
# Candidates: ['2->1->3->4', '3->2->1->4', '4->3->2->1', '1->2->3->4']
# Answer: 1->2->3->4

# Parse event ordering
def parse_ordering(ordering: str) -> List[int]:
    """Convert an arrow-notation ordering into a list of event numbers.

    For example, ``"2->1->3"`` becomes ``[2, 1, 3]``.

    Raises:
        ValueError: If any piece between the arrows is not an integer.
    """
    pieces = ordering.split("->")
    return list(map(int, pieces))

# Evaluate temporal reasoning
def evaluate_ordering(model, data: List[Dict]) -> Dict[str, float]:
    """Score a model on the ordering questions.

    For every record the model is queried through
    ``model.predict_order(video_path, question, candidates)`` and its
    return value is compared for equality with the record's ``answer``.

    Args:
        model: Object exposing ``predict_order(video_path, question,
            candidates)``.
        data: Records with ``video``, ``question``, ``candidates`` and
            ``answer`` keys.

    Returns:
        A dict with ``"overall_acc"`` plus one ``"<n>_events"`` entry per
        distinct number of events found in the answers, each holding the
        accuracy over that subset. For an empty ``data`` the result is
        ``{"overall_acc": 0.0}``.
    """
    total = len(data)
    if total == 0:
        # Nothing to score; returning early avoids a division by zero.
        return {"overall_acc": 0.0}

    correct = 0
    # event count -> {"correct": ..., "total": ...}
    results_by_length = {}

    for item in data:
        # Videos are looked up under the relative "videos/" directory.
        video_path = f"videos/{item['video']}"

        predicted_order = model.predict_order(
            video_path,
            item['question'],
            item['candidates'],
        )

        is_correct = (predicted_order == item['answer'])
        correct += is_correct

        # Bucket the outcome by how many events the answer orders.
        event_count = len(parse_ordering(item['answer']))
        stats = results_by_length.setdefault(
            event_count, {"correct": 0, "total": 0}
        )
        stats["total"] += 1
        stats["correct"] += is_correct

    metrics = {"overall_acc": correct / total}
    # Every bucket holds at least one item, so these divisions are safe.
    for length, stats in results_by_length.items():
        metrics[f"{length}_events"] = stats["correct"] / stats["total"]

    return metrics

# Analyze ordering complexity
def analyze_complexity(data: List[Dict]) -> Dict:
    """Count questions per event-count bucket.

    The returned dict always has exactly three keys:

    * ``3`` - answers ordering 3 or fewer events
    * ``4`` - answers ordering exactly 4 events
    * ``5`` - answers ordering 5 or more events
    """
    histogram = dict.fromkeys((3, 4, 5), 0)

    for record in data:
        n_events = len(parse_ordering(record['answer']))
        # Clamp into [3, 5] so the outer buckets absorb smaller/larger counts.
        bucket = min(max(n_events, 3), 5)
        histogram[bucket] += 1

    return histogram

# Count the questions in each event-count bucket and print the resulting dict.
complexity = analyze_complexity(data)
print("Complexity distribution:", complexity)

# Check for temporal consistency
def has_reverse_order(candidates: List[str]) -> bool:
    """Return True if two different candidates are exact reverses of each other.

    Only the candidates are compared with one another; the correct answer
    is not an input to this check.
    """
    parsed = [parse_ordering(text) for text in candidates]
    count = len(parsed)
    # "Is the reverse of" is symmetric, so each unordered pair is tested once.
    return any(
        parsed[a] == parsed[b][::-1]
        for a in range(count)
        for b in range(a + 1, count)
    )

# Keep the questions whose candidate list contains at least one pair of
# orderings that are exact reverses of each other, then report how many
# such questions there are.
reverse_examples = [
    item for item in data
    if has_reverse_order(item['candidates'])
]
print(f"Questions with reverse ordering candidates: {len(reverse_examples)}")

# Extract event descriptions from questions
import re

def extract_events(question: str) -> List[str]:
    """Return the event descriptions that follow each ``(N)`` marker.

    A description starts right after a parenthesised number and runs up to
    the next semicolon or the end of the text; surrounding whitespace is
    stripped. Trailing punctuation other than ``;`` is kept.
    """
    numbered_event = re.compile(r'\(\d+\)([^;]+)')
    return [match.group(1).strip() for match in numbered_event.finditer(question)]

# Demonstrate the extraction on the first question in the dataset.
sample_events = extract_events(data[0]['question'])
print(f"Sample events: {sample_events}")

Related Pages

Page Connections

Double-click a node to navigate. Hold to expand connections.
Principle
Implementation
Heuristic
Environment