Implementation: FlagOpen FlagEmbedding MLVU Order Data
| Knowledge Sources | |
|---|---|
| Domains | Video Understanding, Temporal Reasoning, Sequence Understanding |
| Last Updated | 2026-02-09 00:00 GMT |
Overview
Benchmark dataset for testing temporal ordering and sequence understanding in videos.
Description
The MLVU Order dataset contains 3369 questions that test models' ability to understand and reason about the temporal order of events in videos. Questions ask models to arrange multiple events in the correct chronological sequence, testing both event recognition and temporal reasoning capabilities. Each question presents several events (typically 4) and asks the model to identify the correct ordering from multiple candidate sequences.
This dataset is particularly challenging because it requires:
- Recognizing multiple distinct events in a video
- Tracking temporal relationships between events
- Understanding causality and logical sequence
- Maintaining temporal context across video duration
The questions use arrow notation (e.g., "1->2->3->4") to represent event sequences.
Usage
Use this dataset for evaluating temporal reasoning in video understanding, benchmarking sequence prediction capabilities, or training models on event ordering tasks.
Code Reference
Source Location
- Repository: FlagOpen_FlagEmbedding
- File: research/MLVU/data/5_order.json
Data Structure
{
"video": str, # Video filename
"duration": float, # Video duration in seconds
"question": str, # Question about event ordering
"candidates": List[str], # Four candidate orderings (e.g., "1->2->3->4")
"answer": str, # Correct ordering
"question_type": str # Always "order"
}
Import
import json

# Load order dataset
# NOTE(review): this reads the file as JSON Lines (one object per line) —
# confirm 5_order.json is not a single JSON array, which would need json.load.
with open("research/MLVU/data/5_order.json", "r") as f:
    order_data = list(map(json.loads, f))
I/O Contract
Inputs
| Name | Type | Required | Description |
|---|---|---|---|
| file_path | str | Yes | Path to the order dataset JSON file |
Outputs
| Field | Type | Description |
|---|---|---|
| video | str | Video filename |
| duration | float | Video duration in seconds |
| question | str | Question about event ordering with event descriptions |
| candidates | List[str] | Four possible orderings |
| answer | str | Correct ordering |
| question_type | str | Type identifier ("order") |
Usage Examples
import json
from typing import List, Dict
# Load order dataset
def load_order_data(file_path: str) -> List[Dict]:
    """Load the MLVU order dataset from a JSON Lines file.

    Args:
        file_path: Path to the order dataset file, one JSON object per line.
            NOTE(review): assumes JSONL layout — a single JSON array would
            need ``json.load`` instead; confirm against the released file.

    Returns:
        List of question dicts (video, duration, question, candidates,
        answer, question_type).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline): json.loads("") raises
        # JSONDecodeError, which would make a well-formed file fail to load.
        return [json.loads(line) for line in f if line.strip()]
# Load the full order split and pretty-print the fields of the first record.
data = load_order_data("research/MLVU/data/5_order.json")
# Example entry
example = data[0]
print(f"Video: {example['video']}")
print(f"Duration: {example['duration']:.2f}s")
print(f"Question: {example['question']}")
print(f"Candidates: {example['candidates']}")
print(f"Answer: {example['answer']}")
# Expected output (first entry of the released dataset):
# Video: order_126.mp4
# Duration: 665.34s
# Question: Arrange the following events from the video in the correct
# chronological order: (1)Woman tapes her hands with white tape;
# (2)Woman starts boxing in the ring with a guy;
# (3)Woman does sit ups on a towel on the beach;
# (4)Pictures of woman in her bikini are shown.
# Candidates: ['2->1->3->4', '3->2->1->4', '4->3->2->1', '1->2->3->4']
# Answer: 1->2->3->4
# Parse event ordering
def parse_ordering(ordering: str) -> List[int]:
    """Convert an arrow-notation ordering (e.g. "1->2->3->4") to a list of ints."""
    return list(map(int, ordering.split("->")))
# Evaluate temporal reasoning
def evaluate_ordering(model, data: List[Dict]) -> Dict[str, float]:
    """Evaluate a model's event-ordering accuracy on the order dataset.

    Args:
        model: Object exposing ``predict_order(video_path, question,
            candidates)`` that returns one of the candidate ordering strings.
        data: List of order-question dicts with at least ``video``,
            ``question``, ``candidates``, and ``answer`` keys.

    Returns:
        Dict with ``overall_acc`` plus one ``"<k>_events"`` accuracy per
        distinct event count. Returns ``{"overall_acc": 0.0}`` for an empty
        dataset instead of raising ``ZeroDivisionError``.
    """
    if not data:
        # Guard: the original division by len(data) would crash on [].
        return {"overall_acc": 0.0}
    correct = 0
    # Per-event-count tallies: {event_count: {"correct": int, "total": int}}.
    results_by_length: Dict[int, Dict[str, int]] = {}
    for item in data:
        video_path = f"videos/{item['video']}"
        # Model prediction: one of the candidate ordering strings.
        predicted_order = model.predict_order(
            video_path,
            item['question'],
            item['candidates']
        )
        is_correct = (predicted_order == item['answer'])
        correct += is_correct
        # Bucket by how many events the answer orders (arrow-separated).
        event_count = len(item['answer'].split("->"))
        stats = results_by_length.setdefault(event_count, {"correct": 0, "total": 0})
        stats["total"] += 1
        stats["correct"] += is_correct
    acc_by_length = {
        f"{length}_events": s["correct"] / s["total"]
        for length, s in results_by_length.items()
    }
    return {"overall_acc": correct / len(data), **acc_by_length}
# Analyze ordering complexity
def analyze_complexity(data: List[Dict]) -> Dict:
    """Tally questions by number of ordered events.

    Args:
        data: List of order-question dicts with an ``answer`` ordering string.

    Returns:
        Dict keyed 3 / 4 / 5 where bucket 3 counts questions with *up to*
        three events, bucket 4 exactly four, and bucket 5 five or more.
        (The original inline comments implied exact counts; only 4 is exact.)
    """
    complexities = {3: 0, 4: 0, 5: 0}
    for item in data:
        # Same parse as parse_ordering, inlined so the snippet stands alone;
        # int() still raises on malformed tokens, matching the original.
        event_count = len([int(tok) for tok in item['answer'].split("->")])
        # Clamp into the 3/4/5 buckets: <=3 -> 3, ==4 -> 4, >=5 -> 5.
        complexities[min(max(event_count, 3), 5)] += 1
    return complexities
# Report how many questions fall into each event-count bucket (<=3, 4, 5+).
complexity = analyze_complexity(data)
print("Complexity distribution:", complexity)
# Check for temporal consistency
def has_reverse_order(candidates: List[str]) -> bool:
    """Return True if any candidate ordering is the exact reverse of another.

    Note: the original docstring claimed this compared against the *correct*
    order; the code actually compares candidates pairwise, which is what is
    documented (and preserved) here.

    Args:
        candidates: Arrow-notation ordering strings, e.g. "1->2->3->4".

    Returns:
        True if some pair of candidates are mirror images of each other.
    """
    # int() parsing kept so malformed tokens still raise, as before.
    seqs = [[int(tok) for tok in c.split("->")] for c in candidates]
    # Reversal is symmetric, so each unordered pair needs checking once.
    return any(
        seqs[i] == seqs[j][::-1]
        for i in range(len(seqs))
        for j in range(i + 1, len(seqs))
    )
# Collect the questions whose candidate set contains a reversed pair.
reverse_examples = [item for item in data if has_reverse_order(item['candidates'])]
print(f"Questions with reverse ordering candidates: {len(reverse_examples)}")
# Extract event descriptions from questions
import re
def extract_events(question: str) -> List[str]:
    """Extract the numbered event descriptions from an ordering question.

    Events appear as "(1)desc; (2)desc; ..." in the question text; each
    description runs until the next semicolon (or end of string).
    """
    matches = re.findall(r'\(\d+\)([^;]+)', question)
    return [match.strip() for match in matches]
# Demo: pull the event descriptions out of the first question's text.
sample_events = extract_events(data[0]['question'])
print(f"Sample events: {sample_events}")