Implementation: FlagOpen FlagEmbedding MLVU Sub Scene Data
| Knowledge Sources | Details |
|---|---|
| Domains | Video Understanding, Scene Description, Question Answering |
| Last Updated | 2026-02-09 00:00 GMT |
Overview
Benchmark dataset for detailed scene description and understanding of sub-scenes within videos.
Description
The MLVU Sub Scene dataset contains 2488 questions with open-ended answers that test models' ability to provide detailed descriptions of specific scenes or sub-scenes within videos. Unlike multiple-choice questions, this dataset requires generating free-form text descriptions that capture the nuances of particular moments or sequences. Questions typically ask "Please describe..." specific actions, reactions, or scenes, and include scoring points that define the key elements that should be mentioned in complete answers.
This dataset is particularly challenging because it requires:
- Fine-grained scene understanding
- Detailed description generation
- Identifying multiple relevant elements in a scene
- Natural language generation capabilities
- Understanding context and causal relationships
Each entry includes scoring points that serve as evaluation criteria for the quality of generated descriptions.
Usage
Use this dataset for evaluating video captioning and description generation, benchmarking scene understanding with free-form responses, or training models on detailed video narration tasks.
Code Reference
Source Location
- Repository: FlagOpen_FlagEmbedding
- File: research/MLVU/data/8_sub_scene.json
Data Structure
{
"video": str, # Video filename
"duration": float, # Video duration in seconds
"question": str, # Question asking for scene description
"answer": str, # Reference answer description
"question_type": str, # Always "subPlot"
"scoring_points": List[str] # Key elements that should be in answer
}
Import
import json
# Load sub scene dataset
# NOTE(review): despite the .json extension, the file is parsed as JSON Lines
# (one JSON object per line) rather than with json.load() — confirm the
# on-disk format is actually line-delimited.
with open("research/MLVU/data/8_sub_scene.json", "r") as f:
    subscene_data = [json.loads(line) for line in f]
I/O Contract
Inputs
| Name | Type | Required | Description |
|---|---|---|---|
| file_path | str | Yes | Path to the sub scene dataset JSON file |
Outputs
| Field | Type | Description |
|---|---|---|
| video | str | Video filename |
| duration | float | Video duration in seconds |
| question | str | Question requesting scene description |
| answer | str | Reference description |
| question_type | str | Type identifier ("subPlot") |
| scoring_points | List[str] | Key elements for evaluation |
Usage Examples
import json
from typing import List, Dict
# Load sub scene dataset
def load_subscene_data(file_path: str) -> List[Dict]:
    """Read the sub-scene dataset: one JSON object per line (JSON Lines)."""
    with open(file_path, "r") as fh:
        records = [json.loads(row) for row in fh]
    return records
# Load the full dataset and print the fields of the first entry as a sanity
# check; the commented block below shows the expected console output.
data = load_subscene_data("research/MLVU/data/8_sub_scene.json")
# Example entry
example = data[0]
print(f"Video: {example['video']}")
print(f"Duration: {example['duration']:.2f}s")
print(f"Question: {example['question']}")
print(f"Reference Answer: {example['answer']}")
print(f"Scoring Points: {example['scoring_points']}")
# Output:
# Video: subPlot_new_all_126.mp4
# Duration: 5632.83s
# Question: Please describe the scene when the man in the green plaid shirt,
#           wearing sunglasses, leads the football players into the golf course.
# Reference Answer: The man in the green plaid shirt, wearing sunglasses,
#                   leads the football players into the golf course with a swagger.
#                   A man in a suit quickly runs a few steps to the three people
#                   on the field to introduce the man in the green plaid shirt.
# Scoring Points: ['The man leads the football players with a swagger',
#                  'A man in a suit runs to the three people on the field',
#                  'The man in the suit introduces the man in the green plaid shirt']
# Evaluate scene description with scoring points
def evaluate_description(model_answer: str, scoring_points: List[str]) -> float:
    """Score an answer by the fraction of scoring points it covers.

    A scoring point counts as covered when at least 60% of its
    (lower-cased, whitespace-split) words occur as substrings of the
    lower-cased answer. Substring matching is deliberately simple and
    could be replaced with semantic similarity.
    """
    if not scoring_points:
        return 0
    answer_text = model_answer.lower()
    covered = 0
    for criterion in scoring_points:
        words = criterion.lower().split()
        hits = sum(1 for w in words if w in answer_text)
        # 60% of the criterion's words must appear in the answer.
        if hits >= len(words) * 0.6:
            covered += 1
    return covered / len(scoring_points)
# Batch evaluation
def evaluate_scene_descriptions(model, data: List[Dict]) -> Dict[str, float]:
    """Evaluate a model's scene descriptions over the whole dataset.

    For each entry, calls ``model.describe_scene(video_path, question)``,
    scores the generated answer against the entry's scoring points with
    ``evaluate_description``, and aggregates an overall average plus
    per-complexity averages (bucketed by the number of scoring points).

    Args:
        model: Object exposing ``describe_scene(video_path, question) -> str``.
        data: Dataset entries with 'video', 'question' and 'scoring_points'.

    Returns:
        Dict with "average_score" and a "<bucket>_score" key for each
        non-empty bucket among "simple" (1-2 points), "medium" (3-4)
        and "complex" (5+).
    """
    # Guard: an empty dataset would otherwise divide by zero below.
    if not data:
        return {"average_score": 0.0}
    total_score = 0.0
    # Track scores by number of scoring points per question.
    by_complexity = {
        "simple": {"score": 0, "count": 0},   # 1-2 points
        "medium": {"score": 0, "count": 0},   # 3-4 points
        "complex": {"score": 0, "count": 0},  # 5+ points
    }
    for item in data:
        video_path = f"videos/{item['video']}"
        # Generate description for this question.
        model_answer = model.describe_scene(
            video_path,
            item['question']
        )
        # Score against the reference scoring points.
        score = evaluate_description(model_answer, item['scoring_points'])
        total_score += score
        # Categorize by complexity (number of scoring points).
        num_points = len(item['scoring_points'])
        if num_points <= 2:
            complexity = "simple"
        elif num_points <= 4:
            complexity = "medium"
        else:
            complexity = "complex"
        by_complexity[complexity]["score"] += score
        by_complexity[complexity]["count"] += 1
    avg_score = total_score / len(data)
    # Per-complexity averages, skipping empty buckets.
    complexity_scores = {
        f"{comp}_score": stats["score"] / stats["count"]
        for comp, stats in by_complexity.items()
        if stats["count"] > 0
    }
    return {
        "average_score": avg_score,
        **complexity_scores
    }
# Analyze scoring point distribution
def analyze_scoring_points(data: List[Dict]) -> Dict:
    """Summarize the distribution of scoring-point counts in the dataset.

    Args:
        data: Dataset entries, each with a 'scoring_points' list.

    Returns:
        Dict with min/max/average number of scoring points and a
        bucketed distribution ("1-2", "3-4", "5+").
    """
    # Guard: min()/max() raise on an empty sequence and the average
    # would divide by zero, so return an all-zero summary instead.
    if not data:
        return {
            "min_points": 0,
            "max_points": 0,
            "avg_points": 0.0,
            "distribution": {"1-2": 0, "3-4": 0, "5+": 0},
        }
    point_counts = [len(item['scoring_points']) for item in data]
    return {
        "min_points": min(point_counts),
        "max_points": max(point_counts),
        "avg_points": sum(point_counts) / len(point_counts),
        "distribution": {
            "1-2": sum(1 for c in point_counts if c <= 2),
            "3-4": sum(1 for c in point_counts if 3 <= c <= 4),
            "5+": sum(1 for c in point_counts if c >= 5)
        }
    }
# Report how many scoring points the dataset's entries carry.
analysis = analyze_scoring_points(data)
print("Scoring points analysis:", analysis)
# Extract common action types from questions
def extract_action_types(data: List[Dict]) -> Dict[str, int]:
    """Count common action keywords appearing in the dataset questions.

    Note: matching is plain substring search, so e.g. a question
    containing "reaction" also matches "action".

    Args:
        data: Dataset entries, each with a 'question' string.

    Returns:
        Dict of the 10 most common matched keywords -> occurrence count.
        (The original annotation claimed List[str] but a dict was returned.)
    """
    from collections import Counter

    # Common action verbs; hoisted out of the per-item loop (the list is
    # loop-invariant and was rebuilt on every iteration).
    verbs = ['describe', 'leads', 'saved', 'reaction', 'action',
             'pressed', 'threw', 'picked', 'walked']
    action_keywords = []
    for item in data:
        question = item['question'].lower()
        for verb in verbs:
            if verb in question:
                action_keywords.append(verb)
    return dict(Counter(action_keywords).most_common(10))
# Tally the most frequent action verbs appearing in the questions.
action_types = extract_action_types(data)
print("Common action types in questions:", action_types)

# Partition entries by duration — longer source videos are typically harder.
long_videos = []    # duration above 3000 s (~50 min)
short_videos = []   # duration below 1000 s (~16.7 min)
for entry in data:
    seconds = entry['duration']
    if seconds > 3000:
        long_videos.append(entry)
    elif seconds < 1000:
        short_videos.append(entry)
print(f"Long videos (>50min): {len(long_videos)}")
print(f"Short videos (<17min): {len(short_videos)}")