Implementation: FlagOpen FlagEmbedding MLVU Summary Data
| Knowledge Sources | |
|---|---|
| Domains | Video Understanding, Summarization, Text Generation |
| Last Updated | 2026-02-09 00:00 GMT |
Overview
Benchmark dataset for video summarization requiring comprehensive understanding and generation of video content summaries.
Description
The MLVU Summary dataset contains 1521 questions that test models' ability to generate comprehensive summaries of entire videos. Unlike sub-scene descriptions, this task requires understanding the full video narrative arc, identifying key events, and synthesizing information into coherent summaries. Questions ask models to summarize main contents, key points, or provide overviews of videos, typically requiring long-form text generation.
This is one of the most challenging tasks in the MLVU benchmark as it requires:
- Complete video comprehension from start to finish
- Identifying salient events and filtering noise
- Understanding narrative structure and progression
- Long-form text generation capabilities
- Balancing detail with brevity
- Maintaining coherence across extended content
The reference answers provide detailed summaries that capture major plot points, character interactions, setting changes, and key events.
Usage
Use this dataset for evaluating video summarization capabilities, benchmarking long-form video understanding, or training models on comprehensive video-to-text generation tasks.
Code Reference
Source Location
- Repository: FlagOpen_FlagEmbedding
- File: research/MLVU/data/9_summary.json
Data Structure
{
"video": str, # Video filename
"duration": float, # Video duration in seconds
"question": str, # Question asking for summary
"answer": str, # Reference summary text
"question_type": str # Always "summary"
}
Import
import json
# Load summary dataset
# Parse the summary dataset: the code reads it as JSON Lines (one record per line)
with open("research/MLVU/data/9_summary.json", "r") as fh:
    summary_data = [json.loads(record) for record in fh]
I/O Contract
Inputs
| Name | Type | Required | Description |
|---|---|---|---|
| file_path | str | Yes | Path to the summary dataset JSON file |
Outputs
| Field | Type | Description |
|---|---|---|
| video | str | Video filename |
| duration | float | Video duration in seconds |
| question | str | Question requesting video summary |
| answer | str | Reference summary text |
| question_type | str | Type identifier ("summary") |
Usage Examples
import json
from typing import List, Dict
# Load summary dataset
def load_summary_data(file_path: str) -> List[Dict]:
    """Load dataset records from *file_path*, one JSON object per line."""
    with open(file_path, "r") as handle:
        return [json.loads(record) for record in handle]
data = load_summary_data("research/MLVU/data/9_summary.json")
# Peek at one record to sanity-check the schema
example = data[0]
print("Video: " + example["video"])
print("Duration: " + format(example["duration"], ".2f") + "s")
print("Question: " + example["question"])
print("Reference Summary: " + example["answer"][:200] + "...")  # first 200 chars only
# Output:
# Video: 217.mp4
# Duration: 480.00s
# Question: Please summarize this video, including its main content.
# Reference Summary: The video starts with waves lapping against the rocks,
# creating a spray. Then, a boat appears with two men on board,
# one with a hat and the other without. The man without a hat...
# Evaluate summary quality with multiple metrics
def evaluate_summary(model_summary: str, reference_summary: str) -> Dict[str, float]:
    """Score a generated summary against a reference summary.

    Computes simple surface-level metrics; for real evaluation prefer
    ROUGE, BLEU, BERTScore, etc.

    Args:
        model_summary: Summary text produced by the model.
        reference_summary: Ground-truth reference summary.

    Returns:
        Dict with:
            length_ratio: len(model) / len(reference) in characters
                          (0.0 when the reference is empty).
            precision: unigram precision of model words vs reference words.
            recall: unigram recall.
            f1: harmonic mean of precision and recall.
    """
    # Character-length ratio; guard against an empty reference, which
    # previously raised ZeroDivisionError.
    len_ratio = (
        len(model_summary) / len(reference_summary) if reference_summary else 0.0
    )
    # Unigram overlap on lowercased whitespace tokens.
    model_words = set(model_summary.lower().split())
    ref_words = set(reference_summary.lower().split())
    overlap = len(model_words & ref_words)
    precision = overlap / len(model_words) if model_words else 0
    recall = overlap / len(ref_words) if ref_words else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return {
        "length_ratio": len_ratio,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
# Batch evaluation
def evaluate_summarization(model, data: List[Dict]) -> Dict[str, float]:
    """Run the model over the dataset and average the summary metrics.

    Returns overall averages plus per-duration-bucket averages keyed
    "<bucket>_<metric>" (buckets: short < 5 min, medium 5-15 min,
    long > 15 min).
    """
    metric_names = ("length_ratio", "precision", "recall", "f1")
    overall = dict.fromkeys(metric_names, 0)
    # Per-bucket running sums and counts.
    buckets = {
        name: {"metrics": dict.fromkeys(metric_names, 0), "count": 0}
        for name in ("short", "medium", "long")
    }
    for entry in data:
        # Generate a summary for this video, then score it.
        generated = model.summarize_video(
            f"videos/{entry['video']}",
            entry['question'],
        )
        scores = evaluate_summary(generated, entry['answer'])
        for name, score in scores.items():
            overall[name] += score
        # Bucket by video length in seconds.
        secs = entry['duration']
        if secs < 300:
            bucket = "short"
        elif secs < 900:
            bucket = "medium"
        else:
            bucket = "long"
        buckets[bucket]["count"] += 1
        for name, score in scores.items():
            buckets[bucket]["metrics"][name] += score
    # Overall averages across the whole dataset.
    n_items = len(data)
    result = {name: total / n_items for name, total in overall.items()}
    # Per-bucket averages, skipping empty buckets.
    for bucket, stats in buckets.items():
        if stats["count"] > 0:
            for name, total in stats["metrics"].items():
                result[f"{bucket}_{name}"] = total / stats["count"]
    return result
# Analyze summary characteristics
def analyze_summaries(data: List[Dict]) -> Dict:
    """Compute descriptive statistics for the summary dataset.

    Args:
        data: Dataset entries, each with 'question', 'answer' and
            'duration' fields.

    Returns:
        Dict with the entry count, summary-length statistics (in words),
        the distribution of question phrasings, and the mean video
        duration in seconds.

    Raises:
        ValueError: If `data` is empty (the averages would be undefined;
            previously this crashed with ZeroDivisionError / min() on an
            empty sequence).
    """
    from collections import Counter

    if not data:
        raise ValueError("data must contain at least one entry")

    summary_lengths = [len(item["answer"].split()) for item in data]

    def classify(question: str) -> str:
        # Bucket a question by its phrasing; "summarize" takes priority
        # over "main content" / "key points" when phrases co-occur.
        q_lower = question.lower()
        if "summarize" in q_lower:
            return "summarize"
        if "main content" in q_lower:
            return "main_content"
        if "key points" in q_lower:
            return "key_points"
        return "other"

    question_types = Counter(classify(item["question"]) for item in data)

    return {
        "num_videos": len(data),
        "avg_summary_words": sum(summary_lengths) / len(summary_lengths),
        "min_summary_words": min(summary_lengths),
        "max_summary_words": max(summary_lengths),
        "question_types": dict(question_types),
        "avg_video_duration": sum(item["duration"] for item in data) / len(data),
    }
# Report dataset-level statistics
analysis = analyze_summaries(data)
print("Summary dataset analysis: " + str(analysis))
# Sample videos by complexity
def sample_by_complexity(data: List[Dict], n: int = 5) -> Dict[str, List[Dict]]:
    """Sample the n least, middle, and most complex videos.

    Complexity proxy: video duration in seconds plus reference-summary
    length in words.

    Args:
        data: Dataset entries with 'duration' and 'answer' fields.
        n: Number of entries per bucket.

    Returns:
        Dict with 'easiest', 'medium' and 'hardest' lists, each holding
        up to n entries (fewer when the dataset is smaller than n).
    """
    scored_items = []
    for item in data:
        complexity_score = item["duration"] + len(item["answer"].split())
        scored_items.append((complexity_score, item))
    # Sort by score only: the payload dicts are not comparable, so a bare
    # tuple sort would raise on score ties.
    scored_items.sort(key=lambda pair: pair[0])

    # Center the "medium" window on the median. The previous slice
    # [mid - n//2 : mid + n//2] produced only 2*(n//2) items — i.e. n - 1
    # entries whenever n is odd.
    mid_start = max(0, len(scored_items) // 2 - n // 2)
    return {
        "easiest": [item for _, item in scored_items[:n]],
        "medium": [item for _, item in scored_items[mid_start:mid_start + n]],
        "hardest": [item for _, item in scored_items[-n:]],
    }
# Draw three examples from each complexity tier
samples = sample_by_complexity(data, n=3)
tier_sizes = (len(samples["easiest"]), len(samples["medium"]), len(samples["hardest"]))
print("Sampled {} easy, {} medium, {} hard videos".format(*tier_sizes))
# Identify narrative elements in summaries
def analyze_narrative_elements(data: List[Dict]) -> Dict:
    """Measure how often each narrative element appears across summaries.

    Matching is done on whole words: the previous substring test counted
    e.g. "he" inside "the" and "she" inside "washed", inflating every
    category.

    Args:
        data: Dataset entries, each with an 'answer' reference summary.

    Returns:
        Dict mapping each narrative element to the fraction (0.0-1.0) of
        summaries that mention at least one of its keywords. All zeros
        for an empty dataset.
    """
    import re

    narrative_keywords = {
        "character": ["man", "woman", "person", "character", "he", "she"],
        "action": ["starts", "begins", "moves", "walks", "runs", "appears"],
        "setting": ["scene", "room", "outside", "beach", "house", "field"],
        "transition": ["then", "next", "after", "subsequently", "later"],
        "ending": ["finally", "ends", "concludes", "ultimately"],
    }
    element_counts = {key: 0 for key in narrative_keywords}
    if not data:
        # No summaries: return zero rates rather than dividing by zero.
        return {elem: 0.0 for elem in element_counts}

    word_pattern = re.compile(r"[a-z]+")
    for item in data:
        # Whole-word membership test so short keywords can't match
        # inside longer words.
        words = set(word_pattern.findall(item["answer"].lower()))
        for element, keywords in narrative_keywords.items():
            if any(kw in words for kw in keywords):
                element_counts[element] += 1
    return {elem: count / len(data) for elem, count in element_counts.items()}
# Check which narrative elements the reference summaries cover
narrative_analysis = analyze_narrative_elements(data)
print("Narrative element presence: " + str(narrative_analysis))