Implementation: FlagOpen FlagEmbedding MLVU Summary Data
| Knowledge Sources | |
|---|---|
| Domains | Video Understanding, Summarization, Text Generation |
| Last Updated | 2026-02-09 00:00 GMT |
Overview
Benchmark dataset for video summarization requiring comprehensive understanding and generation of video content summaries.
Description
The MLVU Summary dataset contains 1521 questions that test models' ability to generate comprehensive summaries of entire videos. Unlike sub-scene descriptions, this task requires understanding the full video narrative arc, identifying key events, and synthesizing information into coherent summaries. Questions ask models to summarize main contents, key points, or provide overviews of videos, typically requiring long-form text generation.
This is one of the most challenging tasks in the MLVU benchmark as it requires:
- Complete video comprehension from start to finish
- Identifying salient events and filtering noise
- Understanding narrative structure and progression
- Long-form text generation capabilities
- Balancing detail with brevity
- Maintaining coherence across extended content
The reference answers provide detailed summaries that capture major plot points, character interactions, setting changes, and key events.
Usage
Use this dataset for evaluating video summarization capabilities, benchmarking long-form video understanding, or training models on comprehensive video-to-text generation tasks.
Code Reference
Source Location
- Repository: FlagOpen_FlagEmbedding
- File: research/MLVU/data/9_summary.json
Data Structure
{
"video": str, # Video filename
"duration": float, # Video duration in seconds
"question": str, # Question asking for summary
"answer": str, # Reference summary text
"question_type": str # Always "summary"
}
Import
import json
# Load summary dataset
# Parse the summary dataset: the code reads it as JSON Lines (one record per line)
with open("research/MLVU/data/9_summary.json", "r") as fh:
    summary_data = [json.loads(record) for record in fh]
I/O Contract
Inputs
| Name | Type | Required | Description |
|---|---|---|---|
| file_path | str | Yes | Path to the summary dataset JSON file |
Outputs
| Field | Type | Description |
|---|---|---|
| video | str | Video filename |
| duration | float | Video duration in seconds |
| question | str | Question requesting video summary |
| answer | str | Reference summary text |
| question_type | str | Type identifier ("summary") |
Usage Examples
import json
from typing import List, Dict
# Load summary dataset
def load_summary_data(file_path: str) -> List[Dict]:
    """Load dataset records from *file_path*, one JSON object per line."""
    with open(file_path, "r") as handle:
        return [json.loads(record) for record in handle]
data = load_summary_data("research/MLVU/data/9_summary.json")
# Peek at one record to sanity-check the schema
example = data[0]
print("Video: " + example["video"])
print("Duration: " + format(example["duration"], ".2f") + "s")
print("Question: " + example["question"])
print("Reference Summary: " + example["answer"][:200] + "...")  # first 200 chars only
# Output:
# Video: 217.mp4
# Duration: 480.00s
# Question: Please summarize this video, including its main content.
# Reference Summary: The video starts with waves lapping against the rocks,
# creating a spray. Then, a boat appears with two men on board,
# one with a hat and the other without. The man without a hat...
# Evaluate summary quality with multiple metrics
def evaluate_summary(model_summary: str, reference_summary: str) -> Dict[str, float]:
    """Score a generated summary against a reference summary.

    Computes simple surface-level metrics; for real evaluation prefer
    ROUGE, BLEU, BERTScore, etc.

    Args:
        model_summary: Summary text produced by the model.
        reference_summary: Ground-truth reference summary.

    Returns:
        Dict with:
            length_ratio: len(model) / len(reference) in characters
                          (0.0 when the reference is empty).
            precision: unigram precision of model words vs reference words.
            recall: unigram recall.
            f1: harmonic mean of precision and recall.
    """
    # Character-length ratio; guard against an empty reference, which
    # previously raised ZeroDivisionError.
    len_ratio = (
        len(model_summary) / len(reference_summary) if reference_summary else 0.0
    )
    # Unigram overlap on lowercased whitespace tokens.
    model_words = set(model_summary.lower().split())
    ref_words = set(reference_summary.lower().split())
    overlap = len(model_words & ref_words)
    precision = overlap / len(model_words) if model_words else 0
    recall = overlap / len(ref_words) if ref_words else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return {
        "length_ratio": len_ratio,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
# Batch evaluation
def evaluate_summarization(model, data: List[Dict]) -> Dict[str, float]:
    """Run the model over the dataset and average the summary metrics.

    Returns overall averages plus per-duration-bucket averages keyed
    "<bucket>_<metric>" (buckets: short < 5 min, medium 5-15 min,
    long > 15 min).
    """
    metric_names = ("length_ratio", "precision", "recall", "f1")
    overall = dict.fromkeys(metric_names, 0)
    # Per-bucket running sums and counts.
    buckets = {
        name: {"metrics": dict.fromkeys(metric_names, 0), "count": 0}
        for name in ("short", "medium", "long")
    }
    for entry in data:
        # Generate a summary for this video, then score it.
        generated = model.summarize_video(
            f"videos/{entry['video']}",
            entry['question'],
        )
        scores = evaluate_summary(generated, entry['answer'])
        for name, score in scores.items():
            overall[name] += score
        # Bucket by video length in seconds.
        secs = entry['duration']
        if secs < 300:
            bucket = "short"
        elif secs < 900:
            bucket = "medium"
        else:
            bucket = "long"
        buckets[bucket]["count"] += 1
        for name, score in scores.items():
            buckets[bucket]["metrics"][name] += score
    # Overall averages across the whole dataset.
    n_items = len(data)
    result = {name: total / n_items for name, total in overall.items()}
    # Per-bucket averages, skipping empty buckets.
    for bucket, stats in buckets.items():
        if stats["count"] > 0:
            for name, total in stats["metrics"].items():
                result[f"{bucket}_{name}"] = total / stats["count"]
    return result
# Analyze summary characteristics
def analyze_summaries(data: List[Dict]) -> Dict:
    """Compute descriptive statistics for the summary dataset.

    Args:
        data: Dataset entries, each with 'question', 'answer' and
            'duration' fields.

    Returns:
        Dict with the entry count, summary-length statistics (in words),
        the distribution of question phrasings, and the mean video
        duration in seconds.

    Raises:
        ValueError: If `data` is empty (the averages would be undefined;
            previously this crashed with ZeroDivisionError / min() on an
            empty sequence).
    """
    from collections import Counter

    if not data:
        raise ValueError("data must contain at least one entry")

    summary_lengths = [len(item["answer"].split()) for item in data]

    def classify(question: str) -> str:
        # Bucket a question by its phrasing; "summarize" takes priority
        # over "main content" / "key points" when phrases co-occur.
        q_lower = question.lower()
        if "summarize" in q_lower:
            return "summarize"
        if "main content" in q_lower:
            return "main_content"
        if "key points" in q_lower:
            return "key_points"
        return "other"

    question_types = Counter(classify(item["question"]) for item in data)

    return {
        "num_videos": len(data),
        "avg_summary_words": sum(summary_lengths) / len(summary_lengths),
        "min_summary_words": min(summary_lengths),
        "max_summary_words": max(summary_lengths),
        "question_types": dict(question_types),
        "avg_video_duration": sum(item["duration"] for item in data) / len(data),
    }
# Report dataset-level statistics
analysis = analyze_summaries(data)
print("Summary dataset analysis: " + str(analysis))
# Sample videos by complexity
def sample_by_complexity(data: List[Dict], n: int = 5) -> Dict[str, List[Dict]]:
    """Sample the n least, middle, and most complex videos.

    Complexity proxy: video duration in seconds plus reference-summary
    length in words.

    Args:
        data: Dataset entries with 'duration' and 'answer' fields.
        n: Number of entries per bucket.

    Returns:
        Dict with 'easiest', 'medium' and 'hardest' lists, each holding
        up to n entries (fewer when the dataset is smaller than n).
    """
    scored_items = []
    for item in data:
        complexity_score = item["duration"] + len(item["answer"].split())
        scored_items.append((complexity_score, item))
    # Sort by score only: the payload dicts are not comparable, so a bare
    # tuple sort would raise on score ties.
    scored_items.sort(key=lambda pair: pair[0])

    # Center the "medium" window on the median. The previous slice
    # [mid - n//2 : mid + n//2] produced only 2*(n//2) items — i.e. n - 1
    # entries whenever n is odd.
    mid_start = max(0, len(scored_items) // 2 - n // 2)
    return {
        "easiest": [item for _, item in scored_items[:n]],
        "medium": [item for _, item in scored_items[mid_start:mid_start + n]],
        "hardest": [item for _, item in scored_items[-n:]],
    }
# Draw three examples from each complexity tier
samples = sample_by_complexity(data, n=3)
tier_sizes = (len(samples["easiest"]), len(samples["medium"]), len(samples["hardest"]))
print("Sampled {} easy, {} medium, {} hard videos".format(*tier_sizes))
# Identify narrative elements in summaries
def analyze_narrative_elements(data: List[Dict]) -> Dict:
    """Measure how often each narrative element appears across summaries.

    Matching is done on whole words: the previous substring test counted
    e.g. "he" inside "the" and "she" inside "washed", inflating every
    category.

    Args:
        data: Dataset entries, each with an 'answer' reference summary.

    Returns:
        Dict mapping each narrative element to the fraction (0.0-1.0) of
        summaries that mention at least one of its keywords. All zeros
        for an empty dataset.
    """
    import re

    narrative_keywords = {
        "character": ["man", "woman", "person", "character", "he", "she"],
        "action": ["starts", "begins", "moves", "walks", "runs", "appears"],
        "setting": ["scene", "room", "outside", "beach", "house", "field"],
        "transition": ["then", "next", "after", "subsequently", "later"],
        "ending": ["finally", "ends", "concludes", "ultimately"],
    }
    element_counts = {key: 0 for key in narrative_keywords}
    if not data:
        # No summaries: return zero rates rather than dividing by zero.
        return {elem: 0.0 for elem in element_counts}

    word_pattern = re.compile(r"[a-z]+")
    for item in data:
        # Whole-word membership test so short keywords can't match
        # inside longer words.
        words = set(word_pattern.findall(item["answer"].lower()))
        for element, keywords in narrative_keywords.items():
            if any(kw in words for kw in keywords):
                element_counts[element] += 1
    return {elem: count / len(data) for elem, count in element_counts.items()}
# Check which narrative elements the reference summaries cover
narrative_analysis = analyze_narrative_elements(data)
print("Narrative element presence: " + str(narrative_analysis))