Implementation: FlagOpen FlagEmbedding MLVU Topic Reasoning Data
| Knowledge Sources | |
|---|---|
| Domains | Video Understanding, Reasoning, Topic Classification |
| Last Updated | 2026-02-09 00:00 GMT |
Overview
Benchmark dataset for topic identification and reasoning about video content themes and subjects.
Description
The MLVU Topic Reasoning dataset contains 3434 questions that test models' ability to identify and reason about high-level topics, themes, and subjects in videos. Questions focus on understanding the overall context, genre, setting, and main subjects of videos rather than specific details. This requires holistic video comprehension and the ability to abstract from specific events to general themes.
Questions typically ask about:
- Main background or setting of videos
- Genre or film type classification
- Primary subjects or themes
- Overall context and environment
- Color schemes and visual styles
This tests higher-level semantic understanding beyond simple object or action recognition.
Usage
Use this dataset for evaluating semantic video understanding, benchmarking genre classification capabilities, or training models on high-level video comprehension tasks.
Code Reference
Source Location
- Repository: FlagOpen_FlagEmbedding
- File: research/MLVU/data/7_topic_reasoning.json
Data Structure
{
"video": str, # Video filename
"duration": int, # Video duration in seconds
"question": str, # Question about topic/theme
"candidates": List[str], # Four candidate answers
"answer": str, # Correct answer
"question_type": str # Always "topic_reasoning"
}
Import
import json

# Load topic reasoning dataset.
# NOTE(review): the file has a .json extension, so it may be a single JSON
# array rather than JSON Lines. Parse the whole document first and fall
# back to line-by-line parsing (blank lines ignored) if that fails.
with open("research/MLVU/data/7_topic_reasoning.json", "r") as f:
    _raw = f.read()
try:
    topic_data = json.loads(_raw)
except json.JSONDecodeError:
    topic_data = [json.loads(line) for line in _raw.splitlines() if line.strip()]
I/O Contract
Inputs
| Name | Type | Required | Description |
|---|---|---|---|
| file_path | str | Yes | Path to the topic reasoning dataset JSON file |
Outputs
| Field | Type | Description |
|---|---|---|
| video | str | Video filename |
| duration | int | Video duration in seconds |
| question | str | Question about video topic/theme |
| candidates | List[str] | Four possible answers |
| answer | str | Correct answer |
| question_type | str | Type identifier ("topic_reasoning") |
Usage Examples
import json
from typing import List, Dict
from collections import Counter
# Load topic reasoning dataset
def load_topic_data(file_path: str) -> List[Dict]:
    """Load the topic reasoning dataset from disk.

    Supports both layouts the file may use: a single JSON array
    (standard for a ``.json`` file) or JSON Lines (one object per
    line). Blank lines are ignored in the JSONL case.

    Args:
        file_path: Path to the topic reasoning dataset JSON file.

    Returns:
        List of question entries (dicts with video / duration /
        question / candidates / answer / question_type fields).
    """
    with open(file_path, "r") as f:
        text = f.read()
    try:
        parsed = json.loads(text)
        # Whole-file JSON document: normally a list of entries;
        # wrap a lone object so callers always get a list.
        return parsed if isinstance(parsed, list) else [parsed]
    except json.JSONDecodeError:
        # Fall back to JSON Lines: one object per non-blank line.
        return [json.loads(line) for line in text.splitlines() if line.strip()]
# Load the dataset and preview its first record.
data = load_topic_data("research/MLVU/data/7_topic_reasoning.json")

sample = data[0]
for line in (
    f"Video: {sample['video']}",
    f"Duration: {sample['duration']}s",
    f"Question: {sample['question']}",
    f"Candidates: {sample['candidates']}",
    f"Answer: {sample['answer']}",
):
    print(line)
# Output:
# Video: AWA-6.mp4
# Duration: 450s
# Question: What is the main background of the video?
# Candidates: ['Grassland', 'Lake', 'Ocean', 'Desert']
# Answer: Grassland
# Categorize question types
def categorize_questions(data: List[Dict]) -> Dict[str, List[Dict]]:
    """Bucket dataset entries by keywords in their question text.

    Matching is case-insensitive and first-match-wins, checked in
    this order: background, film_type, color, theme. Entries that
    match no rule land in "other".
    """
    rules = [
        ("background", ("background",)),
        ("film_type", ("film", "type")),
        ("color", ("color",)),
        ("theme", ("theme", "main")),
    ]
    buckets: Dict[str, List[Dict]] = {name: [] for name, _ in rules}
    buckets["other"] = []
    for entry in data:
        text = entry['question'].lower()
        for name, keywords in rules:
            if any(kw in text for kw in keywords):
                buckets[name].append(entry)
                break
        else:
            buckets["other"].append(entry)
    return buckets
# Report how many questions fell into each bucket.
categories = categorize_questions(data)
print("Question categories:")
for bucket_name in categories:
    print(f" {bucket_name}: {len(categories[bucket_name])}")
# Evaluate topic reasoning
def evaluate_topic_reasoning(model, data: List[Dict]) -> Dict[str, float]:
    """Score a model on the topic reasoning benchmark.

    For each entry, asks the model to pick an answer from the
    candidates and compares it (exact string match) against the
    ground-truth answer.

    Args:
        model: Object exposing ``identify_topic(video_path, question,
            candidates)`` that returns one of the candidate strings.
        data: Dataset entries as loaded from the JSON file.

    Returns:
        Dict with "overall_acc" plus one "<category>_acc" key per
        question category observed (background / film_type / color /
        general). Returns ``{"overall_acc": 0.0}`` for an empty
        dataset instead of raising ZeroDivisionError.
    """
    def _category(question: str) -> str:
        # Mirrors the first-match-wins keyword buckets used elsewhere
        # in this page; unmatched questions count as "general".
        q = question.lower()
        if "background" in q:
            return "background"
        if "film" in q or "type" in q:
            return "film_type"
        if "color" in q:
            return "color"
        return "general"

    total = len(data)
    if total == 0:
        # Guard: the original divided by len(data) unconditionally.
        return {"overall_acc": 0.0}

    correct = 0
    # Track accuracy by question category.
    category_results: Dict[str, Dict[str, int]] = {}
    for item in data:
        # NOTE(review): assumes videos live under a local "videos/"
        # directory relative to the working directory — confirm.
        video_path = f"videos/{item['video']}"
        predicted = model.identify_topic(
            video_path,
            item['question'],
            item['candidates']
        )
        is_correct = (predicted == item['answer'])
        correct += is_correct

        stats = category_results.setdefault(
            _category(item['question']), {"correct": 0, "total": 0}
        )
        stats["total"] += 1
        stats["correct"] += is_correct

    # Per-category accuracy alongside the overall score.
    category_acc = {
        f"{cat}_acc": stats["correct"] / stats["total"]
        for cat, stats in category_results.items()
    }
    return {
        "overall_acc": correct / total,
        **category_acc
    }
# Analyze answer distributions
def analyze_answers(data: List[Dict]) -> Dict:
    """Summarize the distribution of correct answers.

    Reports the overall answer vocabulary plus per-keyword answer
    breakdowns for film-type and background questions.
    """
    def _matches(entry: Dict, *keywords: str) -> bool:
        text = entry['question'].lower()
        return any(kw in text for kw in keywords)

    overall = Counter(entry['answer'] for entry in data)
    film_counts = Counter(
        entry['answer'] for entry in data if _matches(entry, "film", "type")
    )
    background_counts = Counter(
        entry['answer'] for entry in data if _matches(entry, "background")
    )
    return {
        "total_unique_answers": len(overall),
        "most_common_answers": overall.most_common(10),
        "film_types": dict(film_counts),
        "background_types": dict(background_counts),
    }
# Summarize answer distributions for the whole dataset.
analysis = analyze_answers(data)
print("Answer analysis:", analysis)

# Filter by video source. An entry whose filename contains both
# "movie" and "tv" appears in both of those lists.
def _has(entry, token):
    return token in entry['video'].lower()

movie_videos = [e for e in data if _has(e, "movie")]
tv_videos = [e for e in data if _has(e, "tv")]
other_videos = [e for e in data if not _has(e, "movie") and not _has(e, "tv")]
print(f"Movie videos: {len(movie_videos)}")
print(f"TV videos: {len(tv_videos)}")
print(f"Other videos: {len(other_videos)}")