Implementation: FlagOpen FlagEmbedding MLVU Count Data
| Knowledge Sources | |
|---|---|
| Domains | Video Understanding, Benchmark Data, Counting Tasks |
| Last Updated | 2026-02-09 00:00 GMT |
Overview
Benchmark dataset for counting occurrences of actions or events in long videos.
Description
The MLVU Count dataset contains 2680 questions that test models' ability to count the number of times specific actions, scenes, or events occur throughout a video. This requires comprehensive video understanding to track and count instances across the entire video duration. Questions typically ask "how many times" or "how many instances" of a particular action or scene appear, testing both recognition and counting capabilities.
This dataset is challenging because it requires:
- Complete video comprehension (cannot skip frames)
- Accurate action/scene recognition
- Maintaining count across video duration
- Distinguishing between similar but distinct occurrences
Usage
Use this dataset for evaluating temporal counting capabilities in video understanding models, benchmarking action recognition with frequency analysis, or training models on video-based counting tasks.
Code Reference
Source Location
- Repository: FlagOpen_FlagEmbedding
- File: research/MLVU/data/4_count.json
Data Structure
{
"video": str, # Video filename
"duration": float, # Video duration in seconds
"question": str, # Question about counting occurrences
"candidates": List[str], # Four candidate counts (as strings)
"answer": str, # Correct count
"question_type": str # Always "count"
}
Import
import json
# Load count dataset.
# NOTE(review): the original snippet parsed the .json file as JSON Lines
# (one object per line); a standard JSON array would crash it. Accept both
# formats — TODO confirm the actual on-disk layout of 4_count.json.
with open("research/MLVU/data/4_count.json", "r", encoding="utf-8") as f:
    _raw = f.read()
try:
    _parsed = json.loads(_raw)
    count_data = _parsed if isinstance(_parsed, list) else [_parsed]
except json.JSONDecodeError:
    # Fall back to JSON Lines; skip blank lines so trailing newlines don't crash.
    count_data = [json.loads(line) for line in _raw.splitlines() if line.strip()]
I/O Contract
Inputs
| Name | Type | Required | Description |
|---|---|---|---|
| file_path | str | Yes | Path to the count dataset JSON file |
Outputs
| Field | Type | Description |
|---|---|---|
| video | str | Video filename |
| duration | float | Video duration in seconds |
| question | str | Question about counting |
| candidates | List[str] | Four possible counts |
| answer | str | Correct count |
| question_type | str | Type identifier ("count") |
Usage Examples
import json
from typing import List, Dict
# Load count dataset
def load_count_data(file_path: str) -> List[Dict]:
    """Load the MLVU count dataset from *file_path*.

    Accepts either a standard JSON array or JSON Lines (one object per
    line). The original version assumed JSONL only, which crashes with
    ``JSONDecodeError`` on a plain JSON array — likely for a ``.json``
    file — and on blank lines. TODO confirm the on-disk format.

    Args:
        file_path: Path to the count dataset JSON file.

    Returns:
        List of question records (see the Data Structure section).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        raw = f.read()
    try:
        parsed = json.loads(raw)
        # A whole-file parse may yield a single object; normalize to a list.
        return parsed if isinstance(parsed, list) else [parsed]
    except json.JSONDecodeError:
        # Fall back to JSON Lines; ignore blank lines.
        return [json.loads(line) for line in raw.splitlines() if line.strip()]
data = load_count_data("research/MLVU/data/4_count.json")
# Example entry: render the first record field by field.
example = data[0]
report = [
    f"Video: {example['video']}",
    f"Duration: {example['duration']:.2f}s",
    f"Question: {example['question']}",
    f"Candidates: {example['candidates']}",
    f"Answer: {example['answer']}",
]
print("\n".join(report))
# Output:
# Video: count_126.mp4
# Duration: 572.86s
# Question: Throughout this video, what is the total count of
# occurrences for the scene featuring the 'playing trombone' action
# Candidates: ['2', '1', '5', '4']
# Answer: 1
# Evaluate counting accuracy
def evaluate_counting(model, data: List[Dict]) -> Dict[str, float]:
    """Score a counting model on the MLVU count benchmark.

    Args:
        model: Object exposing ``count_occurrences(video_path, question,
            candidates)`` and returning one of the candidate strings.
        data: List of count-question records (see the Data Structure
            section).

    Returns:
        Dict with ``overall_acc`` plus accuracy broken down by the
        ground-truth count range: low (1-2), mid (3-5), high (6+).
        Empty buckets — and an empty dataset — report 0 instead of
        raising ``ZeroDivisionError`` (the original crashed on empty
        input).
    """
    if not data:
        # Guard: nothing to score; avoid dividing by len(data) == 0.
        return {"overall_acc": 0.0, "low_count_acc": 0.0,
                "mid_count_acc": 0.0, "high_count_acc": 0.0}

    correct = 0
    # Track accuracy by count range.
    low_count_correct = 0   # ground-truth count 1-2
    mid_count_correct = 0   # ground-truth count 3-5
    high_count_correct = 0  # ground-truth count 6+
    low_total = mid_total = high_total = 0

    for item in data:
        video_path = f"videos/{item['video']}"
        # Model prediction: expected to be one of item['candidates'].
        predicted_count = model.count_occurrences(
            video_path,
            item['question'],
            item['candidates']
        )
        is_correct = int(predicted_count == item['answer'])
        correct += is_correct

        # Categorize by the ground-truth count.
        actual_count = int(item['answer'])
        if actual_count <= 2:
            low_total += 1
            low_count_correct += is_correct
        elif actual_count <= 5:
            mid_total += 1
            mid_count_correct += is_correct
        else:
            high_total += 1
            high_count_correct += is_correct

    return {
        "overall_acc": correct / len(data),
        "low_count_acc": low_count_correct / low_total if low_total > 0 else 0,
        "mid_count_acc": mid_count_correct / mid_total if mid_total > 0 else 0,
        "high_count_acc": high_count_correct / high_total if high_total > 0 else 0
    }
# Analyze counting distribution
def analyze_counts(data: List[Dict]) -> Dict:
    """Tally how many questions share each ground-truth count.

    Returns a dict mapping the answer string to its frequency, with keys
    ordered by their numeric value.
    """
    tally: Dict[str, int] = {}
    for entry in data:
        answer = entry['answer']
        tally[answer] = tally.get(answer, 0) + 1
    # Rebuild in numeric key order (dicts preserve insertion order).
    return {key: tally[key] for key in sorted(tally, key=int)}
# Summarize how often each ground-truth count occurs in the dataset.
distribution = analyze_counts(data)
print("Count distribution:", distribution)
# Extract action types from questions
def extract_actions(data: List[Dict]) -> List[str]:
    """Collect the unique quoted action phrases from the questions.

    Only the first '...'-quoted span in each question is taken, matching
    the dataset's "the 'action'" phrasing; questions without a complete
    quoted span are skipped.

    Args:
        data: List of count-question records.

    Returns:
        Sorted list of unique action strings. The original returned
        ``list(set(...))``, whose order is hash-randomized across runs,
        making the printed samples unstable; sorting makes the output
        deterministic.
    """
    actions = set()
    for item in data:
        question = item['question']
        # find() returns -1 when absent, so start == 0 means "no quote".
        start = question.find("'") + 1
        if start:
            end = question.find("'", start)
            if end > start:
                actions.add(question[start:end])
    return sorted(actions)
unique_actions = extract_actions(data)
# Report how many distinct actions appear, with a small sample.
summary = (
    f"Number of unique actions: {len(unique_actions)}",
    f"Sample actions: {unique_actions[:10]}",
)
for line in summary:
    print(line)
# Filter by video duration and count complexity
complex_samples = []
for item in data:
    # Keep long videos (>10 min) whose ground-truth count exceeds 3.
    if item['duration'] > 600 and int(item['answer']) > 3:
        complex_samples.append(item)
print(f"Complex samples (long video + high count): {len(complex_samples)}")