Jump to content

Connect Leeroopedia MCP: Equip your AI agents to search best practices, build plans, verify code, diagnose failures, and look up hyperparameter defaults.

Implementation:FlagOpen FlagEmbedding MLVU Count Data

From Leeroopedia


Knowledge Sources
Domains Video Understanding, Benchmark Data, Counting Tasks
Last Updated 2026-02-09 00:00 GMT

Overview

Benchmark dataset for counting occurrences of actions or events in long videos.

Description

The MLVU Count dataset contains 2680 questions that test models' ability to count the number of times specific actions, scenes, or events occur throughout a video. This requires comprehensive video understanding to track and count instances across the entire video duration. Questions typically ask "how many times" or "how many instances" of a particular action or scene appear, testing both recognition and counting capabilities.

This dataset is challenging because it requires:

  • Complete video comprehension (cannot skip frames)
  • Accurate action/scene recognition
  • Maintaining count across video duration
  • Distinguishing between similar but distinct occurrences

Usage

Use this dataset for evaluating temporal counting capabilities in video understanding models, benchmarking action recognition with frequency analysis, or training models on video-based counting tasks.

Code Reference

Source Location

Data Structure

{
    "video": str,              # Video filename
    "duration": float,         # Video duration in seconds
    "question": str,           # Question about counting occurrences
    "candidates": List[str],   # Four candidate counts (as strings)
    "answer": str,             # Correct count
    "question_type": str       # Always "count"
}

Import

import json

# Load count dataset: the file is JSON Lines (one JSON object per line).
count_data = []
with open("research/MLVU/data/4_count.json", "r") as f:
    for raw_line in f:
        count_data.append(json.loads(raw_line))

I/O Contract

Inputs

Name Type Required Description
file_path str Yes Path to the count dataset file (JSON Lines format — one JSON object per line, despite the .json extension)

Outputs

Field Type Description
video str Video filename
duration float Video duration in seconds
question str Question about counting
candidates List[str] Four possible counts
answer str Correct count
question_type str Type identifier ("count")

Usage Examples

import json
from typing import List, Dict

# Load count dataset
def load_count_data(file_path: str) -> List[Dict]:
    """Read *file_path* (JSON Lines) and return one dict per line."""
    records = []
    with open(file_path, "r") as f:
        for raw_line in f:
            records.append(json.loads(raw_line))
    return records

data = load_count_data("research/MLVU/data/4_count.json")

# Example entry (requires the MLVU dataset files to be present locally)
example = data[0]
print(f"Video: {example['video']}")
print(f"Duration: {example['duration']:.2f}s")
print(f"Question: {example['question']}")
print(f"Candidates: {example['candidates']}")
print(f"Answer: {example['answer']}")

# Output:
# Video: count_126.mp4
# Duration: 572.86s
# Question: Throughout this video, what is the total count of
#           occurrences for the scene featuring the 'playing trombone' action
# Candidates: ['2', '1', '5', '4']
# Answer: 1

# Evaluate counting accuracy
def evaluate_counting(model, data: List[Dict]) -> Dict[str, float]:
    """Score *model* on the MLVU count split.

    Parameters
    ----------
    model : object exposing ``count_occurrences(video_path, question, candidates)``
        which returns the predicted count as a string (answers are compared
        as strings, matching the dataset's ``answer`` field).
    data : list of dataset records (see the I/O contract above).

    Returns
    -------
    Dict with overall accuracy plus accuracy broken down by the ground-truth
    count range (low: 1-2, mid: 3-5, high: 6+). Empty buckets — including an
    empty *data* — score 0 instead of raising ZeroDivisionError.
    """
    correct = 0
    total = len(data)

    # Track accuracy by count range
    low_count_correct = 0  # 1-2
    mid_count_correct = 0  # 3-5
    high_count_correct = 0 # 6+

    low_total = mid_total = high_total = 0

    for item in data:
        video_path = f"videos/{item['video']}"

        # Model prediction
        predicted_count = model.count_occurrences(
            video_path,
            item['question'],
            item['candidates']
        )

        is_correct = (predicted_count == item['answer'])
        correct += is_correct

        # Categorize by actual count (ground-truth answer string is an int)
        actual_count = int(item['answer'])
        if actual_count <= 2:
            low_total += 1
            low_count_correct += is_correct
        elif actual_count <= 5:
            mid_total += 1
            mid_count_correct += is_correct
        else:
            high_total += 1
            high_count_correct += is_correct

    # Guard every division: the original guarded the buckets but not `total`,
    # so an empty dataset crashed with ZeroDivisionError.
    return {
        "overall_acc": correct / total if total > 0 else 0,
        "low_count_acc": low_count_correct / low_total if low_total > 0 else 0,
        "mid_count_acc": mid_count_correct / mid_total if mid_total > 0 else 0,
        "high_count_acc": high_count_correct / high_total if high_total > 0 else 0
    }

# Analyze counting distribution
def analyze_counts(data: List[Dict]) -> Dict:
    """Return {answer_string: frequency}, ordered by numeric answer value."""
    tally: Dict[str, int] = {}
    for record in data:
        key = record['answer']
        if key in tally:
            tally[key] += 1
        else:
            tally[key] = 1
    # Sort keys numerically ("10" after "2"), not lexicographically.
    ordered = sorted(tally.items(), key=lambda pair: int(pair[0]))
    return dict(ordered)

# Summarize how often each ground-truth count appears (requires `data` above).
distribution = analyze_counts(data)
print("Count distribution:", distribution)

# Extract action types from questions
def extract_actions(data: List[Dict]) -> List[str]:
    """Collect the unique quoted action names appearing in questions.

    Each question embeds its action between single quotes, e.g.
    "... the 'playing trombone' action". Questions without a quoted span
    are skipped. The result is deduplicated while preserving first-seen
    order — the original ``list(set(...))`` produced a nondeterministic
    ordering across interpreter runs.
    """
    actions = []
    for item in data:
        question = item['question']
        # Extract action between quotes or after "the '"
        if "'" in question:
            start = question.find("'") + 1
            end = question.find("'", start)
            if end > start:
                actions.append(question[start:end])
    # dict.fromkeys dedupes while keeping insertion order (dicts are ordered).
    return list(dict.fromkeys(actions))

# Report the distinct quoted actions found in the questions (requires `data`).
unique_actions = extract_actions(data)
print(f"Number of unique actions: {len(unique_actions)}")
print(f"Sample actions: {unique_actions[:10]}")

# Filter by video duration and count complexity
# "Complex" = video longer than 600 s AND ground-truth count above 3.
complex_samples = [
    item for item in data
    if item['duration'] > 600 and int(item['answer']) > 3
]
print(f"Complex samples (long video + high count): {len(complex_samples)}")

Related Pages

Page Connections

Double-click a node to navigate. Hold to expand connections.
Principle
Implementation
Heuristic
Environment