Jump to content

Connect SuperML | Leeroopedia MCP: Equip your AI agents with best practices, code verification, and debugging knowledge. Powered by Leeroo — building Organizational Superintelligence. Contact us at founders@leeroo.com.

Implementation:Apache Paimon DeletionVector

From Leeroopedia


Knowledge Sources
Domains: Deletion Vectors, Storage Layer
Last Updated: 2026-02-08 00:00 GMT

Overview

DeletionVector defines the abstract interface for tracking deleted row positions in data files, enabling merge-on-read operations with efficient position-based deletion marking.

Description

The DeletionVector abstract base class establishes the contract for deletion vector implementations in Apache Paimon. It defines methods for marking positions as deleted (delete()), checking deletion status (is_deleted()), querying emptiness and cardinality, and merging multiple deletion vectors. The interface design supports both individual position checks and bulk operations.

The class provides a checked_delete() convenience method that returns whether a position was newly deleted (as opposed to already being deleted), useful for tracking changes during delete operations. The abstract bit_map() method exposes the underlying bitmap representation, enabling efficient set operations when working with deletion vectors.

The static read() method provides file-based deserialization, reading deletion vector data from FileIO streams at specified offsets. It validates the magic number to ensure format compatibility, checks declared lengths against actual data, and skips CRC checksums during reading. This design supports storing multiple deletion vectors in a single index file at different offsets, as referenced by DeletionFile metadata objects.

Usage

Use DeletionVector when implementing merge-on-read tables, marking rows as deleted without rewriting data files, or reading deletion vector information from metadata files to filter query results.

Code Reference

Source Location

Signature

class DeletionVector(ABC):
    """
    The DeletionVector can efficiently record the positions of rows that are deleted in a file,
    which can then be used to filter out deleted rows when processing the file.

    NOTE: this is a signature listing. Every method body below is a ``pass``
    placeholder, so the docstrings describe the documented contract of each
    method rather than code that is visible here.
    """

    @abstractmethod
    def bit_map(self) -> RoaringBitmap:
        """Return the underlying bitmap representation of the deleted positions."""
        pass

    @abstractmethod
    def delete(self, position: int) -> None:
        """Mark the row at ``position`` as deleted."""
        pass

    @abstractmethod
    def is_deleted(self, position: int) -> bool:
        """Return whether the row at ``position`` is marked as deleted."""
        pass

    @abstractmethod
    def is_empty(self) -> bool:
        """Return whether this deletion vector holds no deletions."""
        pass

    @abstractmethod
    def get_cardinality(self) -> int:
        """Return the number of deleted positions."""
        pass

    @abstractmethod
    def merge(self, deletion_vector: 'DeletionVector') -> None:
        """Merge the deletions of ``deletion_vector`` into this vector."""
        pass

    def checked_delete(self, position: int) -> bool:
        """Mark ``position`` as deleted and report whether it was newly deleted.

        Documented to return True when the position was not deleted before the
        call and False when it already was. The implementation is elided in
        this listing (the placeholder body returns None).
        """
        pass

    @staticmethod
    def read(file_io: FileIO, deletion_file: DeletionFile) -> 'DeletionVector':
        """Load a deletion vector from the location described by ``deletion_file``.

        Documented to read through ``file_io`` at the offset given by
        ``deletion_file``, validate the magic number, check the declared length
        against the data, and skip the CRC checksum. The implementation is
        elided in this listing (the placeholder body returns None).
        """
        pass

Import

from pypaimon.deletionvectors.deletion_vector import DeletionVector

I/O Contract

Inputs

| Name | Type | Required | Description |
|------|------|----------|-------------|
| position | int | Yes | Row position to delete or check |
| deletion_vector | DeletionVector | Yes | Another deletion vector to merge |
| file_io | FileIO | Yes | FileIO for reading deletion vectors |
| deletion_file | DeletionFile | Yes | Metadata describing deletion vector location |

Outputs

| Name | Type | Description |
|------|------|-------------|
| is_deleted | bool | Whether position is marked as deleted |
| is_empty | bool | Whether deletion vector has no deletions |
| cardinality | int | Number of deleted positions |
| was_added | bool | Whether checked_delete actually added a new deletion |
| deletion_vector | DeletionVector | Loaded deletion vector instance |
| bitmap | RoaringBitmap | Underlying bitmap representation |

Usage Examples

from pypaimon.deletionvectors.deletion_vector import DeletionVector
from pypaimon.deletionvectors.bitmap_deletion_vector import BitmapDeletionVector
from pypaimon.common.file_io import FileIO

# Create a concrete implementation (DeletionVector itself is abstract)
dv: DeletionVector = BitmapDeletionVector()

# Mark positions as deleted
dv.delete(10)
dv.delete(25)
dv.delete(100)

# Check deletion status (expected output shown on the right)
print(dv.is_deleted(10))   # True
print(dv.is_deleted(11))   # False
print(dv.is_deleted(25))   # True

# Query state: three distinct positions have been deleted so far
print(dv.is_empty())  # False
print(dv.get_cardinality())  # 3

# checked_delete returns True if position was newly deleted
was_new = dv.checked_delete(200)
print(was_new)  # True (200 wasn't deleted before)

# Repeating the same call reports that nothing new was added
was_new = dv.checked_delete(200)
print(was_new)  # False (200 was already deleted)

# Merge deletion vectors: build two vectors that overlap at position 2
dv1: DeletionVector = BitmapDeletionVector()
dv1.delete(1)
dv1.delete(2)

dv2: DeletionVector = BitmapDeletionVector()
dv2.delete(2)
dv2.delete(3)

# merge() returns None; the result is accumulated into dv1
dv1.merge(dv2)
# dv1 now contains deletions at positions 1, 2, 3

# Access underlying bitmap for set operations
bitmap = dv1.bit_map()
print(f"Deleted positions: {list(bitmap)}")

# Read deletion vector from file
from pypaimon.table.source.deletion_file import DeletionFile

# The path, offset and length below are placeholder values. One index file
# can hold several deletion vectors, and the DeletionFile metadata says
# where a particular one is stored.
file_io = FileIO.get("/path/to/table")
deletion_file = DeletionFile(
    dv_index_path="/path/to/table/index/dv-index-file",
    offset=1024,
    length=256
)

# Rebinds dv: from here on the name refers to the vector loaded from the file
dv = DeletionVector.read(file_io, deletion_file)
print(f"Loaded DV with {dv.get_cardinality()} deletions")

# Use in query filtering
def filter_results(records, dv: DeletionVector):
    """Return the records whose positions are not marked deleted in ``dv``.

    Args:
        records: Iterable of records; a record's position is its zero-based
            index in iteration order.
        dv: Deletion vector queried through ``is_deleted(position)``.

    Returns:
        A new list holding the surviving records in their original order.
    """
    return [
        record
        for position, record in enumerate(records)
        if not dv.is_deleted(position)
    ]

# `[...]` is only a stand-in here (as written it is a one-element list holding
# Ellipsis); substitute the rows read from the data file.
records = [...]  # Your data records
# Position i in `records` is dropped when dv reports position i as deleted.
filtered_records = filter_results(records, dv)

# Polymorphic usage
def process_deletions(dv: DeletionVector):
    """Print a report of the deletions held by any DeletionVector implementation.

    Only the abstract interface is used (``is_empty``, ``get_cardinality`` and
    ``bit_map``), so every concrete implementation is accepted. An empty
    vector produces a single "No deletions" line.
    """
    if not dv.is_empty():
        print(f"Processing {dv.get_cardinality()} deletions")
        # Iterating the bitmap yields each deleted position in turn.
        for pos in dv.bit_map():
            print(f"Position {pos} is deleted")
    else:
        print("No deletions")

# A freshly constructed vector is passed in; presumably it is empty, in which
# case the "No deletions" branch runs.
process_deletions(BitmapDeletionVector())

# Batch deletion operations
# Rebinds dv to a new vector, so the deletions below start from a clean state
dv: DeletionVector = BitmapDeletionVector()
positions_to_delete = [10, 20, 30, 40, 50]

# delete() takes one position at a time, so a batch is applied with a loop
for pos in positions_to_delete:
    dv.delete(pos)

# Five distinct positions were deleted above
print(f"Deleted {dv.get_cardinality()} positions")

# Check multiple positions: keep only the candidates that dv reports as deleted
positions_to_check = [10, 15, 20, 25, 30]
deleted_positions = [pos for pos in positions_to_check if dv.is_deleted(pos)]
print(f"Deleted: {deleted_positions}")  # [10, 20, 30]

# Combining with file operations
def save_deletion_vector(dv: DeletionVector, file_io: FileIO, path: str):
    """Write the serialized form of a bitmap-backed deletion vector to ``path``.

    Only ``BitmapDeletionVector`` instances are written. For any other
    implementation the function returns without writing anything and without
    raising.
    """
    from pypaimon.deletionvectors.bitmap_deletion_vector import BitmapDeletionVector

    if not isinstance(dv, BitmapDeletionVector):
        return

    # Serialize before opening the stream, so the output is only opened once
    # the payload is ready.
    payload = dv.serialize()
    with file_io.new_output_stream(path) as stream:
        stream.write(payload)

# Example: per-file deletion tracking across multiple data files
file_deletions = {}  # file_path -> DeletionVector

def mark_row_deleted(file_path: str, row_position: int):
    """Record ``row_position`` of ``file_path`` as deleted and print the outcome.

    The vector for ``file_path`` is created the first time the file is seen.
    The printed message says whether the position was newly deleted or had
    already been deleted.
    """
    dv = file_deletions.get(file_path)
    if dv is None:
        # First deletion seen for this file: register a new vector for it.
        dv = BitmapDeletionVector()
        file_deletions[file_path] = dv

    if dv.checked_delete(row_position):
        print(f"Marked {file_path}:{row_position} as deleted")
    else:
        print(f"{file_path}:{row_position} was already deleted")

# Each call names a (file, position) pair not seen before, so starting from an
# empty registry each one is expected to print a "Marked ... as deleted" line.
# file1.parquet reuses one vector for both calls; file2.parquet gets its own.
mark_row_deleted("file1.parquet", 100)
mark_row_deleted("file1.parquet", 200)
mark_row_deleted("file2.parquet", 50)

Related Pages

Page Connections

Double-click a node to navigate. Hold to expand connections.
Principle
Implementation
Heuristic
Environment