Implementation:Apache Paimon FileStorePathFactory
| Knowledge Sources | |
|---|---|
| Domains | File Organization, Path Management |
| Last Updated | 2026-02-08 00:00 GMT |
Overview
FileStorePathFactory provides centralized path construction for Paimon file store components including manifest, index, statistics, and data file paths with partition and bucket support.
Description
The FileStorePathFactory class encapsulates all logic for constructing file system paths within a Paimon table's file store. It maintains configuration like the root path, partition key definitions, file format identifiers, compression settings, and optional external path providers. The factory ensures consistent path naming conventions across all components of the file store.
The class provides path construction methods for each major component: manifest_path() for manifest files, index_path() for index data, statistics_path() for table statistics, and data_file_path() for data files. The bucket_path() and relative_bucket_path() methods handle the complex logic of constructing partition-aware bucket paths, incorporating partition key=value pairs and bucket identifiers into the path hierarchy.
Special features include support for external paths through ExternalPathProvider, optional data file subdirectories for better organization, configurable file prefixes (data_file_prefix, changelog_file_prefix), and legacy partition naming compatibility. The companion IndexPathFactory class, importable from the same module, provides path management for global indexes and generates unique file names using UUIDs. Path components follow Hive-style partition naming (key=value) unless legacy mode is enabled, ensuring compatibility with existing data layouts.
Usage
Use FileStorePathFactory when implementing file store operations that need consistent path generation, constructing partition and bucket paths for data files, or managing manifest and index file locations in table implementations.
Code Reference
Source Location
- Repository: Apache_Paimon
- File: paimon-python/pypaimon/utils/file_store_path_factory.py
Signature
class FileStorePathFactory:
    """Central builder for file system paths inside a Paimon table's file store.

    This listing is a signature reference: method bodies are omitted
    (``pass``) and the docstrings describe the documented contract.
    """

    # Directory names and file-name prefixes used when composing paths.
    MANIFEST_PATH = "manifest"
    MANIFEST_PREFIX = "manifest-"
    MANIFEST_LIST_PREFIX = "manifest-list-"
    INDEX_MANIFEST_PREFIX = "index-manifest-"
    INDEX_PATH = "index"
    INDEX_PREFIX = "index-"
    STATISTICS_PATH = "statistics"
    STATISTICS_PREFIX = "stat-"
    BUCKET_PATH_PREFIX = "bucket-"

    def __init__(
        self,
        root: str,
        partition_keys: List[str],
        default_part_value: str,
        format_identifier: str,
        data_file_prefix: str,
        changelog_file_prefix: str,
        legacy_partition_name: bool,
        file_suffix_include_compression: bool,
        file_compression: str,
        data_file_path_directory: Optional[str] = None,
        external_paths: Optional[List[str]] = None,
        index_file_in_data_file_dir: bool = False,
    ):
        """Configure the factory for one table.

        Args:
            root: Root path of the table.
            partition_keys: Partition column names; empty for a
                non-partitioned table.
            default_part_value: Default partition value, e.g.
                ``"__DEFAULT_PARTITION__"``. Presumably substituted for
                missing partition values -- confirm against the
                implementation.
            format_identifier: File format, e.g. ``"orc"`` or ``"parquet"``.
            data_file_prefix: Prefix for data file names, e.g. ``"data-"``.
            changelog_file_prefix: Prefix for changelog file names, e.g.
                ``"changelog-"``.
            legacy_partition_name: Enables legacy partition naming
                compatibility.
            file_suffix_include_compression: Presumably whether file
                suffixes carry the compression codec -- confirm against the
                implementation.
            file_compression: Compression codec name, e.g. ``"zstd"``.
            data_file_path_directory: Optional subdirectory that holds the
                data files.
            external_paths: Optional external storage locations.
            index_file_in_data_file_dir: Presumably places index files in
                the data file directory -- confirm against the
                implementation.
        """
        pass

    def root(self) -> str:
        """Return the table root path, e.g. ``"/warehouse/my_table"``."""
        pass

    def manifest_path(self) -> str:
        """Return the manifest directory, e.g. ``"<root>/manifest"``."""
        pass

    def index_path(self) -> str:
        """Return the index directory, e.g. ``"<root>/index"``."""
        pass

    def statistics_path(self) -> str:
        """Return the statistics directory, e.g. ``"<root>/statistics"``."""
        pass

    def data_file_path(self) -> str:
        """Return the base directory for data files.

        This equals the table root when no ``data_file_path_directory`` is
        configured.
        """
        pass

    def relative_bucket_path(self, partition: Tuple, bucket: int) -> str:
        """Return the bucket path relative to the table root.

        Args:
            partition: Partition values, ordered like ``partition_keys``;
                ``()`` for a non-partitioned table.
            bucket: Bucket number.

        Returns:
            A path such as ``"year=2024/month=06/bucket-0"``.
        """
        pass

    def bucket_path(self, partition: Tuple, bucket: int) -> str:
        """Return the full bucket path for a partition and bucket.

        Args:
            partition: Partition values, ordered like ``partition_keys``;
                ``()`` for a non-partitioned table.
            bucket: Bucket number.

        Returns:
            A path such as
            ``"/warehouse/my_table/year=2024/month=06/bucket-0"``.
        """
        pass

    def create_external_path_provider(
        self, partition: Tuple, bucket: int
    ) -> Optional[ExternalPathProvider]:
        """Return the external path provider for a partition and bucket.

        Returns:
            An ``ExternalPathProvider``, or ``None`` when none is available
            (callers check the result before use).
        """
        pass

    def global_index_path_factory(self) -> 'IndexPathFactory':
        """Return an ``IndexPathFactory`` for the table's global index files."""
        pass
class IndexPathFactory:
    """Path helper for global index files under one index directory.

    This listing is a signature reference: method bodies are omitted
    (``pass``) and the docstrings describe the documented contract.
    """

    def __init__(self, index_path: str):
        """Configure the factory.

        Args:
            index_path: Directory that holds the index files, e.g.
                ``"/warehouse/my_table/index"``.
        """
        pass

    def index_path(self) -> str:
        """Return the index directory this factory resolves names against."""
        pass

    def to_path(self, file_name: str) -> str:
        """Return the full path of ``file_name`` inside the index directory.

        For example ``"my-index-file.idx"`` maps to
        ``"<index_path>/my-index-file.idx"``.
        """
        pass

    def new_path(self, prefix: str = "index-") -> str:
        """Return a new, unique index file path.

        Args:
            prefix: File-name prefix placed before the generated UUID.

        Returns:
            A path of the form ``"<index_path>/<prefix><uuid>-<n>"``; in the
            documented examples ``<n>`` increases with each call.
        """
        pass

    def is_external_path(self) -> bool:
        """Return whether the index path is an external path."""
        pass
Import
from pypaimon.utils.file_store_path_factory import FileStorePathFactory, IndexPathFactory
I/O Contract
Inputs
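| Name | Type | Required | Description |
|---|---|---|---|
| root | str | Yes | Root path of the table |
| partition_keys | List[str] | Yes | List of partition column names |
| partition | Tuple | Yes | Partition values tuple (method argument) |
| bucket | int | Yes | Bucket number (method argument) |
| format_identifier | str | Yes | File format (e.g., "orc", "parquet") |
| file_compression | str | Yes | Compression codec name |
| default_part_value | str | Yes | Default partition value (e.g., "__DEFAULT_PARTITION__") |
| data_file_prefix | str | Yes | Prefix for data file names (e.g., "data-") |
| changelog_file_prefix | str | Yes | Prefix for changelog file names (e.g., "changelog-") |
| legacy_partition_name | bool | Yes | Whether to use legacy partition naming |
| file_suffix_include_compression | bool | Yes | Whether file suffixes include the compression codec |
| data_file_path_directory | Optional[str] | No | Subdirectory for data files (default: None) |
| external_paths | Optional[List[str]] | No | External storage locations (default: None) |
| index_file_in_data_file_dir | bool | No | Whether index files are stored in the data file directory (default: False) |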
Outputs
| Name | Type | Description |
|---|---|---|
| path | str | Full file system path |
| relative_path | str | Relative path from root |
| path_provider | Optional[ExternalPathProvider] | External path provider, or None if none is configured |
| index_factory | IndexPathFactory | Factory for index paths |
Usage Examples
from pypaimon.utils.file_store_path_factory import FileStorePathFactory
from pypaimon.table.bucket_mode import BucketMode

# Create factory for partitioned table
factory = FileStorePathFactory(
    root="/warehouse/my_table",
    partition_keys=["year", "month"],
    default_part_value="__DEFAULT_PARTITION__",
    format_identifier="orc",
    data_file_prefix="data-",
    changelog_file_prefix="changelog-",
    legacy_partition_name=False,
    file_suffix_include_compression=True,
    file_compression="zstd"
)

# Get component paths
print(factory.root())  # "/warehouse/my_table"
print(factory.manifest_path())  # "/warehouse/my_table/manifest"
print(factory.index_path())  # "/warehouse/my_table/index"
print(factory.statistics_path())  # "/warehouse/my_table/statistics"
print(factory.data_file_path())  # "/warehouse/my_table"

# Construct bucket path with partition
partition = ("2024", "06")
bucket = 0
path = factory.bucket_path(partition, bucket)
print(path)  # "/warehouse/my_table/year=2024/month=06/bucket-0"

# Relative bucket path
rel_path = factory.relative_bucket_path(partition, bucket)
print(rel_path)  # "year=2024/month=06/bucket-0"

# Multiple buckets
for bucket in range(4):
    path = factory.bucket_path(partition, bucket)
    print(path)
# /warehouse/my_table/year=2024/month=06/bucket-0
# /warehouse/my_table/year=2024/month=06/bucket-1
# /warehouse/my_table/year=2024/month=06/bucket-2
# /warehouse/my_table/year=2024/month=06/bucket-3

# Non-partitioned table
factory_non_part = FileStorePathFactory(
    root="/warehouse/non_partitioned",
    partition_keys=[],  # No partitions
    default_part_value="__DEFAULT_PARTITION__",
    format_identifier="parquet",
    data_file_prefix="data-",
    changelog_file_prefix="changelog-",
    legacy_partition_name=False,
    file_suffix_include_compression=False,
    file_compression="snappy"
)
path = factory_non_part.bucket_path((), 0)
print(path)  # "/warehouse/non_partitioned/bucket-0"

# Postpone bucket (special bucket mode)
path = factory.bucket_path(partition, BucketMode.POSTPONE_BUCKET.value)
print(path)  # "/warehouse/my_table/year=2024/month=06/postpone"
# NOTE(review): confirm this expected output against the implementation; the
# directory may carry the "bucket-" prefix (i.e. ".../bucket-postpone").

# With data file subdirectory
factory_with_subdir = FileStorePathFactory(
    root="/warehouse/table",
    partition_keys=["date"],
    default_part_value="__DEFAULT_PARTITION__",
    format_identifier="orc",
    data_file_prefix="data-",
    changelog_file_prefix="changelog-",
    legacy_partition_name=False,
    file_suffix_include_compression=True,
    file_compression="zstd",
    data_file_path_directory="data"  # Data files in "data" subdirectory
)
partition = ("2024-06-15",)
path = factory_with_subdir.bucket_path(partition, 0)
print(path)  # "/warehouse/table/data/date=2024-06-15/bucket-0"

# External paths for additional storage locations
factory_external = FileStorePathFactory(
    root="/warehouse/table",
    partition_keys=["year"],
    default_part_value="__DEFAULT_PARTITION__",
    format_identifier="orc",
    data_file_prefix="data-",
    changelog_file_prefix="changelog-",
    legacy_partition_name=False,
    file_suffix_include_compression=True,
    file_compression="zstd",
    external_paths=["/external/storage1", "/external/storage2"]
)
partition = ("2024",)
provider = factory_external.create_external_path_provider(partition, 0)
if provider:
    print("External paths available")

# Global index path factory
index_factory = factory.global_index_path_factory()
print(index_factory.index_path())  # "/warehouse/my_table/index"

# Generate new index file paths with UUIDs
index_file1 = index_factory.new_path()
index_file2 = index_factory.new_path()
print(index_file1)  # "/warehouse/my_table/index/index-<uuid>-1"
print(index_file2)  # "/warehouse/my_table/index/index-<uuid>-2"

# Convert file name to path
path = index_factory.to_path("my-index-file.idx")
print(path)  # "/warehouse/my_table/index/my-index-file.idx"

# Custom index prefix
custom_index_path = index_factory.new_path(prefix="custom-")
print(custom_index_path)  # "/warehouse/my_table/index/custom-<uuid>-3"

# Check if external path
print(index_factory.is_external_path())  # False

# Path constants
print(FileStorePathFactory.MANIFEST_PREFIX)  # "manifest-"
print(FileStorePathFactory.INDEX_PREFIX)  # "index-"
print(FileStorePathFactory.STATISTICS_PREFIX)  # "stat-"
print(FileStorePathFactory.BUCKET_PATH_PREFIX)  # "bucket-"

# Legacy partition naming
factory_legacy = FileStorePathFactory(
    root="/warehouse/table",
    partition_keys=["year", "month"],
    default_part_value="__DEFAULT_PARTITION__",
    format_identifier="orc",
    data_file_prefix="data-",
    changelog_file_prefix="changelog-",
    legacy_partition_name=True,  # Use legacy naming
    file_suffix_include_compression=True,
    file_compression="zstd"
)

# Legacy mode may use different partition path format
partition = ("2024", "06")
path = factory_legacy.bucket_path(partition, 0)
print(path)  # Path format depends on legacy implementation