Implementation:Huggingface Diffusers Video Pipeline From Pretrained

From Leeroopedia
Type: API Doc
Overview: Instantiating video generation pipelines using from_pretrained for the Wan, HunyuanVideo, and CogVideoX architectures
Domains: Video Generation, Diffusion Models
Workflow: Video_Generation
Related Principle: Huggingface_Diffusers_Video_Pipeline_Selection
Source: src/diffusers/pipelines/wan/pipeline_wan.py:L96-L156, src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py:L144-L197, src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py:L147-L200
Last Updated: 2026-02-13 00:00 GMT

Code Reference

WanPipeline Initialization

Source: src/diffusers/pipelines/wan/pipeline_wan.py:L96-L156

class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
    """Pipeline for text-to-video generation using Wan."""

    model_cpu_offload_seq = "text_encoder->transformer->transformer_2->vae"
    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
    _optional_components = ["transformer", "transformer_2"]

    def __init__(
        self,
        tokenizer: AutoTokenizer,
        text_encoder: UMT5EncoderModel,
        vae: AutoencoderKLWan,
        scheduler: FlowMatchEulerDiscreteScheduler,
        transformer: WanTransformer3DModel | None = None,
        transformer_2: WanTransformer3DModel | None = None,
        boundary_ratio: float | None = None,
        expand_timesteps: bool = False,
    ):
        super().__init__()
        self.register_modules(
            vae=vae, text_encoder=text_encoder, tokenizer=tokenizer,
            transformer=transformer, scheduler=scheduler, transformer_2=transformer_2,
        )
        self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal
        self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial
        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
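
Each __init__ argument above corresponds to a subfolder in the model repository, resolved through the repo's model_index.json. As a minimal sketch (assuming the standard Diffusers repository layout for "Wan-AI/Wan2.1-T2V-14B-Diffusers"), from_pretrained is roughly equivalent to assembling the components by hand:

import torch
from transformers import AutoTokenizer, UMT5EncoderModel
from diffusers import (
    AutoencoderKLWan,
    FlowMatchEulerDiscreteScheduler,
    WanPipeline,
    WanTransformer3DModel,
)

model_id = "Wan-AI/Wan2.1-T2V-14B-Diffusers"

# Load each component from its subfolder, mirroring what from_pretrained
# does when it reads model_index.json
pipe = WanPipeline(
    tokenizer=AutoTokenizer.from_pretrained(model_id, subfolder="tokenizer"),
    text_encoder=UMT5EncoderModel.from_pretrained(
        model_id, subfolder="text_encoder", torch_dtype=torch.bfloat16
    ),
    vae=AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32),
    scheduler=FlowMatchEulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler"),
    transformer=WanTransformer3DModel.from_pretrained(
        model_id, subfolder="transformer", torch_dtype=torch.bfloat16
    ),
)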

HunyuanVideoPipeline Initialization

Source: src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py:L144-L197

class HunyuanVideoPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
    """Pipeline for text-to-video generation using HunyuanVideo."""

    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"

    def __init__(
        self,
        text_encoder: LlamaModel,
        tokenizer: LlamaTokenizerFast,
        transformer: HunyuanVideoTransformer3DModel,
        vae: AutoencoderKLHunyuanVideo,
        scheduler: FlowMatchEulerDiscreteScheduler,
        text_encoder_2: CLIPTextModel,
        tokenizer_2: CLIPTokenizer,
    ):
        super().__init__()
        self.register_modules(
            vae=vae, text_encoder=text_encoder, tokenizer=tokenizer,
            transformer=transformer, scheduler=scheduler,
            text_encoder_2=text_encoder_2, tokenizer_2=tokenizer_2,
        )
        self.vae_scale_factor_temporal = self.vae.temporal_compression_ratio
        self.vae_scale_factor_spatial = self.vae.spatial_compression_ratio
        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)

CogVideoXPipeline Initialization

Source: src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py:L147-L200

class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
    """Pipeline for text-to-video generation using CogVideoX."""

    model_cpu_offload_seq = "text_encoder->transformer->vae"

    def __init__(
        self,
        tokenizer: T5Tokenizer,
        text_encoder: T5EncoderModel,
        vae: AutoencoderKLCogVideoX,
        transformer: CogVideoXTransformer3DModel,
        scheduler: CogVideoXDDIMScheduler | CogVideoXDPMScheduler,
    ):
        super().__init__()
        self.register_modules(
            tokenizer=tokenizer, text_encoder=text_encoder,
            vae=vae, transformer=transformer, scheduler=scheduler,
        )
        self.vae_scale_factor_spatial = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.vae_scale_factor_temporal = self.vae.config.temporal_compression_ratio
        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)

Key Parameters

pretrained_model_name_or_path: Hugging Face model ID or local path (e.g., "Wan-AI/Wan2.1-T2V-14B-Diffusers")
torch_dtype: Precision for model weights (torch.bfloat16, torch.float16, torch.float32)
vae: Override the default VAE, e.g., to force a float32 VAE (pass a pre-loaded AutoencoderKLWan instance)
subfolder: Load a specific component from a subfolder ("vae", "transformer")
variant: Weight variant to load ("fp16", None)
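
These parameters compose on a single from_pretrained call. The sketch below combines torch_dtype with variant; note that variant="fp16" is an assumption here, and the call raises an error if the repository does not publish fp16-suffixed weight files:

import torch
from diffusers import CogVideoXPipeline

# variant="fp16" assumes the repo ships fp16-suffixed safetensors;
# omit it for repositories with a single weight set.
pipe = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",
    torch_dtype=torch.float16,
    variant="fp16",
)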

I/O Contract

Inputs

  • pretrained_model_name_or_path (str): A Hugging Face Hub model ID (e.g., "Wan-AI/Wan2.1-T2V-14B-Diffusers") or a local directory path containing a model_index.json plus per-component weight and config subfolders

Outputs

  • Pipeline instance: A fully initialized pipeline object with all components loaded (tokenizer, text_encoder, vae, transformer, scheduler)
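
For verifying what was loaded, DiffusionPipeline exposes the registered modules through its components property; a quick check (model ID taken from the examples below):

import torch
from diffusers import WanPipeline

pipe = WanPipeline.from_pretrained("Wan-AI/Wan2.1-T2V-14B-Diffusers", torch_dtype=torch.bfloat16)

# components maps each registered name to the loaded module instance
for name, module in pipe.components.items():
    print(name, type(module).__name__)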

External Dependencies

  • transformers (for text encoders: UMT5EncoderModel, LlamaModel, CLIPTextModel, T5EncoderModel)
  • safetensors (for loading .safetensors weight files)
  • accelerate (for device placement and offloading)

Usage Examples

Loading WanPipeline for Text-to-Video

import torch
from diffusers import AutoencoderKLWan, WanPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler

model_id = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)

# Configure scheduler
flow_shift = 5.0  # 5.0 for 720P, 3.0 for 480P
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
pipe.to("cuda")
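
The loaded pipeline is directly callable. A minimal generation sketch follows; the resolution and frame count are illustrative values for the 720P setting (matching flow_shift=5.0 above), not verified defaults:

from diffusers.utils import export_to_video

# Illustrative settings; adjust height/width/num_frames for your model variant
output = pipe(
    prompt="A cat walking on grass, photorealistic",
    height=720,
    width=1280,
    num_frames=81,
    guidance_scale=5.0,
)
export_to_video(output.frames[0], "wan_output.mp4", fps=16)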

Loading HunyuanVideoPipeline

import torch
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel

model_id = "hunyuanvideo-community/HunyuanVideo"
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    model_id, subfolder="transformer", torch_dtype=torch.bfloat16
)
pipe = HunyuanVideoPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.float16)
pipe.vae.enable_tiling()
pipe.to("cuda")
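
On memory-constrained GPUs, enable_model_cpu_offload() can be used instead of pipe.to("cuda"); it moves modules onto the accelerator one at a time in the order given by model_cpu_offload_seq ("text_encoder->text_encoder_2->transformer->vae") and requires accelerate:

# Alternative to pipe.to("cuda"): sequential per-module offload
pipe.enable_model_cpu_offload()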

Loading CogVideoXPipeline

import torch
from diffusers import CogVideoXPipeline

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.float16)
pipe.to("cuda")
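
The __init__ signature above accepts either CogVideoXDDIMScheduler or CogVideoXDPMScheduler. Swapping schedulers after loading follows the same from_config pattern used in the Wan example:

from diffusers import CogVideoXDPMScheduler

# Reuse the loaded scheduler's config when swapping in the DPM variant
pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config)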

Related Pages

Principle:Huggingface_Diffusers_Video_Pipeline_Selection
