Implementation:Microsoft DeepSpeedExamples DeepSpeed Initialize VisualChat




Metadata

| Field | Value |
|---|---|
| Page Type | Implementation (Wrapper Doc) |
| Title | DeepSpeed_Initialize_VisualChat |
| Repository | Microsoft/DeepSpeedExamples |
| Application | DeepSpeed-VisualChat |
| Files | applications/DeepSpeed-VisualChat/training/main.py, utils/ds_utils.py (Lines 9-74), utils/utils.py (Lines 78-130) |
| Language | Python |
| Status | Active |

Overview

Concrete usage of deepspeed.initialize() with a multi-group AdamW optimizer for DeepSpeed-VisualChat training. The flow is: build a DeepSpeed config dictionary with get_train_ds_config(), split the model's trainable parameters into four optimizer groups (normal vs. small learning rate, with vs. without weight decay) via get_optimizer_grouped_parameters(), construct the AdamW optimizer and LR scheduler, and finally wrap model, optimizer, and scheduler with deepspeed.initialize().
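
In condensed form, the call order is as follows (a summary of the code excerpts below, not additional repository code):

ds_config = get_train_ds_config(args, offload=False, stage=args.zero_stage)
groups = get_optimizer_grouped_parameters(
    model, args.weight_decay, small_lr=args.learning_rate_pretraining_components)
optimizer = AdamW(groups, lr=args.learning_rate, betas=(0.9, 0.95))
model, optimizer, _, lr_scheduler = deepspeed.initialize(
    model=model, optimizer=optimizer, args=args, config=ds_config,
    lr_scheduler=lr_scheduler, dist_init_required=True)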

Code Reference

DeepSpeed Configuration (utils/ds_utils.py, Lines 9-74)

def get_train_ds_config(args, offload, stage=2,
                        enable_hybrid_engine=False,
                        inference_tp_size=1,
                        release_inference_cache=False,
                        pin_parameters=True,
                        tp_gather_partition_size=8,
                        max_out_tokens=512):
    # args.precision is expected to be either 'fp16' or 'bf16'
    if args.precision == 'fp16':
        enable_fp16 = True
        enable_bf16 = False
    elif args.precision == 'bf16':
        enable_fp16 = False
        enable_bf16 = True

    device = "cpu" if offload else "none"
    zero_opt_dict = {
        "stage": stage,
        "offload_param": {"device": device},
        "offload_optimizer": {"device": device},
        "stage3_param_persistence_threshold": 1e4,
        "stage3_max_live_parameters": 3e7,
        "stage3_prefetch_bucket_size": 0,
        "memory_efficient_linear": False,
    }
    # GLOBAL_BATCH_SIZE and MICRO_BATCH_SIZE are module-level defaults in ds_utils.py;
    # main.py overrides both batch-size fields after calling this function (see Initialization below).
    output = {
        "train_batch_size": GLOBAL_BATCH_SIZE,
        "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE,
        "steps_per_print": 10,
        "zero_optimization": zero_opt_dict,
        "zero_allow_untested_optimizer": True,
        "zero_force_ds_cpu_optimizer": False,
        "fp16": {"enabled": enable_fp16, "loss_scale_window": 100},
        "bf16": {"enabled": enable_bf16},
        "gradient_clipping": 1.0,
        "prescale_gradients": False,
        "wall_clock_breakdown": False,
        "hybrid_engine": {
            "enabled": enable_hybrid_engine,
            "max_out_tokens": max_out_tokens,
            "inference_tp_size": inference_tp_size,
            "release_inference_cache": release_inference_cache,
            "pin_parameters": pin_parameters,
            "tp_gather_partition_size": tp_gather_partition_size,
        },
    }
    if args.enable_tensorboard:
        output.update({"tensorboard": {
            "enabled": True,
            "output_path": args.output_dir,
            "job_name": 'tb_logging'
        }})
    return output
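
A minimal illustrative call (a sketch only: it assumes the repo's utils/ directory is on PYTHONPATH and that GLOBAL_BATCH_SIZE / MICRO_BATCH_SIZE are defined at module level in ds_utils.py):

from argparse import Namespace
from utils.ds_utils import get_train_ds_config

args = Namespace(precision="bf16", enable_tensorboard=False, output_dir="./output")
ds_config = get_train_ds_config(args, offload=False, stage=2)

assert ds_config["bf16"]["enabled"] and not ds_config["fp16"]["enabled"]
assert ds_config["zero_optimization"]["stage"] == 2
assert ds_config["zero_optimization"]["offload_param"]["device"] == "none"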

Optimizer Group Configuration (utils/utils.py, Lines 78-130)

def get_optimizer_grouped_parameters(model, weight_decay,
                                     no_decay_name_list=[
                                         "bias", "LayerNorm.weight"
                                     ],
                                     small_learning_rate_list=["embed"],
                                     small_lr=1e-4):
    optimizer_grouped_parameters = [
        {
            # Group 1: Normal LR + weight decay (non-embedding, non-bias/LN)
            "params": [
                p for n, p in model.named_parameters()
                if (not any(nd in n for nd in no_decay_name_list)
                    and not any(nd in n for nd in small_learning_rate_list)
                    and p.requires_grad)
            ],
            "weight_decay": weight_decay,
        },
        {
            # Group 2: Normal LR + no weight decay (bias/LayerNorm, non-embedding)
            "params": [
                p for n, p in model.named_parameters()
                if (any(nd in n for nd in no_decay_name_list)
                    and not any(nd in n for nd in small_learning_rate_list)
                    and p.requires_grad)
            ],
            "weight_decay": 0.0,
        },
        {
            # Group 3: Small LR + weight decay (embedding params)
            "params": [
                p for n, p in model.named_parameters()
                if (not any(nd in n for nd in no_decay_name_list)
                    and any(nd in n for nd in small_learning_rate_list)
                    and p.requires_grad)
            ],
            "weight_decay": weight_decay,
            "lr": small_lr,
        },
        {
            # Group 4: Small LR + no weight decay (embedding bias/LN)
            "params": [
                p for n, p in model.named_parameters()
                if (any(nd in n for nd in no_decay_name_list)
                    and any(nd in n for nd in small_learning_rate_list)
                    and p.requires_grad)
            ],
            "weight_decay": 0.0,
            "lr": small_lr,
        },
    ]
    return optimizer_grouped_parameters
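
An illustrative sanity check on a toy module (not part of the repository; assumes get_optimizer_grouped_parameters is importable from utils/utils.py) showing how the name filters route parameters:

import torch.nn as nn
from utils.utils import get_optimizer_grouped_parameters

toy = nn.Module()
toy.embed_tokens = nn.Embedding(10, 4)  # name contains "embed" -> small-LR groups
toy.proj = nn.Linear(4, 4)
toy.LayerNorm = nn.LayerNorm(4)         # "LayerNorm.weight" / "bias" -> no-decay groups

groups = get_optimizer_grouped_parameters(toy, weight_decay=0.01, small_lr=1e-5)
print([len(g["params"]) for g in groups])
# [1, 3, 1, 0]: proj.weight | proj.bias + LayerNorm.* | embed_tokens.weight | (empty)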

DeepSpeed Initialization (training/main.py, Lines 316-342)

# Configure DeepSpeed
ds_config = get_train_ds_config(args, offload=False, stage=args.zero_stage)
ds_config['train_micro_batch_size_per_gpu'] = args.per_device_train_batch_size
ds_config['train_batch_size'] = (args.per_device_train_batch_size
                                  * torch.distributed.get_world_size()
                                  * args.gradient_accumulation_steps)

# Create optimizer with grouped parameters
optimizer_grouped_parameters = get_optimizer_grouped_parameters(
    model, args.weight_decay,
    small_lr=args.learning_rate_pretraining_components)

optimizer = AdamW(optimizer_grouped_parameters,
                  lr=args.learning_rate,
                  betas=(0.9, 0.95))

# Configure learning rate scheduler
num_update_steps_per_epoch = math.ceil(
    len(train_dataloader) / args.gradient_accumulation_steps)
# A warmup value <= 1 is treated as a fraction of the total update steps;
# larger values are taken as an absolute step count.
if args.num_warmup_steps <= 1:
    args.num_warmup_steps = int(
        args.num_warmup_steps * args.num_train_epochs * num_update_steps_per_epoch)
else:
    args.num_warmup_steps = int(args.num_warmup_steps)

lr_scheduler = get_scheduler(
    name=args.lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=args.num_warmup_steps,
    num_training_steps=args.num_train_epochs * num_update_steps_per_epoch,
)

# Initialize DeepSpeed engine
model, optimizer, _, lr_scheduler = deepspeed.initialize(
    model=model,
    optimizer=optimizer,
    args=args,
    config=ds_config,
    lr_scheduler=lr_scheduler,
    dist_init_required=True)
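
Once initialized, the returned engine is typically driven as below (an illustrative sketch; the real loop in training/main.py also handles image tensors, attention masks, and logging):

for step, batch in enumerate(train_dataloader):
    batch = {k: v.to(model.device) for k, v in batch.items()}  # assumes all batch values are tensors
    loss = model(**batch)[0]   # assumes the model returns the loss as its first output
    model.backward(loss)       # engine handles loss scaling and ZeRO gradient reduction
    model.step()               # optimizer + lr_scheduler step; respects gradient accumulation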

I/O Contract

deepspeed.initialize()

| Direction | Parameter | Type | Description |
|---|---|---|---|
| Input | model | DeepSpeedViLModel | The composed multimodal model |
| Input | optimizer | AdamW | Pre-configured optimizer with parameter groups |
| Input | args | argparse.Namespace | Must contain local_rank for device placement |
| Input | config | dict | DeepSpeed configuration from get_train_ds_config() |
| Input | lr_scheduler | LRScheduler | Learning rate scheduler |
| Input | dist_init_required | bool | True to let DeepSpeed handle distributed init |
| Output | model | DeepSpeedEngine | Wrapped model with ZeRO, mixed precision, etc. |
| Output | optimizer | DeepSpeedOptimizer | Wrapped optimizer |
| Output | _ | -- | Unused (data loader slot, not used here) |
| Output | lr_scheduler | LRScheduler | Potentially wrapped scheduler |

get_train_ds_config()

| Direction | Parameter | Type | Description |
|---|---|---|---|
| Input | args | argparse.Namespace | Must contain precision, enable_tensorboard, output_dir |
| Input | offload | bool | Whether to offload to CPU |
| Input | stage | int | ZeRO stage (0, 1, 2, or 3) |
| Output | (return) | dict | DeepSpeed JSON config dictionary |

get_optimizer_grouped_parameters()

| Direction | Parameter | Type | Description |
|---|---|---|---|
| Input | model | nn.Module | Model with named parameters |
| Input | weight_decay | float | Weight decay for applicable groups |
| Input | no_decay_name_list | list[str] | Parameter name substrings exempt from weight decay (default: ["bias", "LayerNorm.weight"]) |
| Input | small_learning_rate_list | list[str] | Parameter name substrings receiving the small LR (default: ["embed"]) |
| Input | small_lr | float | Learning rate for the small-LR groups |
| Output | (return) | list[dict] | Four parameter group dicts for the optimizer |

Optimizer Group Summary

| Group | Weight Decay | Learning Rate | Name Filter | Typical Parameters |
|---|---|---|---|---|
| 1 | args.weight_decay | args.learning_rate | Not bias/LN, not embed | Projection weights, LoRA weights |
| 2 | 0.0 | args.learning_rate | bias or LN, not embed | Projection bias, LayerNorm |
| 3 | args.weight_decay | args.learning_rate_pretraining_components | Not bias/LN, contains embed | Language embedding weight |
| 4 | 0.0 | args.learning_rate_pretraining_components | bias or LN, contains embed | Language embedding bias |
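
A small helper, illustrative only, that mirrors the routing above for a single parameter name (the example names are hypothetical):

def classify(name,
             no_decay=("bias", "LayerNorm.weight"),
             small=("embed",)):
    nd = any(s in name for s in no_decay)
    sm = any(s in name for s in small)
    return {(False, False): 1, (True, False): 2,
            (False, True): 3, (True, True): 4}[(nd, sm)]

print(classify("vis_proj.weight"))                         # group 1
print(classify("lang_decoder.model.embed_tokens.weight"))  # group 3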

Usage Example

Full Training Launch

deepspeed --num_gpus 8 training/main.py \
    --lm_model_name_or_path meta-llama/Llama-2-7b-hf \
    --vision_model_name_or_path openai/clip-vit-large-patch14 \
    --vis_proj perceiver \
    --zero_stage 2 \
    --precision bf16 \
    --learning_rate 1e-3 \
    --learning_rate_pretraining_components 1e-5 \
    --weight_decay 0.01 \
    --num_train_epochs 6 \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 4 \
    --num_warmup_steps 0.03 \
    --lr_scheduler_type cosine \
    --lang_lora_dim 16 \
    --lang_lora_module_name model.layers. \
    --only_optimize_lora \
    --dataset_names llava coco_caption \
    --dataset_samples all all \
    --output_dir ./output/ \
    --enable_tensorboard
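
With this launch (8 GPUs, per-device batch size 2, gradient accumulation 4), the batch-size overrides applied in training/main.py resolve to:

ds_config['train_micro_batch_size_per_gpu'] = 2
ds_config['train_batch_size'] = 2 * 8 * 4  # per-device batch x world size x grad accumulation = 64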

Checkpoint Resumption

# In training/main.py - automatic checkpoint detection
if os.path.exists(os.path.join(args.output_dir, 'latest')):
    _, client_state = model.load_checkpoint(args.output_dir)
    start_epoch = client_state['epoch']
    best_loss = client_state['best_loss']
    random.setstate(client_state['random_rng_state'])
    np.random.set_state(client_state['np_rng_state'])
    torch.set_rng_state(client_state['torch_rng_state'])
    torch.cuda.set_rng_state(client_state['torch_cuda_rng_state'])
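
The matching save side is not excerpted on this page; a sketch using DeepSpeed's save_checkpoint API with the same client_state keys as the resume code above:

client_state = {
    'epoch': epoch,
    'best_loss': best_loss,
    'random_rng_state': random.getstate(),
    'np_rng_state': np.random.get_state(),
    'torch_rng_state': torch.get_rng_state(),
    'torch_cuda_rng_state': torch.cuda.get_rng_state(),
}
model.save_checkpoint(args.output_dir, client_state=client_state)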

DeepSpeed Config Key Parameters

| Parameter | Default | Description |
|---|---|---|
| zero_optimization.stage | 0 | ZeRO stage (0, 1, 2, 3) |
| fp16.enabled | True | Enable FP16 mixed precision |
| bf16.enabled | False | Enable BF16 mixed precision |
| gradient_clipping | 1.0 | Maximum gradient norm |
| stage3_param_persistence_threshold | 1e4 | Params smaller than this stay replicated in ZeRO-3 |
| stage3_max_live_parameters | 3e7 | Max parameters gathered at once in ZeRO-3 |
| zero_allow_untested_optimizer | True | Allow AdamW with ZeRO |
| prescale_gradients | False | Whether to prescale gradients |

Dependencies

  • deepspeed -- Core DeepSpeed library for distributed training
  • transformers.AdamW -- AdamW optimizer implementation
  • transformers.get_scheduler -- Learning rate scheduler factory
  • utils.ds_utils.get_train_ds_config -- DeepSpeed config builder
  • utils.utils.get_optimizer_grouped_parameters -- Parameter group builder
