Jump to content

Connect SuperML | Leeroopedia MCP: Equip your AI agents with best practices, code verification, and debugging knowledge. Powered by Leeroo — building Organizational Superintelligence. Contact us at founders@leeroo.com.

Implementation:Ggml org Llama cpp Context Header

From Leeroopedia
Revision as of 12:39, 16 February 2026 by Admin (talk | contribs) (Auto-imported from implementations/Ggml_org_Llama_cpp_Context_Header.md)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Knowledge Sources
Domains Inference, Context
Last Updated 2026-02-15 00:00 GMT

Overview

Declares the `llama_context` struct and the `llama_memory_breakdown_data` helper, defining the full interface for the llama.cpp inference context.

Description

Exposes methods for context lifecycle (constructor, destructor, `sched_reserve`), inference (`encode`, `decode`, `process_ubatch`), output access (logits, embeddings, sampled tokens/probs/candidates), state serialization (save/load to buffers and files), memory management (KV cache access, memory updates), adapter management (LoRA, control vectors), and training support (`opt_init`, `opt_epoch`). The `llama_memory_breakdown_data` struct tracks physical memory allocation across model, context, and compute buffers.

Usage

This is a core header that defines the central runtime object through which all inference, sampling, and state management operations flow. Include it when implementing or extending llama.cpp internal functionality that interacts with the inference context.

Code Reference

Source Location

Signature

// Breakdown of memory allocation by purpose: model, context, and compute
// buffers. Every counter is zero-initialized.
// NOTE(review): the counters are size_t and presumably hold byte counts — confirm.
struct llama_memory_breakdown_data {
    // Memory attributed to the model.
    size_t model   = 0;
    // Memory attributed to the context.
    size_t context = 0;
    // Memory attributed to compute buffers.
    size_t compute = 0;
    // Only declared here; presumably returns model + context + compute —
    // confirm against the definition.
    size_t total() const;
};

// Central runtime object of llama.cpp inference: inference, output access,
// state serialization, memory access and adapter management all go through it.
//
// NOTE(review): this listing is an abridged signature. A `...` inside a
// parameter list appears to stand for parameters elided from the listing, not
// for a C-style variadic argument — check the real header for the full list.
struct llama_context {
    // Lifecycle
    // Builds a context for `model` using the creation parameters in `params`
    // (context size, batch size, thread count, etc.).
    llama_context(const llama_model & model, llama_context_params params);
    ~llama_context();
    // NOTE(review): presumably (re)reserves resources for the backend
    // scheduler returned by get_sched() — confirm.
    void sched_reserve();
    // NOTE(review): presumably waits for pending backend work to finish — confirm.
    void synchronize();

    // Model and parameter access
    // Read-only accessors; the returned references are to objects not owned
    // by the caller.
    const llama_model   & get_model() const;
    const llama_cparams & get_cparams() const;
    ggml_backend_sched_t get_sched() const;

    // Dimension queries
    // n_ctx() is the current context window size. The remaining getters are
    // named after the corresponding context settings; their exact semantics
    // are not shown in this listing.
    uint32_t n_ctx() const;
    uint32_t n_ctx_seq() const;
    uint32_t n_batch() const;
    uint32_t n_ubatch() const;
    uint32_t n_seq_max() const;
    uint32_t n_threads() const;
    uint32_t n_threads_batch() const;

    // Memory management
    // Handle to the context's memory (KV cache) interface.
    llama_memory_t get_memory() const;
    // NOTE(review): meaning of the bool result and of `optimize` is not
    // visible here — confirm against the implementation.
    bool memory_update(bool optimize);
    // NOTE(review): a pooling-type query rather than a memory operation; it is
    // grouped under this heading only in this listing.
    enum llama_pooling_type pooling_type() const;

    // Output access
    // Pointer to the output logits buffer for the last decoded batch.
    float * get_logits();
    // Logits for output index `i`.
    // NOTE(review): valid index range and failure behaviour are not shown here.
    float * get_logits_ith(int32_t i);
    // Pointer to the output embeddings buffer (for embedding models).
    float * get_embeddings();
    // Embeddings for output index `i` / for sequence `seq_id`.
    // NOTE(review): behaviour when no embeddings are available is not shown here.
    float * get_embeddings_ith(int32_t i);
    float * get_embeddings_seq(llama_seq_id seq_id);

    // Inference
    // Run the encoder / decoder on a token batch. Both return an int status
    // code whose values are not documented in this listing.
    int encode(llama_batch & batch);
    int decode(llama_batch & batch);

    // State serialization
    // Save/load the context state to/from caller-provided buffers and files.
    // NOTE(review): the size_t results are presumably byte counts — confirm.
    size_t state_get_size() const;
    size_t state_get_data(uint8_t * dst, size_t size) const;
    size_t state_set_data(const uint8_t * src, size_t size);
    bool   state_save_file(const char * filepath, ...);
    bool   state_load_file(const char * filepath, ...);

    // Adapter management
    // Attach a LoRA adapter with the given scaling factor, detach one adapter,
    // or detach all of them.
    // NOTE(review): meaning of the int32_t return values is not shown here.
    int32_t lora_adapter_set(llama_adapter_lora * adapter, float scale);
    int32_t lora_adapter_remove(llama_adapter_lora * adapter);
    void    lora_adapter_clear();
    // Applies a control vector supplied through `data`.
    int32_t set_cvec(float * data, ...);
};

Import

#pragma once
#include "llama.h"
#include "llama-cparams.h"
#include "llama-graph.h"
#include "llama-adapter.h"
#include "llama-impl.h"
#include "ggml-cpp.h"
#include "ggml-opt.h"
#include <map>
#include <vector>

I/O Contract

Inputs

Name Type Required Description
model const llama_model & Yes Reference to the loaded model containing weights and architecture info
params llama_context_params Yes Context creation parameters (context size, batch size, thread count, etc.)
batch llama_batch & Yes Token batch for encode/decode operations (passed by reference)
adapter llama_adapter_lora* No LoRA adapter for dynamic weight modification
scale float No LoRA adapter scaling factor

Outputs

Name Type Description
logits float* Pointer to output logits buffer for the last decoded batch
embeddings float* Pointer to output embeddings buffer (for embedding models)
n_ctx uint32_t Current context window size
memory llama_memory_t Handle to the context's memory (KV cache) interface
state_data uint8_t* Serialized context state for save/load operations

Usage Examples

// Create a context from a loaded model
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = 4096;
ctx_params.n_batch = 512;
ctx_params.n_threads = 4;

llama_context ctx(model, ctx_params);

// Decode a batch of tokens
llama_batch batch = llama_batch_init(512, 0, 1);
// ... fill batch with tokens ...
int result = ctx.decode(batch);

// Access output logits
float * logits = ctx.get_logits_ith(batch.n_tokens - 1);

// Get memory usage
auto memory = ctx.get_memory();
uint32_t ctx_size = ctx.n_ctx();

Related Pages

Page Connections

Double-click a node to navigate. Hold to expand connections.
Principle
Implementation
Heuristic
Environment