Implementation:Ggml org Llama cpp Context Header
| Knowledge Sources | |
|---|---|
| Domains | Inference, Context |
| Last Updated | 2026-02-15 00:00 GMT |
Overview
Declares the `llama_context` struct and the `llama_memory_breakdown_data` helper, defining the full interface for the llama.cpp inference context.
Description
Exposes methods for context lifecycle (constructor, destructor, `sched_reserve`), inference (`encode`, `decode`, `process_ubatch`), output access (logits, embeddings, sampled tokens/probs/candidates), state serialization (save/load to buffers and files), memory management (KV cache access, memory updates), adapter management (LoRA, control vectors), and training support (`opt_init`, `opt_epoch`). The `llama_memory_breakdown_data` struct tracks physical memory allocation across model, context, and compute buffers.
Usage
This is a core header that defines the central runtime object through which all inference, sampling, and state management operations flow. Include it when implementing or extending llama.cpp internal functionality that interacts with the inference context.
Code Reference
Source Location
- Repository: Ggml_org_Llama_cpp
- File: src/llama-context.h
- Lines: 1-356
Signature
// Per-category memory accounting helper. Per this page's description, it tracks
// physical memory allocation across model, context, and compute buffers.
// NOTE(review): values are presumably byte counts (fields are size_t) — confirm
// against src/llama-context.h.
struct llama_memory_breakdown_data {
size_t model = 0; // memory attributed to the model; defaults to 0
size_t context = 0; // memory attributed to the context; defaults to 0
size_t compute = 0; // memory attributed to compute buffers; defaults to 0
// Only the declaration is shown in this excerpt; presumably returns
// model + context + compute — TODO confirm against the header.
size_t total() const;
};
// Inference context: the runtime object exposing context lifecycle, inference,
// output access, state serialization, memory management and adapter management.
//
// NOTE: this is an abridged excerpt of src/llama-context.h, not the full
// declaration. Data members and several methods are omitted, and the `...` in
// some parameter lists below appears to stand for elided parameters rather than
// C-style variadic arguments — consult the header for the exact signatures.
struct llama_context {
// Lifecycle
// Builds a context for `model` using the given creation parameters.
llama_context(const llama_model & model, llama_context_params params);
~llama_context();
// NOTE(review): presumably reserves backend-scheduler resources — behavior not shown here.
void sched_reserve();
// NOTE(review): presumably blocks until pending backend work completes — confirm in the implementation.
void synchronize();
// Model and parameter access
// Read-only accessors; the returned references/handle are not owned by the caller.
const llama_model & get_model() const;
const llama_cparams & get_cparams() const;
ggml_backend_sched_t get_sched() const;
// Dimension queries
// Each returns the corresponding configured size/count as an unsigned 32-bit value.
uint32_t n_ctx() const;
uint32_t n_ctx_seq() const;
uint32_t n_batch() const;
uint32_t n_ubatch() const;
uint32_t n_seq_max() const;
uint32_t n_threads() const;
uint32_t n_threads_batch() const;
// Memory management
// Handle to the context's memory (KV cache) interface.
llama_memory_t get_memory() const;
// NOTE(review): the meaning of the bool result and of `optimize` is not shown
// in this excerpt — confirm against the header before relying on it.
bool memory_update(bool optimize);
// Pooling configuration query (not a memory operation, despite its position
// in this group).
enum llama_pooling_type pooling_type() const;
// Output access
// Pointers into the context's output buffers; the *_ith / *_seq variants select
// a single output by index or by sequence id.
// NOTE(review): buffer lifetime and index conventions are not shown here.
float * get_logits();
float * get_logits_ith(int32_t i);
float * get_embeddings();
float * get_embeddings_ith(int32_t i);
float * get_embeddings_seq(llama_seq_id seq_id);
// Inference
// Both return an int status; the meaning of specific values is not shown in
// this excerpt.
int encode(llama_batch & batch);
int decode(llama_batch & batch);
// State serialization
// state_get_data writes into the caller-provided `dst` of capacity `size`;
// state_set_data reads from `src`.
// NOTE(review): the size_t results are presumably byte counts — confirm.
size_t state_get_size() const;
size_t state_get_data(uint8_t * dst, size_t size) const;
size_t state_set_data(const uint8_t * src, size_t size);
// File-based save/load; remaining parameters are elided in this excerpt.
bool state_save_file(const char * filepath, ...);
bool state_load_file(const char * filepath, ...);
// Adapter management
// LoRA adapters are attached with a scale, removed individually, or cleared
// all at once; set_cvec applies a control vector (remaining parameters elided).
// NOTE(review): int32_t results are presumably status codes — confirm.
int32_t lora_adapter_set(llama_adapter_lora * adapter, float scale);
int32_t lora_adapter_remove(llama_adapter_lora * adapter);
void lora_adapter_clear();
int32_t set_cvec(float * data, ...);
};
Import
#pragma once
#include "llama.h"
#include "llama-cparams.h"
#include "llama-graph.h"
#include "llama-adapter.h"
#include "llama-impl.h"
#include "ggml-cpp.h"
#include "ggml-opt.h"
#include <map>
#include <vector>
I/O Contract
Inputs
| Name | Type | Required | Description |
|---|---|---|---|
| model | const llama_model & | Yes | Reference to the loaded model containing weights and architecture info |
| params | llama_context_params | Yes | Context creation parameters (context size, batch size, thread count, etc.) |
| batch | llama_batch | Yes | Token batch for encode/decode operations |
| adapter | llama_adapter_lora* | No | LoRA adapter for dynamic weight modification |
| scale | float | No | LoRA adapter scaling factor |
Outputs
| Name | Type | Description |
|---|---|---|
| logits | float* | Pointer to output logits buffer for the last decoded batch |
| embeddings | float* | Pointer to output embeddings buffer (for embedding models) |
| n_ctx | uint32_t | Current context window size |
| memory | llama_memory_t | Handle to the context's memory (KV cache) interface |
| state_data | uint8_t* | Serialized context state for save/load operations |
Usage Examples
// Create a context from a loaded model
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = 4096;
ctx_params.n_batch = 512;
ctx_params.n_threads = 4;
llama_context ctx(model, ctx_params);
// Decode a batch of tokens
llama_batch batch = llama_batch_init(512, 0, 1);
// ... fill batch with tokens ...
int result = ctx.decode(batch);
// Access output logits
float * logits = ctx.get_logits_ith(batch.n_tokens - 1);
// Get memory usage
auto memory = ctx.get_memory();
uint32_t ctx_size = ctx.n_ctx();