Jump to content

Connect SuperML | Leeroopedia MCP: Equip your AI agents with best practices, code verification, and debugging knowledge. Powered by Leeroo — building Organizational Superintelligence. Contact us at founders@leeroo.com.

Implementation:Togethercomputer Together python ChatCompletions Create

From Leeroopedia
Attribute Value
Implementation Name ChatCompletions_Create
Overview Synchronous and asynchronous methods for sending chat completion requests to Together AI models.
Source File src/together/resources/chat/completions.py
Lines L16-155 (ChatCompletions.create), L158-297 (AsyncChatCompletions.create)
Domain NLP, API_Client, Inference
Repository togethercomputer/together-python
Last Updated 2026-02-15 16:00 GMT

Code Reference

Synchronous (L20-155)

class ChatCompletions:
    def __init__(self, client: TogetherClient) -> None:
        self._client = client

    def create(
        self,
        *,
        messages: List[Dict[str, Any]],
        model: str,
        max_tokens: int | None = None,
        stop: List[str] | None = None,
        temperature: float | None = None,
        top_p: float | None = None,
        top_k: int | None = None,
        repetition_penalty: float | None = None,
        presence_penalty: float | None = None,
        frequency_penalty: float | None = None,
        min_p: float | None = None,
        logit_bias: Dict[str, float] | None = None,
        seed: int | None = None,
        stream: bool = False,
        logprobs: int | None = None,
        echo: bool | None = None,
        n: int | None = None,
        safety_model: str | None = None,
        response_format: Dict[str, Any] | None = None,
        tools: List[Dict[str, Any]] | None = None,
        tool_choice: str | Dict[str, str | Dict[str, str]] | None = None,
        **kwargs: Any,
    ) -> ChatCompletionResponse | Iterator[ChatCompletionChunk]:

Asynchronous (L162-297)

class AsyncChatCompletions:
    def __init__(self, client: TogetherClient) -> None:
        self._client = client

    async def create(
        self,
        *,
        messages: List[Dict[str, str]],
        model: str,
        max_tokens: int | None = None,
        stop: List[str] | None = None,
        temperature: float | None = None,
        top_p: float | None = None,
        top_k: int | None = None,
        repetition_penalty: float | None = None,
        presence_penalty: float | None = None,
        frequency_penalty: float | None = None,
        min_p: float | None = None,
        logit_bias: Dict[str, float] | None = None,
        seed: int | None = None,
        stream: bool = False,
        logprobs: int | None = None,
        echo: bool | None = None,
        n: int | None = None,
        safety_model: str | None = None,
        response_format: Dict[str, Any] | None = None,
        tools: Dict[str, str | Dict[str, str | Dict[str, Any]]] | None = None,
        tool_choice: str | Dict[str, str | Dict[str, str]] | None = None,
        **kwargs: Any,
    ) -> AsyncGenerator[ChatCompletionChunk, None] | ChatCompletionResponse:

Import

from together import Together

client = Together()
response = client.chat.completions.create(...)

For async usage:

from together import AsyncTogether

client = AsyncTogether()
response = await client.chat.completions.create(...)

API Signature

client.chat.completions.create(
    *,
    messages: List[Dict[str, Any]],        # Required: conversation messages
    model: str,                             # Required: model identifier
    max_tokens: int | None = None,          # Max tokens to generate
    stop: List[str] | None = None,          # Stop sequences
    temperature: float | None = None,       # Sampling temperature
    top_p: float | None = None,             # Nucleus sampling threshold
    top_k: int | None = None,               # Top-k sampling limit
    repetition_penalty: float | None = None,# Repetition penalty factor
    presence_penalty: float | None = None,  # Presence penalty [-2, 2]
    frequency_penalty: float | None = None, # Frequency penalty [-2, 2]
    min_p: float | None = None,             # Minimum probability [0, 1]
    logit_bias: Dict[str, float] | None = None, # Token logit adjustments [-100, 100]
    seed: int | None = None,                # Reproducibility seed
    stream: bool = False,                   # Enable SSE streaming
    logprobs: int | None = None,            # Number of top logprobs to return
    echo: bool | None = None,               # Echo prompt in output
    n: int | None = None,                   # Number of completions
    safety_model: str | None = None,        # Moderation model
    response_format: Dict[str, Any] | None = None, # Output format constraint
    tools: List[Dict[str, Any]] | None = None,     # Available tool definitions
    tool_choice: str | Dict[str, str | Dict[str, str]] | None = None, # Tool selection
    **kwargs: Any,
) -> ChatCompletionResponse | Iterator[ChatCompletionChunk]

I/O Contract

Required Input Parameters

Parameter Type Description
messages List[Dict[str, Any]] Ordered list of conversation messages, each with role and content keys.
model str Model identifier string (e.g., "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo").

Optional Input Parameters

Parameter Type Default Description
max_tokens int | None None (server default: 512) Maximum number of tokens to generate.
stop List[str] | None None List of stop sequences that terminate generation.
temperature float | None None Sampling temperature controlling randomness.
top_p float | None None Nucleus sampling cumulative probability threshold.
top_k int | None None Top-k token selection limit.
repetition_penalty float | None None Penalty for repeated sequences (higher = less repetition).
presence_penalty float | None None Penalty based on token presence in text [-2, 2].
frequency_penalty float | None None Penalty based on token frequency in text [-2, 2].
min_p float | None None Minimum probability threshold [0, 1].
logit_bias Dict[str, float] | None None Token logit adjustments [-100, 100].
seed int | None None Seed for reproducible generation.
stream bool False Enable Server-Sent Events streaming.
logprobs int | None None Number of top-k logprobs to return per token.
echo bool | None None Echo prompt in output (useful with logprobs).
n int | None None Number of completions to generate.
safety_model str | None None Moderation model name for content filtering.
response_format Dict[str, Any] | None None Output format constraint (json_object, json_schema, or regex).
tools List[Dict[str, Any]] | None None List of tool/function definitions the model may call.
tool_choice str | Dict[str, str | Dict[str, str]] | None None Controls tool selection: "auto", "required", or specific function.

Output

Condition Return Type Description
stream=False (default) ChatCompletionResponse Complete response with choices, usage, and metadata.
stream=True (sync) Iterator[ChatCompletionChunk] Generator yielding incremental response chunks.
stream=True (async) AsyncGenerator[ChatCompletionChunk, None] Async generator yielding incremental response chunks.

Usage Examples

Basic Chat Completion

from together import Together

client = Together()

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Explain quantum computing in one sentence."},
    ],
    max_tokens=100,
    temperature=0.7,
    top_p=0.9,
)

print(response.choices[0].message.content)

Streaming Response

from together import Together

client = Together()

stream = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    messages=[{"role": "user", "content": "Write a haiku about Python."}],
    stream=True,
)

for chunk in stream:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()

JSON Mode (Structured Output)

from together import Together

client = Together()

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    messages=[
        {"role": "system", "content": "Output valid JSON only."},
        {"role": "user", "content": "List three programming languages with their year of creation."},
    ],
    response_format={"type": "json_object"},
)

print(response.choices[0].message.content)

Function/Tool Calling

from together import Together

client = Together()

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location.",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City name"},
                },
                "required": ["location"],
            },
        },
    }
]

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
    tools=tools,
    tool_choice="auto",
)

# Check if the model made a tool call
choice = response.choices[0]
if choice.finish_reason == "tool_calls":
    for tool_call in choice.message.tool_calls:
        print(f"Function: {tool_call.function.name}")
        print(f"Arguments: {tool_call.function.arguments}")

Async Chat Completion

import asyncio
from together import AsyncTogether

async def main():
    client = AsyncTogether()

    response = await client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": "Hello, world!"}],
        max_tokens=50,
    )
    print(response.choices[0].message.content)

asyncio.run(main())

Key Implementation Details

  • The method is keyword-only (bare * in signature) -- all parameters must be passed by name.
  • Parameters are validated by constructing a ChatCompletionRequest Pydantic model internally, then serialized via .model_dump(exclude_none=True) to omit unset optional fields.
  • A Pydantic model_validator on ChatCompletionRequest warns if repetition_penalty is used alongside presence_penalty or frequency_penalty.
  • The HTTP request is dispatched as a POST to the chat/completions endpoint via the APIRequestor.
  • When stream=True, the response is a generator expression that wraps each SSE line as a ChatCompletionChunk.
  • When stream=False, the response is a TogetherResponse unpacked into a ChatCompletionResponse.
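  • The **kwargs parameter allows passing additional fields not explicitly defined in the signature through to the request payload.
  • The asynchronous signature shown in the Code Reference differs from the synchronous one in two annotations: messages is typed List[Dict[str, str]] (synchronous: List[Dict[str, Any]]) and tools is typed Dict[str, str | Dict[str, str | Dict[str, Any]]] | None (synchronous: List[Dict[str, Any]] | None). The API Signature and I/O Contract sections on this page follow the synchronous annotations.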

Related

Page Connections

Double-click a node to navigate. Hold to expand connections.
Principle
Implementation
Heuristic
Environment