# Token Management

Unified parameter vocabulary with budget validation across all LLM providers.
## 🎯 Key Benefits

- **Unified Parameters**: The same token parameters work across all providers, so no provider-specific configuration is needed.
- **Budget Validation**: Automatic token estimation and cost calculation, with warnings for budget overruns.
- **Auto-Calculation**: Smart input/output token allocation based on the context window and your requirements.
## Unified Token Parameters

AbstractCore provides a consistent token parameter vocabulary that works across all providers:
```python
from abstractcore import create_llm

# Unified token parameters work across ALL providers
llm = create_llm(
    "anthropic",
    model="claude-3-5-haiku-latest",
    max_tokens=32000,         # Context window (input + output)
    max_output_tokens=8000,   # Maximum output tokens
    max_input_tokens=24000    # Maximum input tokens (auto-calculated if not set)
)

# Same parameters work with any provider
openai_llm = create_llm(
    "openai",
    model="gpt-4o-mini",
    max_tokens=16384,         # Context window
    max_output_tokens=4000,   # Output limit
    max_input_tokens=12384    # Input limit (auto-calculated: 16384 - 4000)
)

# Local providers too
ollama_llm = create_llm(
    "ollama",
    model="qwen3-coder:30b",
    max_tokens=8192,          # Context window
    max_output_tokens=2048    # Output limit (input auto-calculated: 6144)
)
```
### Parameter Definitions

- `max_tokens`: total context window (input + output combined)
- `max_output_tokens`: maximum tokens for the generated response
- `max_input_tokens`: maximum tokens for the input; auto-calculated as `max_tokens - max_output_tokens` when not set (see the sketch below)
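The auto-calculation shown in the examples above is plain arithmetic: whatever part of the context window is not reserved for output remains available for input. A minimal sketch of that logic in plain Python (not AbstractCore's internal code, which may also apply safety margins):

```python
def allocate_input_budget(max_tokens: int, max_output_tokens: int) -> int:
    """Input budget left after reserving output tokens from the context window.

    Mirrors the comments in the examples above (16384 - 4000 = 12384,
    8192 - 2048 = 6144); the library itself may adjust this further.
    """
    if max_output_tokens >= max_tokens:
        raise ValueError("max_output_tokens must be smaller than max_tokens")
    return max_tokens - max_output_tokens


print(allocate_input_budget(16384, 4000))  # 12384
print(allocate_input_budget(8192, 2048))   # 6144
```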
## Token Estimation & Validation

Estimate token usage before making requests and validate it against your budget:
```python
from abstractcore.utils.token_utils import estimate_tokens, calculate_token_budget

# Estimate tokens for input text
text = "Your input text here..."
estimated = estimate_tokens(text, model="claude-3-5-haiku-latest")
print(f"Estimated tokens: {estimated}")

# Calculate optimal token budget
budget = calculate_token_budget(
    context_window=32000,
    target_output=8000,
    safety_margin=0.1  # 10% safety margin
)

print(f"Recommended input limit: {budget.max_input_tokens}")
print(f"Recommended output limit: {budget.max_output_tokens}")

# Validate before generation
if estimated > budget.max_input_tokens:
    print(f"Warning: Input too long ({estimated} > {budget.max_input_tokens})")
    # Truncate or split input
```
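If you only need a coarse estimate without calling the estimator (for example in a lightweight pre-check), a widely used rule of thumb for English text is roughly four characters per token. This heuristic is an approximation, not part of AbstractCore:

```python
def rough_token_estimate(text: str, chars_per_token: float = 4.0) -> int:
    """Coarse token estimate using the ~4-characters-per-token heuristic.

    Prefer estimate_tokens() for model-aware counts; this can be far off for
    code, non-English text, or heavily formatted input.
    """
    return max(1, int(len(text) / chars_per_token))


print(rough_token_estimate("Your input text here..."))  # 5 (23 characters / 4)
```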
## Cost Tracking & Monitoring

Track token usage and costs across all providers:
```python
from abstractcore import create_llm
from abstractcore.events import EventType, on_global

# Cost monitoring with events
def cost_monitor(event):
    usage = event.data.get('usage', {})
    if usage:
        print(f"Input tokens: {usage.get('input_tokens', 0)}")
        print(f"Output tokens: {usage.get('output_tokens', 0)}")
        print(f"Total tokens: {usage.get('total_tokens', 0)}")

        cost = usage.get('cost_usd', 0)
        if cost > 0.10:  # Alert for high-cost requests
            print(f"⚠️ High cost request: ${cost:.4f}")

on_global(EventType.GENERATION_COMPLETED, cost_monitor)

# Generate with cost tracking
llm = create_llm("openai", model="gpt-4o-mini")
response = llm.generate("Write a detailed analysis...")

# Access usage information
print(f"Input tokens: {response.usage.input_tokens}")
print(f"Output tokens: {response.usage.output_tokens}")
print(f"Cost estimate: ${response.usage.cost_usd:.4f}")
```
## Provider-Specific Mapping

AbstractCore automatically maps unified parameters to provider-specific formats:

### Parameter Mapping Table

| Provider    | `max_tokens`                 | `max_output_tokens` | Notes                        |
|-------------|------------------------------|---------------------|------------------------------|
| OpenAI      | → `max_tokens` (output only) | → `max_tokens`      | Input managed by context     |
| Anthropic   | → context window             | → `max_tokens`      | Input truncated if needed    |
| Ollama      | → `num_ctx`                  | → `num_predict`     | Local model parameters       |
| LMStudio    | → `max_tokens` (total)       | → `max_tokens`      | OpenAI-compatible format     |
| MLX         | → `context_length`           | → `max_tokens`      | Apple Silicon optimization   |
| HuggingFace | → `max_length`               | → `max_new_tokens`  | Transformers library format  |
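Conceptually, the table is a lookup from unified names to each provider's native keys. The sketch below only illustrates that idea; it is not AbstractCore's internal implementation, and the dictionary covers just a few rows of the table:

```python
# Hypothetical illustration of the mapping table above; AbstractCore performs
# this translation internally, and its real code differs.
PARAMETER_MAP = {
    "ollama":      {"max_tokens": "num_ctx",        "max_output_tokens": "num_predict"},
    "huggingface": {"max_tokens": "max_length",     "max_output_tokens": "max_new_tokens"},
    "mlx":         {"max_tokens": "context_length", "max_output_tokens": "max_tokens"},
}


def to_provider_params(provider: str, unified: dict) -> dict:
    """Translate unified parameter names into a provider's native keys."""
    mapping = PARAMETER_MAP[provider]
    return {mapping[name]: value for name, value in unified.items() if name in mapping}


print(to_provider_params("ollama", {"max_tokens": 8192, "max_output_tokens": 2048}))
# {'num_ctx': 8192, 'num_predict': 2048}
```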
## Best Practices

### 1. Set Reasonable Limits

```python
# Good: Reasonable limits
llm = create_llm(
    "openai",
    model="gpt-4o-mini",
    max_tokens=16384,        # Model's context window
    max_output_tokens=4000,  # 25% for output
    max_input_tokens=12384   # 75% for input
)

# Avoid: Unrealistic limits
# max_tokens=1000000  # Exceeds model capability
```
### 2. Use Token Estimation

```python
# Always estimate before generation
def safe_generate(llm, prompt):
    estimated = estimate_tokens(prompt)

    if estimated > llm.max_input_tokens:
        # Truncate or split the prompt
        prompt = truncate_to_tokens(prompt, llm.max_input_tokens)

    return llm.generate(prompt)
```
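`truncate_to_tokens` is used above but never defined in this guide. One possible implementation built only on `estimate_tokens`, shown as a sketch rather than a library function:

```python
def truncate_to_tokens(prompt: str, max_input_tokens: int) -> str:
    """Trim a prompt until its estimated token count fits the input budget.

    Cuts the text proportionally and re-estimates; a production version would
    prefer cutting on sentence or paragraph boundaries.
    """
    estimated = estimate_tokens(prompt)
    while estimated > max_input_tokens and prompt:
        keep_ratio = (max_input_tokens / estimated) * 0.95  # small safety buffer
        prompt = prompt[: int(len(prompt) * keep_ratio)]
        estimated = estimate_tokens(prompt)
    return prompt
```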
### 3. Monitor Costs

```python
# Set up cost monitoring
daily_budget = 10.00  # $10 daily budget
current_spend = 0.00

def budget_monitor(event):
    global current_spend
    cost = event.data.get('usage', {}).get('cost_usd', 0)
    current_spend += cost

    if current_spend > daily_budget:
        raise Exception(f"Daily budget exceeded: ${current_spend:.2f}")

on_global(EventType.GENERATION_COMPLETED, budget_monitor)
```
### 4. Handle Truncation Gracefully

```python
# Handle truncated responses
response = llm.generate(prompt)

if response.finish_reason == "length":
    print("⚠️ Response was truncated due to token limit")
    # Consider increasing max_output_tokens
    # or breaking the task into smaller parts
```
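When raising `max_output_tokens` is not an option, another common pattern is to ask the model to continue from where it stopped. The sketch below assumes the response object exposes its generated text as `response.content`, which is an assumption not shown elsewhere in this section:

```python
def generate_with_continuation(llm, prompt, max_rounds=3):
    """Re-prompt for continuations while responses are cut off by the limit.

    Assumes response.content holds the generated text; the continuation
    prompt wording is illustrative and should be tuned per use case.
    """
    response = llm.generate(prompt)
    full_text = response.content
    for _ in range(max_rounds):
        if response.finish_reason != "length":
            break
        response = llm.generate(
            f"{prompt}\n\nHere is what you have written so far:\n{full_text}\n\n"
            "Continue exactly where you left off."
        )
        full_text += response.content
    return full_text
```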
## Advanced Features

### Dynamic Token Allocation

```python
from abstractcore.utils.token_utils import dynamic_token_allocation

# Automatically adjust token limits based on content
def adaptive_generate(llm, prompt, desired_output_length="medium"):
    # Estimate input tokens
    input_tokens = estimate_tokens(prompt)

    # Calculate optimal allocation
    allocation = dynamic_token_allocation(
        context_window=llm.max_tokens,
        input_tokens=input_tokens,
        output_preference=desired_output_length  # "short", "medium", "long"
    )

    # Update LLM configuration
    llm.max_output_tokens = allocation.output_tokens
    llm.max_input_tokens = allocation.input_tokens

    return llm.generate(prompt)
```
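A minimal usage sketch for the helper above, reusing the provider setup shown earlier in this guide; the `"long"` preference simply requests a larger share of the context window for the output:

```python
llm = create_llm("anthropic", model="claude-3-5-haiku-latest", max_tokens=32000)

response = adaptive_generate(llm, "Summarize the following report: ...", desired_output_length="long")
print(f"Output tokens used: {response.usage.output_tokens}")
```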
### Batch Token Optimization

```python
# Optimize token usage for batch processing
def batch_process_with_token_optimization(llm, prompts):
    results = []

    for prompt in prompts:
        # Estimate and optimize for each prompt
        estimated = estimate_tokens(prompt)

        if estimated > llm.max_input_tokens:
            # Split large prompts
            chunks = split_prompt_by_tokens(prompt, llm.max_input_tokens)
            for chunk in chunks:
                result = llm.generate(chunk)
                results.append(result)
        else:
            result = llm.generate(prompt)
            results.append(result)

    return results
```
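`split_prompt_by_tokens` is used above but not defined. One possible implementation on top of `estimate_tokens`, splitting on paragraph boundaries (a sketch, not a library function):

```python
def split_prompt_by_tokens(prompt: str, max_input_tokens: int) -> list:
    """Split a prompt into chunks whose estimated token counts fit the budget.

    Splits on blank-line paragraph boundaries; a single oversized paragraph is
    emitted as its own chunk and may still need character-level truncation.
    """
    chunks, current = [], ""
    for paragraph in prompt.split("\n\n"):
        candidate = f"{current}\n\n{paragraph}" if current else paragraph
        if estimate_tokens(candidate) <= max_input_tokens:
            current = candidate
        else:
            if current:
                chunks.append(current)
            current = paragraph
    if current:
        chunks.append(current)
    return chunks
```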