Chapter 7: Production Deployment

Cost Optimization Strategies

AI inference costs can grow unexpectedly in production. These techniques can reduce your Anthropic API spend by 50–90% without sacrificing quality.

1. Model Tier Routing

Route simple tasks to Haiku and reserve Sonnet/Opus for complex ones:

import anthropic
from enum import Enum

client = anthropic.Anthropic()

class TaskComplexity(Enum):
    SIMPLE = "simple"
    STANDARD = "standard"
    COMPLEX = "complex"

MODEL_MAP = {
    TaskComplexity.SIMPLE: "claude-haiku-4-5",
    TaskComplexity.STANDARD: "claude-sonnet-4-5",
    TaskComplexity.COMPLEX: "claude-opus-4-5",
}

def classify_complexity(task: str) -> TaskComplexity:
    """Use the cheapest model to classify complexity."""
    response = client.messages.create(
        model="claude-haiku-4-5",
        max_tokens=8,
        system="Classify task complexity. Reply with ONLY: simple, standard, or complex.",
        messages=[{"role": "user", "content": task}],
    )
    label = response.content[0].text.strip().lower()
    valid_labels = {e.value for e in TaskComplexity}
    # Fall back to STANDARD if the classifier returns an unexpected label
    return TaskComplexity(label) if label in valid_labels else TaskComplexity.STANDARD

def smart_complete(task: str) -> str:
    complexity = classify_complexity(task)
    model = MODEL_MAP[complexity]
    print(f"Routing '{task[:40]}...' to {model}")

    response = client.messages.create(
        model=model,
        max_tokens=2048,
        messages=[{"role": "user", "content": task}],
    )
    return response.content[0].text
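
For a sense of how the router behaves, here is a quick usage sketch. The prompts and the routing outcomes in the comments are illustrative; the actual tier depends on how the Haiku classifier labels each prompt:

# Hypothetical prompts; comments show the expected (not guaranteed) routing
print(smart_complete("What is the capital of France?"))  # likely simple -> Haiku
print(smart_complete(
    "Design a phased migration plan for moving a monolithic billing system "
    "to microservices, including a rollback strategy."
))  # likely complex -> Opus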

2. Prompt Caching

Cache large, repeated system prompts to dramatically reduce input token costs:

def analyze_with_cache(document: str, questions: list[str]) -> list[str]:
    """Analyze multiple questions against the same document using prompt caching."""
    # The document is cached after the first request — subsequent requests
    # pay only for the small question portion of the input
    answers = []

    for question in questions:
        response = client.messages.create(
            model="claude-sonnet-4-5",
            max_tokens=512,
            system=[
                {
                    "type": "text",
                    "text": f"You are a document analyst. Answer questions about this document:\n\n{document}",
                    "cache_control": {"type": "ephemeral"},  # Cache this block
                }
            ],
            messages=[{"role": "user", "content": question}],
        )
        answers.append(response.content[0].text)

    return answers
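
A usage sketch, assuming report_text holds a document long enough to be cached (prefixes below the model's minimum cacheable length, 1,024 tokens for Sonnet models, are not cached). The first request writes the cache at a small premium over normal input pricing; subsequent requests read it at a steep discount, provided they arrive before the cache entry expires after a few minutes of inactivity:

# Hypothetical document and questions
report_text = open("quarterly_report.txt").read()
answers = analyze_with_cache(
    report_text,
    [
        "What were the main revenue drivers this quarter?",
        "Which risks does the report highlight?",
        "Summarize the outlook section in two sentences.",
    ],
)
for answer in answers:
    print(answer)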

3. Response Caching

Cache deterministic responses to avoid redundant API calls:

import hashlib

response_cache: dict[str, str] = {}

def cached_complete(prompt: str, model: str = "claude-haiku-4-5", temperature: float = 0.0) -> str:
    """Cache responses for identical prompts at temperature=0."""
    if temperature > 0:
        # Non-deterministic responses should not be cached
        return direct_complete(prompt, model, temperature)

    cache_key = hashlib.sha256(f"{model}:{prompt}".encode()).hexdigest()

    if cache_key in response_cache:
        print("[CACHE HIT]")
        return response_cache[cache_key]

    result = direct_complete(prompt, model, temperature)
    response_cache[cache_key] = result
    return result


def direct_complete(prompt: str, model: str, temperature: float) -> str:
    response = client.messages.create(
        model=model,
        max_tokens=1024,
        temperature=temperature,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text
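
A quick sanity check, assuming the two functions above are defined: the first call pays for an API request, while the identical second call is served from memory and prints [CACHE HIT]. Note that this dict is in-process, unbounded, and lost on restart; a production setup would usually bound it or back it with an external store, which is out of scope here:

# The second, identical call never reaches the API
print(cached_complete("List three benefits of prompt caching."))
print(cached_complete("List three benefits of prompt caching."))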

4. Output Token Control

Overly generous max_tokens settings waste budget. Set them based on expected output length:

# Bad: same max_tokens for all tasks
response = client.messages.create(model="claude-sonnet-4-5", max_tokens=4096, ...)

# Good: calibrated to task
TASK_TOKEN_LIMITS = {
    "classify": 16,
    "summarize_short": 256,
    "summarize_long": 1024,
    "code_review": 2048,
    "full_analysis": 4096,
}
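
One way to wire the table in, sketched with a hypothetical complete_for_task helper (the task_type values are the keys of TASK_TOKEN_LIMITS above; the 1024-token fallback for unknown task types is an assumption of this sketch):

def complete_for_task(task_type: str, prompt: str, model: str = "claude-sonnet-4-5") -> str:
    """Choose max_tokens from the task's expected output length instead of a blanket value."""
    max_tokens = TASK_TOKEN_LIMITS.get(task_type, 1024)  # fallback is an assumption
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text

# A classification call is now capped at 16 output tokens instead of 4096
label = complete_for_task("classify", "Label this ticket as bug, feature, or question: 'App crashes on login.'")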

Applied together, model tier routing, prompt caching, response caching, and output token control can reduce API spend by 70% or more without any loss in output quality.