Cost Optimization Strategies
AI inference costs can grow unexpectedly in production. These techniques can reduce your Anthropic API spend by 50–90% without sacrificing quality.
1. Model Tier Routing
Route simple tasks to Haiku and reserve Sonnet/Opus for complex ones:
import anthropic
from enum import Enum

client = anthropic.Anthropic()


class TaskComplexity(Enum):
    SIMPLE = "simple"
    STANDARD = "standard"
    COMPLEX = "complex"


MODEL_MAP = {
    TaskComplexity.SIMPLE: "claude-haiku-4-5",
    TaskComplexity.STANDARD: "claude-sonnet-4-5",
    TaskComplexity.COMPLEX: "claude-opus-4-5",
}


def classify_complexity(task: str) -> TaskComplexity:
    """Use the cheapest model to classify complexity."""
    response = client.messages.create(
        model="claude-haiku-4-5",
        max_tokens=8,
        system="Classify task complexity. Reply with ONLY: simple, standard, or complex.",
        messages=[{"role": "user", "content": task}],
    )
    label = response.content[0].text.strip().lower()
    if label in {e.value for e in TaskComplexity}:
        return TaskComplexity(label)
    return TaskComplexity.STANDARD


def smart_complete(task: str) -> str:
    complexity = classify_complexity(task)
    model = MODEL_MAP[complexity]
    print(f"Routing '{task[:40]}...' to {model}")
    response = client.messages.create(
        model=model,
        max_tokens=2048,
        messages=[{"role": "user", "content": task}],
    )
    return response.content[0].text
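As a quick illustration, a lookup-style prompt should route to Haiku while an open-ended design task should land on Opus. The example prompts below are hypothetical, and the actual label depends on the classifier's judgment:

# Hypothetical usage: the routing decision comes from the Haiku classifier above.
print(smart_complete("Extract the email address from: 'Reach us at sales@example.com'"))
print(smart_complete("Design a phased migration plan for splitting a monolith into microservices."))
# Likely (not guaranteed) outcome: the first call routes to claude-haiku-4-5,
# the second to claude-opus-4-5.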
2. Prompt Caching
Cache large, repeated system prompts to dramatically reduce input token costs; on subsequent requests the cached portion is billed at a small fraction of the normal input rate:
def analyze_with_cache(document: str, questions: list[str]) -> list[str]:
    """Analyze multiple questions against the same document using prompt caching."""
    # The document is cached after the first request — subsequent requests
    # pay only for the small question portion of the input.
    answers = []
    for question in questions:
        response = client.messages.create(
            model="claude-sonnet-4-5",
            max_tokens=512,
            system=[
                {
                    "type": "text",
                    "text": f"You are a document analyst. Answer questions about this document:\n\n{document}",
                    "cache_control": {"type": "ephemeral"},  # Cache this block
                }
            ],
            messages=[{"role": "user", "content": question}],
        )
        answers.append(response.content[0].text)
    return answers
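To verify that the cache is actually being hit, inspect the usage block on the response. Below is a minimal sketch, assuming the cache_creation_input_tokens and cache_read_input_tokens fields that the Messages API reports when prompt caching is active; the helper name is ours. Keep in mind that only prompts above a model-specific minimum length are cacheable, so very short documents may not benefit.

def analyze_with_cache_stats(document: str, question: str) -> str:
    """Same call as above, but logs cache usage so the savings can be verified."""
    response = client.messages.create(
        model="claude-sonnet-4-5",
        max_tokens=512,
        system=[
            {
                "type": "text",
                "text": f"You are a document analyst. Answer questions about this document:\n\n{document}",
                "cache_control": {"type": "ephemeral"},
            }
        ],
        messages=[{"role": "user", "content": question}],
    )
    usage = response.usage
    # First call: expect a nonzero cache write. Repeat calls within the cache
    # window: expect a nonzero cache read and a much smaller regular input count.
    print(
        f"input={usage.input_tokens} "
        f"cache_write={getattr(usage, 'cache_creation_input_tokens', None)} "
        f"cache_read={getattr(usage, 'cache_read_input_tokens', None)}"
    )
    return response.content[0].text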
3. Response Caching
Cache deterministic responses to avoid redundant API calls:
import hashlib
response_cache: dict[str, str] = {}
def cached_complete(prompt: str, model: str = "claude-haiku-4-5", temperature: float = 0.0) -> str:
    """Cache responses for identical prompts at temperature=0."""
    if temperature > 0:
        # Non-deterministic responses should not be cached
        return direct_complete(prompt, model, temperature)
    cache_key = hashlib.sha256(f"{model}:{prompt}".encode()).hexdigest()
    if cache_key in response_cache:
        print("[CACHE HIT]")
        return response_cache[cache_key]
    result = direct_complete(prompt, model, temperature)
    response_cache[cache_key] = result
    return result


def direct_complete(prompt: str, model: str, temperature: float) -> str:
    response = client.messages.create(
        model=model,
        max_tokens=1024,
        temperature=temperature,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text
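Usage is straightforward: the second identical call is served from memory and costs nothing. The in-memory dict is illustrative only; it is unbounded and lost on restart, so a production setup would likely swap in a persistent store such as Redis or SQLite keyed by the same hash.

# The first call hits the API; the identical repeat is a free cache hit.
first = cached_complete("List three benefits of unit testing.")
second = cached_complete("List three benefits of unit testing.")  # prints [CACHE HIT]
assert first == second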
4. Output Token Control
A generous max_tokens does not cost anything by itself, since you are billed only for tokens actually generated, but it leaves room for verbose responses to run long, and every generated token is paid for. Calibrate the cap to the expected output length:
# Bad: same max_tokens for all tasks
response = client.messages.create(model="claude-sonnet-4-5", max_tokens=4096, ...)

# Good: calibrated to task
TASK_TOKEN_LIMITS = {
    "classify": 16,
    "summarize_short": 256,
    "summarize_long": 1024,
    "code_review": 2048,
    "full_analysis": 4096,
}
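One way to wire the table in; the complete_task helper and its fallback limit are assumptions for illustration, not part of the Anthropic SDK:

def complete_task(task_type: str, prompt: str, model: str = "claude-sonnet-4-5") -> str:
    """Look up a calibrated max_tokens for the task type (hypothetical helper)."""
    max_tokens = TASK_TOKEN_LIMITS.get(task_type, 1024)  # assumed fallback for unknown task types
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text

Note that max_tokens only caps spend; it does not make the model concise. Pair tight limits with prompt instructions such as "answer in one sentence" so responses finish naturally rather than being cut off.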
Applied together, tier routing, prompt caching, response caching, and calibrated output limits can reduce API spend by 70% or more without sacrificing output quality.