Select frameworks to compare
Pick one or more frameworks from the bar above
Runtime Context
OpenAI
import json
from openai import OpenAI

LLM_MODEL = "gpt-5.4"
client = OpenAI()

# mock database — in production these are real queries
ORDERS = {
    "user_123": [
        {"id": "ORD_99", "item": "Laptop Stand", "status": "shipped"},
        {"id": "ORD_100", "item": "USB Hub", "status": "processing"},
    ],
}

# status is chosen by the LLM — user_id comes from your app (auth, session, etc.)
# only status appears in the tool schema — the LLM can't see or fabricate user_id
user_id = "user_123"

def get_orders(status: str) -> str:
    """Get orders filtered by status."""
    print(f"-> call: get_orders({status}) for {user_id}")
    matches = [o for o in ORDERS[user_id] if o["status"] == status]
    result = ", ".join(f'{o["id"]}: {o["item"]}' for o in matches) or "No orders found."
    print(f"-> result: {result}")
    return result

tools = [{
    "type": "function",
    "name": "get_orders",
    "description": "Get orders filtered by status.",
    "parameters": {
        "type": "object",
        "properties": {
            "status": {"type": "string"},
        },
        "required": ["status"],
    },
}]

# named `conversation` rather than `input` so we don't shadow the builtin
conversation = [{"role": "user", "content": "Do I have any shipped orders?"}]
# step 1: LLM decides to call the tool
response = client.responses.create(
    model=LLM_MODEL, input=conversation, tools=tools,
)
tool_call = next(i for i in response.output if i.type == "function_call")
result = get_orders(**json.loads(tool_call.arguments))
# step 2: send tool result back, LLM generates final response
conversation += response.output
conversation.append({
    "type": "function_call_output",
    "call_id": tool_call.call_id,
    "output": result,
})
response = client.responses.create(
    model=LLM_MODEL, input=conversation, tools=tools,
)
print(response.output_text)
# -> call: get_orders(shipped) for user_123
# -> result: ORD_99: Laptop Stand
# "Your shipped order is ORD_99: Laptop Stand."
Anthropic
import anthropic

LLM_MODEL = "claude-opus-4-6"
client = anthropic.Anthropic()

# mock database — in production these are real queries
ORDERS = {
    "user_123": [
        {"id": "ORD_99", "item": "Laptop Stand", "status": "shipped"},
        {"id": "ORD_100", "item": "USB Hub", "status": "processing"},
    ],
}

# status is chosen by the LLM — user_id comes from your app (auth, session, etc.)
# only status appears in the tool schema — the LLM can't see or fabricate user_id
user_id = "user_123"

def get_orders(status: str) -> str:
    """Get orders filtered by status."""
    print(f"-> call: get_orders({status}) for {user_id}")
    result = ", ".join(
        f'{o["id"]}: {o["item"]}'
        for o in ORDERS[user_id]
        if o["status"] == status
    ) or "No orders found."
    print(f"-> result: {result}")
    return result

tools = [{
    "name": "get_orders",
    "description": "Get orders filtered by status.",
    "input_schema": {
        "type": "object",
        "properties": {"status": {"type": "string"}},
        "required": ["status"],
    },
}]

messages = [{"role": "user", "content": "Do I have any shipped orders?"}]

# step 1: the model decides to call the tool
response = client.messages.create(
    model=LLM_MODEL, max_tokens=1024, tools=tools, messages=messages,
)
tool_use = next(b for b in response.content if b.type == "tool_use")
result = get_orders(**tool_use.input)

# step 2: hand the tool result back and let the model write the final answer
messages.extend([
    {"role": "assistant", "content": response.content},
    {"role": "user", "content": [{
        "type": "tool_result",
        "tool_use_id": tool_use.id,
        "content": result,
    }]},
])
response = client.messages.create(
    model=LLM_MODEL, max_tokens=1024, tools=tools, messages=messages,
)
print(response.content[0].text)
# -> call: get_orders(shipped) for user_123
# -> result: ORD_99: Laptop Stand
# "Your shipped order is ORD_99: Laptop Stand."
Gemini
from google import genai
from google.genai import types

LLM_MODEL = "gemini-pro-latest"
client = genai.Client()

# mock database — in production these are real queries
ORDERS = {
    "user_123": [
        {"id": "ORD_99", "item": "Laptop Stand", "status": "shipped"},
        {"id": "ORD_100", "item": "USB Hub", "status": "processing"},
    ],
}

# status is chosen by the LLM — user_id comes from your app (auth, session, etc.)
# only status appears in the tool schema — the LLM can't see or fabricate user_id
user_id = "user_123"

def get_orders(status: str) -> str:
    """Get orders filtered by status."""
    print(f"-> call: get_orders({status}) for {user_id}")
    result = ", ".join(
        f'{o["id"]}: {o["item"]}'
        for o in ORDERS[user_id]
        if o["status"] == status
    ) or "No orders found."
    print(f"-> result: {result}")
    return result

# automatic function calling: the SDK runs the tool and loops the result back
cfg = types.GenerateContentConfig(tools=[get_orders])
response = client.models.generate_content(
    model=LLM_MODEL,
    config=cfg,
    contents="Do I have any shipped orders?",
)
print(response.text)
# -> call: get_orders(shipped) for user_123
# -> result: ORD_99: Laptop Stand
# "Your shipped order is ORD_99: Laptop Stand."
Pydantic AI
from dataclasses import dataclass
from pydantic_ai import Agent, RunContext

LLM_MODEL = "openai:gpt-5.4"

# mock database — in production these are real queries
ORDERS = {
    "user_123": [
        {"id": "ORD_99", "item": "Laptop Stand", "status": "shipped"},
        {"id": "ORD_100", "item": "USB Hub", "status": "processing"},
    ],
}

# per-run dependencies carried outside the message stream
@dataclass
class Deps:
    user_id: str

agent = Agent(LLM_MODEL, deps_type=Deps)

# status is chosen by the LLM; user_id arrives via RunContext, which the
# model never sees, so it cannot fabricate another user's id
@agent.tool
def get_orders(ctx: RunContext[Deps], status: str) -> str:
    """Get orders filtered by status."""
    print(f"-> call: get_orders({status}) for {ctx.deps.user_id}")
    result = ", ".join(
        f'{o["id"]}: {o["item"]}'
        for o in ORDERS[ctx.deps.user_id]
        if o["status"] == status
    ) or "No orders found."
    print(f"-> result: {result}")
    return result

run = agent.run_sync(
    "Do I have any shipped orders?",
    deps=Deps(user_id="user_123"),
)
print(run.output)
# -> call: get_orders(shipped) for user_123
# -> result: ORD_99: Laptop Stand
# "Your shipped order is ORD_99: Laptop Stand."
LangGraph
from dataclasses import dataclass
from langchain.tools import tool, ToolRuntime
from langchain.agents import create_agent
from langchain_openai import ChatOpenAI

LLM_MODEL = "gpt-5.4"
model = ChatOpenAI(model=LLM_MODEL)

# mock database — in production these are real queries
ORDERS = {
    "user_123": [
        {"id": "ORD_99", "item": "Laptop Stand", "status": "shipped"},
        {"id": "ORD_100", "item": "USB Hub", "status": "processing"},
    ],
}

# per-invocation context carried outside the message stream
@dataclass
class UserContext:
    user_id: str

# status is chosen by the LLM; user_id arrives via ToolRuntime, which the
# model never sees, so it cannot fabricate another user's id
@tool
def get_orders(status: str, runtime: ToolRuntime) -> str:
    """Get orders filtered by status."""
    user_id = runtime.context.user_id
    print(f"-> call: get_orders({status}) for {user_id}")
    result = ", ".join(
        f'{o["id"]}: {o["item"]}'
        for o in ORDERS[user_id]
        if o["status"] == status
    ) or "No orders found."
    print(f"-> result: {result}")
    return result

agent = create_agent(model, [get_orders], context_schema=UserContext)
state = agent.invoke(
    {"messages": [("user", "Do I have any shipped orders?")]},
    context=UserContext(user_id="user_123"),
)
print(state["messages"][-1].content)
# -> call: get_orders(shipped) for user_123
# -> result: ORD_99: Laptop Stand
# "Your shipped order is ORD_99: Laptop Stand."
AI SDK
import { ToolLoopAgent, tool } from "ai";
import { openai } from "@ai-sdk/openai";
import { z } from "zod";

const LLM_MODEL = "gpt-5.4";

// mock database — in production these are real queries
const ORDERS: Record<string, { id: string; item: string; status: string }[]> = {
  user_123: [
    { id: "ORD_99", item: "Laptop Stand", status: "shipped" },
    { id: "ORD_100", item: "USB Hub", status: "processing" },
  ],
};

// status is chosen by the LLM — userId is baked in via closure, so it never
// appears in the tool schema and the model cannot see or fabricate it
const createTools = (userId: string) => ({
  getOrders: tool({
    description: "Get orders filtered by status.",
    inputSchema: z.object({ status: z.string() }),
    execute: async ({ status }) => {
      console.log(`-> call: get_orders(${status}) for ${userId}`);
      const found = ORDERS[userId].filter((order) => order.status === status);
      const result =
        found.map((order) => `${order.id}: ${order.item}`).join(", ") || "No orders found.";
      console.log(`-> result: ${result}`);
      return result;
    },
  }),
});

const tools = createTools("user_123");
const agent = new ToolLoopAgent({
  model: openai(LLM_MODEL),
  tools,
});

const result = await agent.generate({
  prompt: "Do I have any shipped orders?",
});
console.log(result.text);
// -> call: get_orders(shipped) for user_123
// -> result: ORD_99: Laptop Stand
// "Your shipped order is ORD_99: Laptop Stand."
Mastra
import { Agent } from "@mastra/core/agent";
import { createTool } from "@mastra/core/tools";
import { RequestContext } from "@mastra/core/request-context";
import { z } from "zod";

const LLM_MODEL = "openai/gpt-5.4";

// mock database — in production these are real queries
const ORDERS: Record<string, { id: string; item: string; status: string }[]> = {
  user_123: [
    { id: "ORD_99", item: "Laptop Stand", status: "shipped" },
    { id: "ORD_100", item: "USB Hub", status: "processing" },
  ],
};

// status is chosen by the LLM — userId travels in requestContext, which the
// model never sees, so it cannot fabricate another user's id
const getOrders = createTool({
  id: "get-orders",
  description: "Get orders filtered by status.",
  inputSchema: z.object({ status: z.string() }),
  requestContextSchema: z.object({ userId: z.string() }),
  execute: async ({ status }, { requestContext }) => {
    const userId = requestContext.get("userId");
    console.log(`-> call: get_orders(${status}) for ${userId}`);
    const found = ORDERS[userId].filter((order) => order.status === status);
    const result =
      found.map((order) => `${order.id}: ${order.item}`).join(", ") || "No orders found.";
    console.log(`-> result: ${result}`);
    return result;
  },
});

const agent = new Agent({
  name: "order-agent",
  instructions: "You are a helpful assistant.",
  model: LLM_MODEL,
  tools: { getOrders },
});

const requestContext = new RequestContext([["userId", "user_123"]]);
const result = await agent.generate("Do I have any shipped orders?", {
  requestContext,
});
console.log(result.text);
// -> call: get_orders(shipped) for user_123
// -> result: ORD_99: Laptop Stand
// "Your shipped order is ORD_99: Laptop Stand."
// "Your shipped order is ORD_99: Laptop Stand."Dynamic Instructions
OpenAI
from openai import OpenAI
# one shared client — the instructions vary per request below
LLM_MODEL = "gpt-5.4"
client = OpenAI()
# mock database — in production these are real queries
CUSTOMERS = {
    "user_123": {"name": "Acme Corp", "plan": "enterprise"},
    "user_456": {"name": "Jane Smith", "plan": "free"},
}
OVERDUE_INVOICES = {
    "user_123": [{"id": "INV-42", "amount": 1200}],
    "user_456": [],
}

# the system prompt is assembled per request from live data — not a static string
# the LLM sees personalized context without querying the database itself
def build_instructions(user_id: str) -> str:
    """Compose a personalized system prompt from the user's account records."""
    customer = CUSTOMERS[user_id]
    overdue = OVERDUE_INVOICES[user_id]
    parts = [f"Customer: {customer['name']}, plan: {customer['plan']}."]
    if overdue:
        parts.append(f"ALERT: {len(overdue)} overdue invoice(s). Prioritize payment resolution.")
    if customer["plan"] == "enterprise":
        parts.append("This is a premium customer. Offer direct escalation.")
    return "\n".join(parts)
# same agent, same question — behavior changes based on who's asking
for uid in ("user_123", "user_456"):
    response = client.responses.create(
        model=LLM_MODEL,
        instructions=build_instructions(uid),
        input="I need help with my account.",
    )
    print(response.output_text)
# user_123 -> "I see there's an overdue invoice on your account. Let me help
#              resolve that. As a premium customer, I can escalate directly."
# user_456 -> "Sure, I'd be happy to help! What do you need assistance with?"
Anthropic
import anthropic
# one shared client — the system prompt varies per request below
LLM_MODEL = "claude-opus-4-6"
client = anthropic.Anthropic()
# mock database — in production these are real queries
CUSTOMERS = {
    "user_123": {"name": "Acme Corp", "plan": "enterprise"},
    "user_456": {"name": "Jane Smith", "plan": "free"},
}
OVERDUE_INVOICES = {
    "user_123": [{"id": "INV-42", "amount": 1200}],
    "user_456": [],
}

# the system prompt is rebuilt on every request from live data — never static
# the LLM sees personalized context without querying the database itself
def build_instructions(user_id: str) -> str:
    """Compose a per-user system prompt from account records."""
    customer = CUSTOMERS[user_id]
    overdue = OVERDUE_INVOICES[user_id]
    parts = [f"Customer: {customer['name']}, plan: {customer['plan']}."]
    if overdue:
        parts.append(f"ALERT: {len(overdue)} overdue invoice(s). Prioritize payment resolution.")
    if customer["plan"] == "enterprise":
        parts.append("This is a premium customer. Offer direct escalation.")
    return "\n".join(parts)
# same agent, same question — behavior changes based on who's asking
for uid in ("user_123", "user_456"):
    response = client.messages.create(
        model=LLM_MODEL,
        max_tokens=1024,
        system=build_instructions(uid),
        messages=[{"role": "user", "content": "I need help with my account."}],
    )
    print(response.content[0].text)
# user_123 -> "I see there's an overdue invoice on your account. Let me help
#              resolve that. As a premium customer, I can escalate directly."
# user_456 -> "Sure, I'd be happy to help! What do you need assistance with?"
Gemini
from google import genai
from google.genai import types
# one shared client — system_instruction varies per request below
LLM_MODEL = "gemini-pro-latest"
client = genai.Client()
# mock database — in production these are real queries
CUSTOMERS = {
    "user_123": {"name": "Acme Corp", "plan": "enterprise"},
    "user_456": {"name": "Jane Smith", "plan": "free"},
}
OVERDUE_INVOICES = {
    "user_123": [{"id": "INV-42", "amount": 1200}],
    "user_456": [],
}

# the system instruction is derived from live data at request time — not static
# the LLM sees personalized context without querying the database itself
def build_instructions(user_id: str) -> str:
    """Build a personalized system instruction for the given user."""
    customer = CUSTOMERS[user_id]
    overdue = OVERDUE_INVOICES[user_id]
    parts = [f"Customer: {customer['name']}, plan: {customer['plan']}."]
    if overdue:
        parts.append(f"ALERT: {len(overdue)} overdue invoice(s). Prioritize payment resolution.")
    if customer["plan"] == "enterprise":
        parts.append("This is a premium customer. Offer direct escalation.")
    return "\n".join(parts)
# same agent, same question — behavior changes based on who's asking
for uid in ("user_123", "user_456"):
    response = client.models.generate_content(
        model=LLM_MODEL,
        config=types.GenerateContentConfig(
            system_instruction=build_instructions(uid),
        ),
        contents="I need help with my account.",
    )
    print(response.text)
# user_123 -> "I see there's an overdue invoice on your account. Let me help
#              resolve that. As a premium customer, I can escalate directly."
# user_456 -> "Sure, I'd be happy to help! What do you need assistance with?"
Pydantic AI
from dataclasses import dataclass
from pydantic_ai import Agent, RunContext

LLM_MODEL = "openai:gpt-5.4"

# mock database — in production these are real queries
CUSTOMERS = {
    "user_123": {"name": "Acme Corp", "plan": "enterprise"},
    "user_456": {"name": "Jane Smith", "plan": "free"},
}
OVERDUE_INVOICES = {
    "user_123": [{"id": "INV-42", "amount": 1200}],
    "user_456": [],
}

# per-run dependencies carried outside the message stream
@dataclass
class Deps:
    user_id: str

agent = Agent(LLM_MODEL, deps_type=Deps)

# the system prompt is assembled per run from live data — not a static string
# the LLM sees personalized context without querying the database itself
@agent.instructions
def dynamic_instructions(ctx: RunContext[Deps]) -> str:
    """Build a personalized system prompt for the current user."""
    customer = CUSTOMERS[ctx.deps.user_id]
    overdue = OVERDUE_INVOICES[ctx.deps.user_id]
    parts = [f"Customer: {customer['name']}, plan: {customer['plan']}."]
    if overdue:
        parts.append(f"ALERT: {len(overdue)} overdue invoice(s). Prioritize payment resolution.")
    if customer["plan"] == "enterprise":
        parts.append("This is a premium customer. Offer direct escalation.")
    return "\n".join(parts)

# same agent, same question — behavior changes based on who's asking
for uid in ("user_123", "user_456"):
    result = agent.run_sync(
        "I need help with my account.",
        deps=Deps(user_id=uid),
    )
    print(result.output)
# user_123 -> "I see there's an overdue invoice on your account. Let me help
#              resolve that. As a premium customer, I can escalate directly."
# user_456 -> "Sure, I'd be happy to help! What do you need assistance with?"
LangGraph
from dataclasses import dataclass
from langchain.agents import create_agent
from langchain.agents.middleware import dynamic_prompt
from langchain_openai import ChatOpenAI

LLM_MODEL = "gpt-5.4"
model = ChatOpenAI(model=LLM_MODEL)

# mock database — in production these are real queries
CUSTOMERS = {
    "user_123": {"name": "Acme Corp", "plan": "enterprise"},
    "user_456": {"name": "Jane Smith", "plan": "free"},
}
OVERDUE_INVOICES = {
    "user_123": [{"id": "INV-42", "amount": 1200}],
    "user_456": [],
}

# per-invocation context carried outside the message stream
@dataclass
class UserContext:
    user_id: str

# @dynamic_prompt middleware assembles the system message from context at runtime
# the LLM sees personalized context without querying the database itself
@dynamic_prompt
def build_prompt(request):
    """Build the per-user system message from runtime context."""
    uid = request.runtime.context.user_id
    customer = CUSTOMERS[uid]
    overdue = OVERDUE_INVOICES[uid]
    parts = [f"Customer: {customer['name']}, plan: {customer['plan']}."]
    if overdue:
        parts.append(f"ALERT: {len(overdue)} overdue invoice(s). Prioritize payment resolution.")
    if customer["plan"] == "enterprise":
        parts.append("This is a premium customer. Offer direct escalation.")
    return "\n".join(parts)

agent = create_agent(
    model, tools=[], middleware=[build_prompt], context_schema=UserContext,
)

# same agent, same question — behavior changes based on who's asking
for uid in ("user_123", "user_456"):
    result = agent.invoke(
        {"messages": [("user", "I need help with my account.")]},
        context=UserContext(user_id=uid),
    )
    print(result["messages"][-1].content)
# user_123 -> "I see there's an overdue invoice on your account. Let me help
#              resolve that. As a premium customer, I can escalate directly."
# user_456 -> "Sure, I'd be happy to help! What do you need assistance with?"
AI SDK
import { ToolLoopAgent } from "ai";
import { openai } from "@ai-sdk/openai";
import { z } from "zod";

const LLM_MODEL = "gpt-5.4";

// mock database — in production these are real queries
const CUSTOMERS: Record<string, { name: string; plan: string }> = {
  user_123: { name: "Acme Corp", plan: "enterprise" },
  user_456: { name: "Jane Smith", plan: "free" },
};
const OVERDUE_INVOICES: Record<string, { id: string; amount: number }[]> = {
  user_123: [{ id: "INV-42", amount: 1200 }],
  user_456: [],
};

// assemble a personalized system prompt from live account data
function buildInstructions(userId: string): string {
  const customer = CUSTOMERS[userId];
  const overdue = OVERDUE_INVOICES[userId];
  const parts = [`Customer: ${customer.name}, plan: ${customer.plan}.`];
  if (overdue.length > 0) {
    parts.push(`ALERT: ${overdue.length} overdue invoice(s). Prioritize payment resolution.`);
  }
  if (customer.plan === "enterprise") {
    parts.push("This is a premium customer. Offer direct escalation.");
  }
  return parts.join("\n");
}

// callOptionsSchema declares per-request parameters; prepareCall injects the prompt
// the LLM sees personalized context without querying the database itself
const agent = new ToolLoopAgent({
  model: openai(LLM_MODEL),
  callOptionsSchema: z.object({ userId: z.string() }),
  prepareCall: async ({ options, ...settings }) => ({
    ...settings,
    instructions: buildInstructions(options.userId),
  }),
});

// same agent, same question — behavior changes based on who's asking
for (const userId of ["user_123", "user_456"]) {
  const result = await agent.generate({
    prompt: "I need help with my account.",
    options: { userId },
  });
  console.log(result.text);
}
// user_123 -> "I see there's an overdue invoice on your account. Let me help
//              resolve that. As a premium customer, I can escalate directly."
// user_456 -> "Sure, I'd be happy to help! What do you need assistance with?"
Mastra
import { Agent } from "@mastra/core/agent";
import { RequestContext } from "@mastra/core/request-context";

const LLM_MODEL = "openai/gpt-5.4";

// mock database — in production these are real queries
const CUSTOMERS: Record<string, { name: string; plan: string }> = {
  user_123: { name: "Acme Corp", plan: "enterprise" },
  user_456: { name: "Jane Smith", plan: "free" },
};
const OVERDUE_INVOICES: Record<string, { id: string; amount: number }[]> = {
  user_123: [{ id: "INV-42", amount: 1200 }],
  user_456: [],
};

// instructions is a function of requestContext — rebuilt on every request
// the LLM sees personalized context without querying the database itself
const agent = new Agent({
  name: "support-agent",
  model: LLM_MODEL,
  instructions: ({ requestContext }) => {
    const userId = requestContext.get("userId");
    const customer = CUSTOMERS[userId];
    const overdue = OVERDUE_INVOICES[userId];
    const parts = [`Customer: ${customer.name}, plan: ${customer.plan}.`];
    if (overdue.length > 0) {
      parts.push(`ALERT: ${overdue.length} overdue invoice(s). Prioritize payment resolution.`);
    }
    if (customer.plan === "enterprise") {
      parts.push("This is a premium customer. Offer direct escalation.");
    }
    return parts.join("\n");
  },
});

// same agent, same question — behavior changes based on who's asking
for (const userId of ["user_123", "user_456"]) {
  const requestContext = new RequestContext([["userId", userId]]);
  const result = await agent.generate("I need help with my account.", {
    requestContext,
  });
  console.log(result.text);
}
// user_123 -> "I see there's an overdue invoice on your account. Let me help
//              resolve that. As a premium customer, I can escalate directly."
// user_456 -> "Sure, I'd be happy to help! What do you need assistance with?"
// "Sure, I'd be happy to help! What do you need assistance with?"Prompt Caching
OpenAI
from pathlib import Path
from openai import OpenAI

LLM_MODEL = "gpt-5.4"
client = OpenAI()

# prompt caching is automatic — no opt-in, no code changes
# any prefix >= 1024 tokens is cached on first request, reused on subsequent ones
# keep static content (instructions, examples) first for the best hit rate
KNOWLEDGE_BASE = Path("knowledge_base.txt").read_text()  # ~4100 tokens

def ask(question: str) -> int:
    """Send one request and return how many input tokens were served from cache."""
    r = client.responses.create(
        model=LLM_MODEL,
        instructions=KNOWLEDGE_BASE,
        input=question,
    )
    print(r.output_text)
    cached = r.usage.input_tokens_details.cached_tokens
    print(f"Cached tokens: {cached}")
    return cached

# request 1: cold cache — the prompt is processed and stored automatically
cached_1 = ask("I keep getting 429 errors. What should I do?")
# -> Cached tokens: 0 (cache miss — prefix is now stored)
# request 2: warm cache — the identical instruction prefix is served from cache
cached_2 = ask("How do I fix SSO login failures?")
# -> Cached tokens: 3328 (cache hit — lower cost, lower latency)
Anthropic
from pathlib import Path
import anthropic

LLM_MODEL = "claude-opus-4-6"
client = anthropic.Anthropic()

# prompt caching requires explicit opt-in via cache_control breakpoints
# a breakpoint marks a prefix boundary — everything up to it is cached
# 25% surcharge on cache writes, 90% discount on cache reads, 5-min TTL
KNOWLEDGE_BASE = Path("knowledge_base.txt").read_text()  # ~4100 tokens

# cache_control breakpoint on the system block — up to 4 breakpoints allowed
system = [{
    "type": "text",
    "text": KNOWLEDGE_BASE,
    "cache_control": {"type": "ephemeral"},  # 5-min TTL, refreshed on each hit
}]

def ask(question: str) -> None:
    """Send one request and report cache-write vs cache-read token counts."""
    r = client.messages.create(
        model=LLM_MODEL, max_tokens=1024, system=system,
        messages=[{"role": "user", "content": question}],
    )
    print(r.content[0].text)
    print(f"Cache write: {r.usage.cache_creation_input_tokens} tokens")
    print(f"Cache read: {r.usage.cache_read_input_tokens} tokens")

# request 1: cache write — the prefix is stored, surcharge applies
ask("I keep getting 429 errors. What should I do?")
# -> Cache write: 4182 tokens, Cache read: 0 tokens
# request 2: cache hit — the prefix is served at the 90% read discount
ask("How do I fix SSO login failures?")
# -> Cache write: 0 tokens, Cache read: 4182 tokens
Gemini
from pathlib import Path
from google import genai
from google.genai import types

LLM_MODEL = "gemini-pro-latest"
client = genai.Client()

# Gemini caching creates a named server-side resource with an explicit lifecycle:
# create once, reference across many requests, delete when done
# note: cached content must be >= 1024 tokens
KNOWLEDGE_BASE = Path("knowledge_base.txt").read_text()  # ~4100 tokens

# step 1: create a named cache with an explicit TTL
cache = client.caches.create(
    model=LLM_MODEL,
    config=types.CreateCachedContentConfig(
        system_instruction=KNOWLEDGE_BASE,
        ttl="300s",  # 5-minute TTL — extend with client.caches.update()
    ),
)
print(f"Cache created: {cache.name}")

def ask(question: str) -> None:
    """Send one question against the named cache and report cached-token usage."""
    r = client.models.generate_content(
        model=LLM_MODEL,
        contents=question,
        config=types.GenerateContentConfig(cached_content=cache.name),
    )
    print(r.text)
    print(f"Cached tokens: {r.usage_metadata.cached_content_token_count}")

# step 2: reference the cache — no need to resend the prompt
ask("I keep getting 429 errors. What should I do?")
# -> Cached tokens: 4182
# same cache, different question — cached tokens reused
ask("How do I fix SSO login failures?")
# -> Cached tokens: 4182

# step 3: cleanup — delete when done, or let the TTL expire
client.caches.delete(name=cache.name)
Pydantic AI
from pathlib import Path
from pydantic_ai import Agent

LLM_MODEL = "openai:gpt-5.4"

# prompt caching is automatic with OpenAI — no extra settings needed
# any prefix >= 1024 tokens is cached on first request, reused on subsequent ones
# keep static content (instructions, examples) first for the best hit rate
KNOWLEDGE_BASE = Path("knowledge_base.txt").read_text()  # ~4100 tokens
agent = Agent(LLM_MODEL, instructions=KNOWLEDGE_BASE)

def ask(question: str) -> None:
    """Run one turn and print the answer plus token usage."""
    res = agent.run_sync(question)
    print(res.output)
    print(f"Usage: {res.usage()}")

# request 1: cold cache — the prompt is processed and cached automatically
ask("I keep getting 429 errors. What should I do?")
# -> cache_read_tokens = 0 (cache miss — prefix is now stored)
# request 2: warm cache — the identical instruction prefix is served from cache
ask("How do I fix SSO login failures?")
# -> cache_read_tokens = 2816 (cache hit — lower cost, lower latency)
LangGraph
from pathlib import Path
from langchain.agents import create_agent
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage

LLM_MODEL = "gpt-5.4"
model = ChatOpenAI(model=LLM_MODEL)

# OpenAI caching is automatic — no cache_control needed
# identical prefixes >= 1024 tokens are cached and reused
# keep static content (instructions, examples) first for the best hit rate
KNOWLEDGE_BASE = Path("knowledge_base.txt").read_text()  # ~4100 tokens
system = SystemMessage(content=KNOWLEDGE_BASE)
agent = create_agent(model, tools=[])

def ask(question: str) -> None:
    """Run one turn with the shared system prefix and report cached tokens."""
    state = agent.invoke({
        "messages": [system, ("user", question)],
    })
    reply = state["messages"][-1]
    print(reply.content)
    usage = reply.response_metadata["token_usage"]
    print(f"Cached tokens: {usage['prompt_tokens_details']['cached_tokens']}")

# request 1: cold cache — the prompt is processed and cached automatically
ask("I keep getting 429 errors. What should I do?")
# -> Cached tokens: 0 (cache miss — prefix is now stored)
# request 2: warm cache — the identical prefix is served from cache
ask("How do I fix SSO login failures?")
# -> Cached tokens: 2816 (cache hit — lower cost, lower latency)
AI SDK
import { readFileSync } from "fs";
import { generateText } from "ai";
import { openai } from "@ai-sdk/openai";

const LLM_MODEL = "gpt-5.4";

// prompt caching is automatic with OpenAI — no providerOptions needed
// identical prefixes >= 1024 tokens are cached and reused
// keep static content (instructions, examples) first for the best hit rate
const KNOWLEDGE_BASE = readFileSync("knowledge_base.txt", "utf-8"); // ~4100 tokens

async function ask(prompt: string): Promise<void> {
  const r = await generateText({
    model: openai(LLM_MODEL),
    system: KNOWLEDGE_BASE,
    prompt,
  });
  console.log(r.text);
  console.log(`Cached tokens: ${r.usage.cachedInputTokens ?? 0}`);
}

// request 1: cold cache — the prompt is processed and cached automatically
await ask("I keep getting 429 errors. What should I do?");
// -> Cached tokens: 0 (cache miss — prefix is now stored)
// request 2: warm cache — the identical prefix is served from cache
await ask("How do I fix SSO login failures?");
// -> Cached tokens: 3200 (cache hit — lower cost, lower latency)
Mastra
import { readFileSync } from "fs";
import { Agent } from "@mastra/core/agent";

const LLM_MODEL = "openai/gpt-5.4";

// prompt caching is automatic with OpenAI — no providerOptions needed
// identical prefixes >= 1024 tokens are cached and reused
// keep static content (instructions, examples) first for the best hit rate
const KNOWLEDGE_BASE = readFileSync("knowledge_base.txt", "utf-8"); // ~4100 tokens

const agent = new Agent({
  name: "support-agent",
  instructions: KNOWLEDGE_BASE,
  model: LLM_MODEL,
});

async function ask(question: string): Promise<void> {
  const r = await agent.generate(question);
  console.log(r.text);
  console.log(`Cached tokens: ${r.usage.cachedInputTokens ?? 0}`);
}

// request 1: cold cache — the prompt is processed and cached automatically
await ask("I keep getting 429 errors. What should I do?");
// -> Cached tokens: 0 (cache miss — prefix is now stored)
// request 2: warm cache — the identical instruction prefix is served from cache
await ask("How do I fix SSO login failures?");
// -> Cached tokens: 3200 (cache hit — lower cost, lower latency)