🤖
Agent Party

Context

Select frameworks to compare

Pick one or more frameworks from the bar above

Runtime Context

OpenAI

import json
from openai import OpenAI

LLM_MODEL = "gpt-5.4"
client = OpenAI()

# mock database — in production these are real queries
ORDERS = {
    "user_123": [
        {"id": "ORD_99", "item": "Laptop Stand", "status": "shipped"},
        {"id": "ORD_100", "item": "USB Hub", "status": "processing"},
    ],
}

# status is chosen by the LLM — user_id comes from your app (auth, session, etc.)
# only status appears in the tool schema — the LLM can't see or fabricate user_id
user_id = "user_123"

def get_orders(status: str) -> str:
    """Return a summary of the current user's orders matching *status*.

    Logs the call and its result so the tool round trip is visible in
    the console; the returned string is what the LLM receives.
    """
    print(f"-> call: get_orders({status}) for {user_id}")
    summaries = []
    for order in ORDERS[user_id]:
        if order["status"] == status:
            summaries.append(f'{order["id"]}: {order["item"]}')
    result = ", ".join(summaries) if summaries else "No orders found."
    print(f"-> result: {result}")
    return result

# Tool schema advertised to the LLM (Responses API function-tool format).
# Only `status` is exposed; user identity stays server-side.
tools = [{
    "type": "function",
    "name": "get_orders",
    "description": "Get orders filtered by status.",
    "parameters": {
        # JSON Schema describing the tool's arguments
        "type": "object",
        "properties": {
            "status": {"type": "string"},
        },
        "required": ["status"],
    },
}]

# Conversation history. Named input_items (not `input`) so the Python
# builtin input() is not shadowed at module scope.
input_items = [{"role": "user", "content": "Do I have any shipped orders?"}]

# step 1: LLM decides to call the tool
response = client.responses.create(
    model=LLM_MODEL, input=input_items, tools=tools,
)
# first function_call item carries the call id and JSON-encoded arguments
# NOTE(review): next() raises StopIteration if the model answers without
# calling the tool — acceptable for a demo, guard in production.
tool_call = next(i for i in response.output if i.type == "function_call")
result = get_orders(**json.loads(tool_call.arguments))

# step 2: send tool result back, LLM generates final response
input_items += response.output
input_items.append({
    "type": "function_call_output",
    "call_id": tool_call.call_id,
    "output": result,
})

response = client.responses.create(
    model=LLM_MODEL, input=input_items, tools=tools,
)
print(response.output_text)
# -> call: get_orders(shipped) for user_123
# -> result: ORD_99: Laptop Stand
# "Your shipped order is ORD_99: Laptop Stand."

Anthropic

import anthropic

LLM_MODEL = "claude-opus-4-6"
client = anthropic.Anthropic()

# mock database — in production these are real queries
ORDERS = {
    "user_123": [
        {"id": "ORD_99", "item": "Laptop Stand", "status": "shipped"},
        {"id": "ORD_100", "item": "USB Hub", "status": "processing"},
    ],
}

# status is chosen by the LLM — user_id comes from your app (auth, session, etc.)
# only status appears in the tool schema — the LLM can't see or fabricate user_id
user_id = "user_123"

def get_orders(status: str) -> str:
    """Look up the signed-in user's orders with the given status."""
    print(f"-> call: get_orders({status}) for {user_id}")
    wanted = filter(lambda order: order["status"] == status, ORDERS[user_id])
    summary = ", ".join(f'{order["id"]}: {order["item"]}' for order in wanted)
    if not summary:
        summary = "No orders found."
    print(f"-> result: {summary}")
    return summary

# Tool schema in Anthropic Messages API format (input_schema, no "type" wrapper).
# Only `status` is exposed; user identity stays server-side.
tools = [{
    "name": "get_orders",
    "description": "Get orders filtered by status.",
    "input_schema": {
        "type": "object",
        "properties": {
            "status": {"type": "string"},
        },
        "required": ["status"],
    },
}]

# conversation history sent with every request
messages = [{"role": "user", "content": "Do I have any shipped orders?"}]

# step 1: LLM decides to call the tool
response = client.messages.create(
    model=LLM_MODEL, max_tokens=1024, tools=tools, messages=messages,
)
# first tool_use content block carries the call id and parsed arguments
tool_block = next(b for b in response.content if b.type == "tool_use")
result = get_orders(**tool_block.input)

# step 2: send tool result back, LLM generates final response
# the assistant turn (including its tool_use block) is echoed back verbatim
messages.append({"role": "assistant", "content": response.content})
# tool results are delivered as a user turn holding a tool_result block
messages.append({"role": "user", "content": [{
    "type": "tool_result",
    "tool_use_id": tool_block.id,
    "content": result,
}]})

response = client.messages.create(
    model=LLM_MODEL, max_tokens=1024, tools=tools, messages=messages,
)
print(response.content[0].text)
# -> call: get_orders(shipped) for user_123
# -> result: ORD_99: Laptop Stand
# "Your shipped order is ORD_99: Laptop Stand."

Gemini

from google import genai
from google.genai import types

LLM_MODEL = "gemini-pro-latest"
client = genai.Client()

# mock database — in production these are real queries
ORDERS = {
    "user_123": [
        {"id": "ORD_99", "item": "Laptop Stand", "status": "shipped"},
        {"id": "ORD_100", "item": "USB Hub", "status": "processing"},
    ],
}

# status is chosen by the LLM — user_id comes from your app (auth, session, etc.)
# only status appears in the tool schema — the LLM can't see or fabricate user_id
user_id = "user_123"

def get_orders(status: str) -> str:
    """Get orders filtered by status."""
    # NOTE: with Gemini automatic function calling the docstring doubles as
    # the tool description sent to the model, so it is kept unchanged.
    print(f"-> call: get_orders({status}) for {user_id}")
    matching = [
        entry
        for entry in ORDERS[user_id]
        if entry["status"] == status
    ]
    if matching:
        result = ", ".join(f'{entry["id"]}: {entry["item"]}' for entry in matching)
    else:
        result = "No orders found."
    print(f"-> result: {result}")
    return result

# automatic function calling: SDK executes the tool and feeds results back
# (passing a plain Python callable in `tools` enables this mode)
config = types.GenerateContentConfig(tools=[get_orders])

response = client.models.generate_content(
    model=LLM_MODEL,
    config=config,
    contents="Do I have any shipped orders?",
)
# final answer only — the tool-call round trip happened inside the SDK
print(response.text)
# -> call: get_orders(shipped) for user_123
# -> result: ORD_99: Laptop Stand
# "Your shipped order is ORD_99: Laptop Stand."

Pydantic AI

from dataclasses import dataclass
from pydantic_ai import Agent, RunContext

LLM_MODEL = "openai:gpt-5.4"

# mock database — in production these are real queries
ORDERS = {
    "user_123": [
        {"id": "ORD_99", "item": "Laptop Stand", "status": "shipped"},
        {"id": "ORD_100", "item": "USB Hub", "status": "processing"},
    ],
}

@dataclass
class Deps:
    # app-side dependencies injected per run; never visible to the LLM
    user_id: str

agent = Agent(LLM_MODEL, deps_type=Deps)

# status is chosen by the LLM — user_id comes from your app (auth, session, etc.)
# RunContext is hidden from the LLM — it can't see or fabricate user_id
@agent.tool
def get_orders(ctx: RunContext[Deps], status: str) -> str:
    """Get orders filtered by status."""
    # identity comes from the injected deps, never from the model
    uid = ctx.deps.user_id
    print(f"-> call: get_orders({status}) for {uid}")
    hits = [order for order in ORDERS[uid] if order["status"] == status]
    result = ", ".join(f'{order["id"]}: {order["item"]}' for order in hits) or "No orders found."
    print(f"-> result: {result}")
    return result

# run_sync drives the full tool-call loop; deps are threaded to every tool
result = agent.run_sync(
    "Do I have any shipped orders?",
    deps=Deps(user_id="user_123"),
)
print(result.output)
# -> call: get_orders(shipped) for user_123
# -> result: ORD_99: Laptop Stand
# "Your shipped order is ORD_99: Laptop Stand."

LangGraph

from dataclasses import dataclass
from langchain.tools import tool, ToolRuntime
from langchain.agents import create_agent
from langchain_openai import ChatOpenAI

LLM_MODEL = "gpt-5.4"
model = ChatOpenAI(model=LLM_MODEL)

# mock database — in production these are real queries
ORDERS = {
    "user_123": [
        {"id": "ORD_99", "item": "Laptop Stand", "status": "shipped"},
        {"id": "ORD_100", "item": "USB Hub", "status": "processing"},
    ],
}

@dataclass
class UserContext:
    # per-request identity supplied by the app, not the LLM
    user_id: str

# status is chosen by the LLM — user_id comes from your app (auth, session, etc.)
# ToolRuntime is hidden from the LLM — it can't see or fabricate user_id
@tool
def get_orders(status: str, runtime: ToolRuntime) -> str:
    """Get orders filtered by status."""
    # runtime.context carries app-supplied values; the LLM never sees it
    uid = runtime.context.user_id
    print(f"-> call: get_orders({status}) for {uid}")
    found = [entry for entry in ORDERS[uid] if entry["status"] == status]
    if found:
        result = ", ".join(f'{entry["id"]}: {entry["item"]}' for entry in found)
    else:
        result = "No orders found."
    print(f"-> result: {result}")
    return result

# context_schema registers UserContext so tools can read it via ToolRuntime
agent = create_agent(model, [get_orders], context_schema=UserContext)
result = agent.invoke(
    {"messages": [("user", "Do I have any shipped orders?")]},
    context=UserContext(user_id="user_123"),
)
# last message in the returned state is the model's final answer
print(result["messages"][-1].content)
# -> call: get_orders(shipped) for user_123
# -> result: ORD_99: Laptop Stand
# "Your shipped order is ORD_99: Laptop Stand."

AI SDK

import { ToolLoopAgent, tool } from "ai";
import { openai } from "@ai-sdk/openai";
import { z } from "zod";

const LLM_MODEL = "gpt-5.4";

// mock database — in production these are real queries
const ORDERS: Record<string, { id: string; item: string; status: string }[]> = {
  user_123: [
    { id: "ORD_99", item: "Laptop Stand", status: "shipped" },
    { id: "ORD_100", item: "USB Hub", status: "processing" },
  ],
};

// status is chosen by the LLM — userId comes from your app (auth, session, etc.)
// only status appears in the tool schema — the LLM can't see or fabricate userId
function createTools(userId: string) {
  // The closure captures userId, so it never appears in the LLM-visible schema.
  const getOrders = tool({
    description: "Get orders filtered by status.",
    inputSchema: z.object({ status: z.string() }),
    execute: async ({ status }) => {
      console.log(`-> call: get_orders(${status}) for ${userId}`);
      const hits = ORDERS[userId].filter((order) => order.status === status);
      let result = "No orders found.";
      if (hits.length > 0) {
        result = hits.map((order) => `${order.id}: ${order.item}`).join(", ");
      }
      console.log(`-> result: ${result}`);
      return result;
    },
  });
  return { getOrders };
}

// bind the authenticated user's id into the tool closures
const tools = createTools("user_123");

const agent = new ToolLoopAgent({
  model: openai(LLM_MODEL),
  tools,
});

// the agent loops: model -> tool call -> tool result -> final text
const result = await agent.generate({
  prompt: "Do I have any shipped orders?",
});
console.log(result.text);
// -> call: get_orders(shipped) for user_123
// -> result: ORD_99: Laptop Stand
// "Your shipped order is ORD_99: Laptop Stand."

Mastra

import { Agent } from "@mastra/core/agent";
import { createTool } from "@mastra/core/tools";
import { RequestContext } from "@mastra/core/request-context";
import { z } from "zod";

const LLM_MODEL = "openai/gpt-5.4";

// mock database — in production these are real queries
const ORDERS: Record<string, { id: string; item: string; status: string }[]> = {
  user_123: [
    { id: "ORD_99", item: "Laptop Stand", status: "shipped" },
    { id: "ORD_100", item: "USB Hub", status: "processing" },
  ],
};

// status is chosen by the LLM — userId comes from your app (auth, session, etc.)
// requestContext is hidden from the LLM — it can't see or fabricate userId
const getOrders = createTool({
  id: "get-orders",
  description: "Get orders filtered by status.",
  inputSchema: z.object({ status: z.string() }),
  // validates the request-context shape at runtime
  requestContextSchema: z.object({ userId: z.string() }),
  execute: async ({ status }, { requestContext }) => {
    const userId = requestContext.get("userId");
    console.log(`-> call: get_orders(${status}) for ${userId}`);
    const matches = ORDERS[userId].filter((o) => o.status === status);
    const result = matches.map((o) => `${o.id}: ${o.item}`).join(", ") || "No orders found.";
    console.log(`-> result: ${result}`);
    return result;
  },
});

const agent = new Agent({
  name: "order-agent",
  instructions: "You are a helpful assistant.",
  model: LLM_MODEL,
  tools: { getOrders },
});

// request-scoped key/value store carrying the authenticated user's id
const requestContext = new RequestContext([["userId", "user_123"]]);

const result = await agent.generate("Do I have any shipped orders?", {
  requestContext,
});
console.log(result.text);
// -> call: get_orders(shipped) for user_123
// -> result: ORD_99: Laptop Stand
// "Your shipped order is ORD_99: Laptop Stand."

Dynamic Instructions

OpenAI

from openai import OpenAI

LLM_MODEL = "gpt-5.4"
client = OpenAI()

# mock database — in production these are real queries
CUSTOMERS = {
    "user_123": {"name": "Acme Corp", "plan": "enterprise"},
    "user_456": {"name": "Jane Smith", "plan": "free"},
}
OVERDUE_INVOICES = {
    "user_123": [{"id": "INV-42", "amount": 1200}],
    "user_456": [],
}

# instructions are built at request time from live data — not a static string
# the LLM sees personalized context without querying the database itself
def build_instructions(user_id: str) -> str:
    """Assemble a per-user system prompt from live account data.

    Starts with the customer's name/plan, then appends an overdue-invoice
    alert and/or an enterprise-escalation note when they apply.
    """
    profile = CUSTOMERS[user_id]
    invoices = OVERDUE_INVOICES[user_id]
    parts = [f"Customer: {profile['name']}, plan: {profile['plan']}."]
    if invoices:
        parts.append(
            f"ALERT: {len(invoices)} overdue invoice(s). Prioritize payment resolution."
        )
    if profile["plan"] == "enterprise":
        parts.append("This is a premium customer. Offer direct escalation.")
    return "\n".join(parts)

# same agent, same question — behavior changes based on who's asking
# user_123: enterprise plan with an overdue invoice
response = client.responses.create(
    model=LLM_MODEL,
    instructions=build_instructions("user_123"),
    input="I need help with my account.",
)
print(response.output_text)
# "I see there's an overdue invoice on your account. Let me help
#  resolve that. As a premium customer, I can escalate directly."

# user_456: free plan, nothing outstanding
response = client.responses.create(
    model=LLM_MODEL,
    instructions=build_instructions("user_456"),
    input="I need help with my account.",
)
print(response.output_text)
# "Sure, I'd be happy to help! What do you need assistance with?"

Anthropic

import anthropic

LLM_MODEL = "claude-opus-4-6"
client = anthropic.Anthropic()

# mock database — in production these are real queries
CUSTOMERS = {
    "user_123": {"name": "Acme Corp", "plan": "enterprise"},
    "user_456": {"name": "Jane Smith", "plan": "free"},
}
OVERDUE_INVOICES = {
    "user_123": [{"id": "INV-42", "amount": 1200}],
    "user_456": [],
}

# instructions are built at request time from live data — not a static string
# the LLM sees personalized context without querying the database itself
def build_instructions(user_id: str) -> str:
    """Compose the system prompt for one user from their account records."""
    customer = CUSTOMERS[user_id]
    segments = [f"Customer: {customer['name']}, plan: {customer['plan']}."]
    # number of overdue invoices drives the alert line
    overdue_count = len(OVERDUE_INVOICES[user_id])
    if overdue_count > 0:
        segments.append(
            f"ALERT: {overdue_count} overdue invoice(s). Prioritize payment resolution."
        )
    if customer["plan"] == "enterprise":
        segments.append("This is a premium customer. Offer direct escalation.")
    return "\n".join(segments)

# same agent, same question — behavior changes based on who's asking
# user_123: enterprise plan with an overdue invoice
response = client.messages.create(
    model=LLM_MODEL,
    max_tokens=1024,
    system=build_instructions("user_123"),
    messages=[{"role": "user", "content": "I need help with my account."}],
)
print(response.content[0].text)
# "I see there's an overdue invoice on your account. Let me help
#  resolve that. As a premium customer, I can escalate directly."

# user_456: free plan, nothing outstanding
response = client.messages.create(
    model=LLM_MODEL,
    max_tokens=1024,
    system=build_instructions("user_456"),
    messages=[{"role": "user", "content": "I need help with my account."}],
)
print(response.content[0].text)
# "Sure, I'd be happy to help! What do you need assistance with?"

Gemini

from google import genai
from google.genai import types

LLM_MODEL = "gemini-pro-latest"
client = genai.Client()

# mock database — in production these are real queries
CUSTOMERS = {
    "user_123": {"name": "Acme Corp", "plan": "enterprise"},
    "user_456": {"name": "Jane Smith", "plan": "free"},
}
OVERDUE_INVOICES = {
    "user_123": [{"id": "INV-42", "amount": 1200}],
    "user_456": [],
}

# instructions are built at request time from live data — not a static string
# the LLM sees personalized context without querying the database itself
def build_instructions(user_id: str) -> str:
    """Produce a user-specific system prompt from account + billing data."""
    customer = CUSTOMERS[user_id]
    text = f"Customer: {customer['name']}, plan: {customer['plan']}."
    overdue = OVERDUE_INVOICES[user_id]
    if overdue:
        text += (
            f"\nALERT: {len(overdue)} overdue invoice(s). Prioritize payment resolution."
        )
    if customer["plan"] == "enterprise":
        text += "\nThis is a premium customer. Offer direct escalation."
    return text

# same agent, same question — behavior changes based on who's asking
# user_123: enterprise plan with an overdue invoice
response = client.models.generate_content(
    model=LLM_MODEL,
    config=types.GenerateContentConfig(
        system_instruction=build_instructions("user_123"),
    ),
    contents="I need help with my account.",
)
print(response.text)
# "I see there's an overdue invoice on your account. Let me help
#  resolve that. As a premium customer, I can escalate directly."

# user_456: free plan, nothing outstanding
response = client.models.generate_content(
    model=LLM_MODEL,
    config=types.GenerateContentConfig(
        system_instruction=build_instructions("user_456"),
    ),
    contents="I need help with my account.",
)
print(response.text)
# "Sure, I'd be happy to help! What do you need assistance with?"

Pydantic AI

from dataclasses import dataclass
from pydantic_ai import Agent, RunContext

LLM_MODEL = "openai:gpt-5.4"

# mock database — in production these are real queries
CUSTOMERS = {
    "user_123": {"name": "Acme Corp", "plan": "enterprise"},
    "user_456": {"name": "Jane Smith", "plan": "free"},
}
OVERDUE_INVOICES = {
    "user_123": [{"id": "INV-42", "amount": 1200}],
    "user_456": [],
}

@dataclass
class Deps:
    # app-side per-run dependencies; never visible to the LLM
    user_id: str

agent = Agent(LLM_MODEL, deps_type=Deps)

# instructions are built at request time from live data — not a static string
# the LLM sees personalized context without querying the database itself
@agent.instructions
def dynamic_instructions(ctx: RunContext[Deps]) -> str:
    """Build the per-run system prompt from the caller's Deps."""
    uid = ctx.deps.user_id
    customer = CUSTOMERS[uid]
    prompt_lines = [f"Customer: {customer['name']}, plan: {customer['plan']}."]
    overdue_invoices = OVERDUE_INVOICES[uid]
    if overdue_invoices:
        prompt_lines.append(
            f"ALERT: {len(overdue_invoices)} overdue invoice(s). Prioritize payment resolution."
        )
    if customer["plan"] == "enterprise":
        prompt_lines.append("This is a premium customer. Offer direct escalation.")
    return "\n".join(prompt_lines)

# same agent, same question — behavior changes based on who's asking
# user_123: enterprise plan with an overdue invoice
result = agent.run_sync(
    "I need help with my account.",
    deps=Deps(user_id="user_123"),
)
print(result.output)
# "I see there's an overdue invoice on your account. Let me help
#  resolve that. As a premium customer, I can escalate directly."

# user_456: free plan, nothing outstanding
result = agent.run_sync(
    "I need help with my account.",
    deps=Deps(user_id="user_456"),
)
print(result.output)
# "Sure, I'd be happy to help! What do you need assistance with?"

LangGraph

from dataclasses import dataclass
from langchain.agents import create_agent
from langchain.agents.middleware import dynamic_prompt
from langchain_openai import ChatOpenAI

LLM_MODEL = "gpt-5.4"
model = ChatOpenAI(model=LLM_MODEL)

# mock database — in production these are real queries
CUSTOMERS = {
    "user_123": {"name": "Acme Corp", "plan": "enterprise"},
    "user_456": {"name": "Jane Smith", "plan": "free"},
}
OVERDUE_INVOICES = {
    "user_123": [{"id": "INV-42", "amount": 1200}],
    "user_456": [],
}

@dataclass
class UserContext:
    # per-request identity supplied by the app, not the LLM
    user_id: str

# @dynamic_prompt middleware builds the system message from context at runtime
# the LLM sees personalized context without querying the database itself
@dynamic_prompt
def build_prompt(request):
    """Compose the system prompt for this request from its user context."""
    uid = request.runtime.context.user_id
    customer = CUSTOMERS[uid]
    pieces = [f"Customer: {customer['name']}, plan: {customer['plan']}."]
    overdue_invoices = OVERDUE_INVOICES[uid]
    if overdue_invoices:
        pieces.append(
            f"ALERT: {len(overdue_invoices)} overdue invoice(s). Prioritize payment resolution."
        )
    if customer["plan"] == "enterprise":
        pieces.append("This is a premium customer. Offer direct escalation.")
    return "\n".join(pieces)

# middleware list wires the dynamic prompt into every model call
agent = create_agent(
    model, tools=[], middleware=[build_prompt], context_schema=UserContext,
)

# same agent, same question — behavior changes based on who's asking
# user_123: enterprise plan with an overdue invoice
result = agent.invoke(
    {"messages": [("user", "I need help with my account.")]},
    context=UserContext(user_id="user_123"),
)
print(result["messages"][-1].content)
# "I see there's an overdue invoice on your account. Let me help
#  resolve that. As a premium customer, I can escalate directly."

# user_456: free plan, nothing outstanding
result = agent.invoke(
    {"messages": [("user", "I need help with my account.")]},
    context=UserContext(user_id="user_456"),
)
print(result["messages"][-1].content)
# "Sure, I'd be happy to help! What do you need assistance with?"

AI SDK

import { ToolLoopAgent } from "ai";
import { openai } from "@ai-sdk/openai";
import { z } from "zod";

const LLM_MODEL = "gpt-5.4";

// mock database — in production these are real queries
const CUSTOMERS: Record<string, { name: string; plan: string }> = {
  user_123: { name: "Acme Corp", plan: "enterprise" },
  user_456: { name: "Jane Smith", plan: "free" },
};
const OVERDUE_INVOICES: Record<string, { id: string; amount: number }[]> = {
  user_123: [{ id: "INV-42", amount: 1200 }],
  user_456: [],
};

// Build a per-user system prompt from live account data.
function buildInstructions(userId: string): string {
  const { name, plan } = CUSTOMERS[userId];
  const overdueCount = OVERDUE_INVOICES[userId].length;
  const lines = [`Customer: ${name}, plan: ${plan}.`];
  if (overdueCount > 0) {
    lines.push(
      `ALERT: ${overdueCount} overdue invoice(s). Prioritize payment resolution.`,
    );
  }
  if (plan === "enterprise") {
    lines.push("This is a premium customer. Offer direct escalation.");
  }
  return lines.join("\n");
}

// callOptionsSchema defines per-request parameters — prepareCall builds the prompt
// the LLM sees personalized context without querying the database itself
const agent = new ToolLoopAgent({
  model: openai(LLM_MODEL),
  callOptionsSchema: z.object({
    userId: z.string(),
  }),
  // runs before each call: injects per-user instructions into the settings
  prepareCall: async ({ options, ...settings }) => ({
    ...settings,
    instructions: buildInstructions(options.userId),
  }),
});

// same agent, same question — behavior changes based on who's asking
// user_123: enterprise plan with an overdue invoice
let result = await agent.generate({
  prompt: "I need help with my account.",
  options: { userId: "user_123" },
});
console.log(result.text);
// "I see there's an overdue invoice on your account. Let me help
//  resolve that. As a premium customer, I can escalate directly."

// user_456: free plan, nothing outstanding
result = await agent.generate({
  prompt: "I need help with my account.",
  options: { userId: "user_456" },
});
console.log(result.text);
// "Sure, I'd be happy to help! What do you need assistance with?"

Mastra

import { Agent } from "@mastra/core/agent";
import { RequestContext } from "@mastra/core/request-context";

const LLM_MODEL = "openai/gpt-5.4";

// mock database — in production these are real queries
const CUSTOMERS: Record<string, { name: string; plan: string }> = {
  user_123: { name: "Acme Corp", plan: "enterprise" },
  user_456: { name: "Jane Smith", plan: "free" },
};
const OVERDUE_INVOICES: Record<string, { id: string; amount: number }[]> = {
  user_123: [{ id: "INV-42", amount: 1200 }],
  user_456: [],
};

// instructions function receives requestContext — not a static string
// the LLM sees personalized context without querying the database itself
const agent = new Agent({
  name: "support-agent",
  model: LLM_MODEL,
  instructions: ({ requestContext }) => {
    // identity comes from the request-scoped store, never from the LLM
    const userId = requestContext.get("userId");
    const customer = CUSTOMERS[userId];
    const overdue = OVERDUE_INVOICES[userId];
    const lines = [`Customer: ${customer.name}, plan: ${customer.plan}.`];
    if (overdue.length) {
      lines.push(
        `ALERT: ${overdue.length} overdue invoice(s). Prioritize payment resolution.`,
      );
    }
    if (customer.plan === "enterprise") {
      lines.push("This is a premium customer. Offer direct escalation.");
    }
    return lines.join("\n");
  },
});

// same agent, same question — behavior changes based on who's asking
// user_123: enterprise plan with an overdue invoice
let requestContext = new RequestContext([["userId", "user_123"]]);
let result = await agent.generate("I need help with my account.", {
  requestContext,
});
console.log(result.text);
// "I see there's an overdue invoice on your account. Let me help
//  resolve that. As a premium customer, I can escalate directly."

// user_456: free plan, nothing outstanding
requestContext = new RequestContext([["userId", "user_456"]]);
result = await agent.generate("I need help with my account.", {
  requestContext,
});
console.log(result.text);
// "Sure, I'd be happy to help! What do you need assistance with?"

Prompt Caching

OpenAI

from pathlib import Path
from openai import OpenAI

LLM_MODEL = "gpt-5.4"
client = OpenAI()

# prompt caching is automatic — no opt-in, no code changes
# any prefix >= 1024 tokens is cached on first request, reused on subsequent ones
# static content (instructions, examples) should go first for best hit rate

KNOWLEDGE_BASE = Path("knowledge_base.txt").read_text()  # ~4100 tokens

# request 1: cold cache — prompt is processed and cached automatically
r1 = client.responses.create(
    model=LLM_MODEL,
    instructions=KNOWLEDGE_BASE,
    input="I keep getting 429 errors. What should I do?",
)
print(r1.output_text)
# usage.input_tokens_details.cached_tokens reports how much prefix was reused
cached_1 = r1.usage.input_tokens_details.cached_tokens
print(f"Cached tokens: {cached_1}")
# -> Cached tokens: 0 (cache miss — prefix is now stored)

# request 2: warm cache — identical instruction prefix served from cache
r2 = client.responses.create(
    model=LLM_MODEL,
    instructions=KNOWLEDGE_BASE,
    input="How do I fix SSO login failures?",
)
print(r2.output_text)
cached_2 = r2.usage.input_tokens_details.cached_tokens
print(f"Cached tokens: {cached_2}")
# -> Cached tokens: 3328 (cache hit — lower cost, lower latency)

Anthropic

from pathlib import Path
import anthropic

LLM_MODEL = "claude-opus-4-6"
client = anthropic.Anthropic()

# prompt caching requires explicit opt-in via cache_control breakpoints
# marks a prefix boundary — everything up to this point is cached
# 25% surcharge on cache writes, 90% discount on cache reads, 5-min TTL

KNOWLEDGE_BASE = Path("knowledge_base.txt").read_text()  # ~4100 tokens

# cache_control breakpoint on the system block — up to 4 breakpoints allowed
system = [{
    "type": "text",
    "text": KNOWLEDGE_BASE,
    "cache_control": {"type": "ephemeral"},  # 5-min TTL, refreshed on each hit
}]

# request 1: cache write — prompt stored, surcharge applies
r1 = client.messages.create(
    model=LLM_MODEL, max_tokens=1024, system=system,
    messages=[{"role": "user", "content": "I keep getting 429 errors. What should I do?"}],
)
print(r1.content[0].text)
# usage exposes separate counters for tokens written to vs read from the cache
print(f"Cache write: {r1.usage.cache_creation_input_tokens} tokens")
print(f"Cache read:  {r1.usage.cache_read_input_tokens} tokens")
# -> Cache write: 4182 tokens, Cache read: 0 tokens

# request 2: cache hit — prefix served from cache at 90% discount
r2 = client.messages.create(
    model=LLM_MODEL, max_tokens=1024, system=system,
    messages=[{"role": "user", "content": "How do I fix SSO login failures?"}],
)
print(r2.content[0].text)
print(f"Cache write: {r2.usage.cache_creation_input_tokens} tokens")
print(f"Cache read:  {r2.usage.cache_read_input_tokens} tokens")
# -> Cache write: 0 tokens, Cache read: 4182 tokens

Gemini

from pathlib import Path
from google import genai
from google.genai import types

LLM_MODEL = "gemini-pro-latest"
client = genai.Client()

# Gemini caching creates a named server-side resource with explicit lifecycle
# create once, reference across many requests, delete when done
# note: cached content must be >= 1024 tokens

KNOWLEDGE_BASE = Path("knowledge_base.txt").read_text()  # ~4100 tokens

# step 1: create a named cache with explicit TTL
cache = client.caches.create(
    model=LLM_MODEL,
    config=types.CreateCachedContentConfig(
        system_instruction=KNOWLEDGE_BASE,
        ttl="300s",  # 5-minute TTL — extend with client.caches.update()
    ),
)
# cache.name is the server-assigned resource id used in later requests
print(f"Cache created: {cache.name}")

# step 2: reference the cache — no need to resend the prompt
r1 = client.models.generate_content(
    model=LLM_MODEL,
    contents="I keep getting 429 errors. What should I do?",
    config=types.GenerateContentConfig(cached_content=cache.name),
)
print(r1.text)
print(f"Cached tokens: {r1.usage_metadata.cached_content_token_count}")
# -> Cached tokens: 4182

# same cache, different question — cached tokens reused
r2 = client.models.generate_content(
    model=LLM_MODEL,
    contents="How do I fix SSO login failures?",
    config=types.GenerateContentConfig(cached_content=cache.name),
)
print(r2.text)
print(f"Cached tokens: {r2.usage_metadata.cached_content_token_count}")
# -> Cached tokens: 4182

# step 3: cleanup — delete when done, or let TTL expire
client.caches.delete(name=cache.name)

Pydantic AI

from pathlib import Path
from pydantic_ai import Agent

LLM_MODEL = "openai:gpt-5.4"

# prompt caching is automatic with OpenAI — no extra settings needed
# any prefix >= 1024 tokens is cached on first request, reused on subsequent ones
# static content (instructions, examples) should go first for best hit rate

KNOWLEDGE_BASE = Path("knowledge_base.txt").read_text()  # ~4100 tokens

agent = Agent(LLM_MODEL, instructions=KNOWLEDGE_BASE)

# request 1: cold cache — prompt is processed and cached automatically
result = agent.run_sync("I keep getting 429 errors. What should I do?")
print(result.output)
# usage() surfaces the provider-reported token counters, including cache reads
print(f"Usage: {result.usage()}")
# -> cache_read_tokens = 0 (cache miss — prefix is now stored)

# request 2: warm cache — identical instruction prefix served from cache
result = agent.run_sync("How do I fix SSO login failures?")
print(result.output)
print(f"Usage: {result.usage()}")
# -> cache_read_tokens = 2816 (cache hit — lower cost, lower latency)

LangGraph

from pathlib import Path
from langchain.agents import create_agent
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage

LLM_MODEL = "gpt-5.4"
model = ChatOpenAI(model=LLM_MODEL)

# OpenAI caching is automatic — no cache_control needed
# identical prefixes >= 1024 tokens are cached and reused
# static content (instructions, examples) should go first for best hit rate

KNOWLEDGE_BASE = Path("knowledge_base.txt").read_text()  # ~4100 tokens

# the big static prompt goes in the system message, ahead of the user turn
system = SystemMessage(content=KNOWLEDGE_BASE)

agent = create_agent(model, tools=[])

# request 1: cold cache — prompt is processed and cached automatically
result = agent.invoke({
    "messages": [system, ("user", "I keep getting 429 errors. What should I do?")],
})
ai_msg = result["messages"][-1]
print(ai_msg.content)
# provider token counters ride along in response_metadata
usage = ai_msg.response_metadata["token_usage"]
print(f"Cached tokens: {usage['prompt_tokens_details']['cached_tokens']}")
# -> Cached tokens: 0 (cache miss — prefix is now stored)

# request 2: warm cache — identical prefix served from cache
result = agent.invoke({
    "messages": [system, ("user", "How do I fix SSO login failures?")],
})
ai_msg = result["messages"][-1]
print(ai_msg.content)
usage = ai_msg.response_metadata["token_usage"]
print(f"Cached tokens: {usage['prompt_tokens_details']['cached_tokens']}")
# -> Cached tokens: 2816 (cache hit — lower cost, lower latency)

AI SDK

import { readFileSync } from "fs";
import { generateText } from "ai";
import { openai } from "@ai-sdk/openai";

const LLM_MODEL = "gpt-5.4";

// prompt caching is automatic with OpenAI — no providerOptions needed
// identical prefixes >= 1024 tokens are cached and reused
// static content (instructions, examples) should go first for best hit rate

const KNOWLEDGE_BASE = readFileSync("knowledge_base.txt", "utf-8"); // ~4100 tokens

// request 1: cold cache — prompt is processed and cached automatically
const r1 = await generateText({
  model: openai(LLM_MODEL),
  system: KNOWLEDGE_BASE,
  prompt: "I keep getting 429 errors. What should I do?",
});
console.log(r1.text);
// usage.cachedInputTokens is undefined when the provider reports nothing
console.log(`Cached tokens: ${r1.usage.cachedInputTokens ?? 0}`);
// -> Cached tokens: 0 (cache miss — prefix is now stored)

// request 2: warm cache — identical prefix served from cache
const r2 = await generateText({
  model: openai(LLM_MODEL),
  system: KNOWLEDGE_BASE,
  prompt: "How do I fix SSO login failures?",
});
console.log(r2.text);
console.log(`Cached tokens: ${r2.usage.cachedInputTokens ?? 0}`);
// -> Cached tokens: 3200 (cache hit — lower cost, lower latency)

Mastra

import { readFileSync } from "fs";
import { Agent } from "@mastra/core/agent";

const LLM_MODEL = "openai/gpt-5.4";

// prompt caching is automatic with OpenAI — no providerOptions needed
// identical prefixes >= 1024 tokens are cached and reused
// static content (instructions, examples) should go first for best hit rate

const KNOWLEDGE_BASE = readFileSync("knowledge_base.txt", "utf-8"); // ~4100 tokens

// the static knowledge base becomes the agent's instruction prefix
const agent = new Agent({
  name: "support-agent",
  instructions: KNOWLEDGE_BASE,
  model: LLM_MODEL,
});

// request 1: cold cache — prompt is processed and cached automatically
const r1 = await agent.generate("I keep getting 429 errors. What should I do?");
console.log(r1.text);
console.log(`Cached tokens: ${r1.usage.cachedInputTokens ?? 0}`);
// -> Cached tokens: 0 (cache miss — prefix is now stored)

// request 2: warm cache — identical instruction prefix served from cache
const r2 = await agent.generate("How do I fix SSO login failures?");
console.log(r2.text);
console.log(`Cached tokens: ${r2.usage.cachedInputTokens ?? 0}`);
// -> Cached tokens: 3200 (cache hit — lower cost, lower latency)