Most "Claude integrations" wire a chatbot input to the API and call it a day. Production AI looks different. The function below is what we ship: a typed call to messages.create, system prompt and reference document cached, a lookupCustomer tool that the model can call, the response streamed back to the client over Server-Sent Events.
1. The typed client
The Anthropic SDK is wrapped in a thin client that adds retries, error mapping and a typed model union. Model selection happens at the call site; the client does not pick.
// src/lib/claude/client.ts
import Anthropic from '@anthropic-ai/sdk'
/**
 * Shared Anthropic SDK client for the application.
 *
 * The API key is read from the ANTHROPIC_API_KEY environment variable and the
 * client is configured with `maxRetries: 3`.
 */
const apiKey = process.env.ANTHROPIC_API_KEY!
export const claude = new Anthropic({ apiKey, maxRetries: 3 })
/** Model identifiers the application is allowed to pass as `model`. */
export type ClaudeModel = 'claude-opus-4-7' | 'claude-sonnet-4-6' | 'claude-haiku-4-5'
/** Task tiers a call site can choose between. */
type TaskTier = 'reasoning' | 'standard' | 'cheap'

/**
 * Lookup table from task tier to the model used for it. Call sites index into
 * this table to choose a model.
 */
export const MODEL_BY_TASK: Record<TaskTier, ClaudeModel> = {
  reasoning: 'claude-opus-4-7',
  standard: 'claude-sonnet-4-6',
  cheap: 'claude-haiku-4-5',
}
2. The tool definition
A tool is a typed function with a JSON schema. Zod produces the schema; the handler runs in the application's normal transactional flow. The model never touches the database directly.
// src/lib/claude/tools/lookup-customer.ts
import { z } from 'zod'
import { zodToJsonSchema } from 'zod-to-json-schema'
import { adminClient } from '@/lib/supabase/admin'
/**
 * Zod schema for the `lookupCustomer` tool input: an object with a single
 * `email` field that must be a string in email format.
 */
const emailField = z.string().email()
export const lookupCustomerInput = z.object({ email: emailField })
/**
 * Tool definition handed to the Messages API in the `tools` array.
 *
 * The JSON Schema is generated from `lookupCustomerInput`, so the schema the
 * model sees and the runtime validation share one source.
 *
 * The schema is asserted to a shape that keeps the `type: 'object'`
 * discriminator. A bare `Record<string, unknown>` drops that required
 * property, so the tool no longer matches the SDK's tool type where it is
 * passed as `tools: [lookupCustomerTool]`. A `z.object(...)` schema always
 * serialises with `type: 'object'`, which is what makes the assertion sound.
 */
export const lookupCustomerTool = {
  name: 'lookupCustomer',
  description: 'Find a customer by email. Returns the canonical record or null.',
  input_schema: zodToJsonSchema(lookupCustomerInput, {
    target: 'jsonSchema7',
  }) as { type: 'object'; [key: string]: unknown },
} as const
/**
 * Handler for the `lookupCustomer` tool.
 *
 * Queries the `customers` table for the row whose `email` equals
 * `input.email`, selecting `id, name, plan, status, created_at`.
 *
 * NOTE(review): the query filters on email only and goes through
 * `adminClient`; nothing here scopes the lookup to a tenant. Confirm that is
 * intended before exposing this to multi-tenant traffic.
 *
 * @param input - Already-validated tool input containing the email to look up.
 * @returns The matching row, or `null` when the query yields no data.
 * @throws The error object reported by the query, if any.
 */
export async function runLookupCustomer(
  input: z.infer<typeof lookupCustomerInput>,
): Promise<unknown> {
  const columns = 'id, name, plan, status, created_at'
  const response = await adminClient
    .from('customers')
    .select(columns)
    .eq('email', input.email)
    .maybeSingle()

  if (response.error) {
    throw response.error
  }
  return response.data ?? null
}
3. The streaming call with prompt caching
The system prompt and the reference document both carry cache_control. The first request writes the cache; subsequent calls made while that cache entry is still live pay the cache-read rate (roughly ten percent of standard input) on those blocks instead of the full input rate. The conversation messages themselves stay uncached because they change every turn.
// src/lib/claude/run.ts
import { claude, MODEL_BY_TASK } from './client'
import { lookupCustomerTool, runLookupCustomer, lookupCustomerInput } from './tools/lookup-customer'
import { systemPrompt } from './prompts/system'
import { referenceDoc } from './prompts/reference'
/** Arguments accepted by `runClaude`. */
interface RunInput {
  /** Chat turns forwarded to the API as `messages`. */
  conversation: Array<{ role: 'user' | 'assistant'; content: string }>
  /** Tenant identifier supplied by the caller. */
  tenantId: string
}
/**
 * Event emitted by `runClaude` after a tool call has been executed. The same
 * object is sent back to the model as a `tool_result` content block.
 */
type ToolResultEvent = {
  type: 'tool_result'
  tool_use_id: string
  content: string
  is_error?: true
}

/** Upper bound on model -> tool -> model round trips within one `runClaude` call. */
const MAX_TOOL_ROUNDS = 8

/**
 * Decodes and validates the streamed input of one tool call and runs the tool.
 *
 * Bad input from the model (malformed JSON, schema violation, unknown tool
 * name) is reported as an `is_error` result so the model can correct itself;
 * errors thrown by the tool handler itself still propagate to the caller.
 *
 * @returns The decoded tool input (needed to replay the assistant turn) and
 *          the result event.
 */
async function executeToolCall(
  call: { id: string; name: string },
  rawJson: string,
): Promise<{ input: unknown; event: ToolResultEvent }> {
  const fail = (
    input: unknown,
    message: string,
  ): { input: unknown; event: ToolResultEvent } => ({
    input,
    event: { type: 'tool_result', tool_use_id: call.id, content: message, is_error: true },
  })

  let toolInput: unknown
  try {
    // A tool called with no arguments streams no JSON fragments at all.
    toolInput = JSON.parse(rawJson === '' ? '{}' : rawJson)
  } catch {
    return fail({}, 'Tool input was not valid JSON.')
  }

  if (call.name !== lookupCustomerTool.name) {
    return fail(toolInput, `Unknown tool: ${call.name}`)
  }

  const parsed = lookupCustomerInput.safeParse(toolInput)
  if (!parsed.success) {
    return fail(toolInput, `Invalid input for ${call.name}: ${parsed.error.message}`)
  }

  const record = await runLookupCustomer(parsed.data)
  return {
    input: toolInput,
    event: { type: 'tool_result', tool_use_id: call.id, content: JSON.stringify(record) },
  }
}

/**
 * Streams a model response for `input.conversation`, executing tool calls and
 * feeding their results back to the model until it stops asking for tools.
 *
 * Yields every raw stream event from the API, plus one
 * `{ type: 'tool_result', ... }` event immediately after each tool call
 * finishes. When a tool round trip happens, consumers see one
 * `message_start` ... `message_stop` sequence per model turn.
 *
 * Tool calls are reconstructed from the stream: `content_block_start`
 * announces the `tool_use` block (id and name), the input arrives as
 * `input_json_delta` fragments, and `content_block_stop` carries only the
 * block index.
 *
 * NOTE(review): `input.tenantId` is accepted but not used to scope the tool
 * lookup. Confirm whether the customer query should be tenant-filtered.
 *
 * @param input - Conversation so far and the caller's tenant id.
 * @throws Errors from the API call or from the tool handler.
 */
export async function* runClaude(input: RunInput) {
  type AssistantBlock =
    | { type: 'text'; text: string }
    | { type: 'tool_use'; id: string; name: string; input: unknown }
  type Turn = {
    role: 'user' | 'assistant'
    content: string | AssistantBlock[] | ToolResultEvent[]
  }

  const messages: Turn[] = [...input.conversation]

  for (let round = 0; round < MAX_TOOL_ROUNDS; round++) {
    const stream = await claude.messages.create({
      model: MODEL_BY_TASK.standard,
      max_tokens: 4096,
      system: [
        {
          type: 'text',
          text: systemPrompt,
          cache_control: { type: 'ephemeral' },
        },
        {
          type: 'text',
          text: referenceDoc,
          cache_control: { type: 'ephemeral' },
        },
      ],
      messages,
      tools: [lookupCustomerTool],
      stream: true,
    })

    // Content blocks of the assistant turn being streamed, keyed by block index.
    const blocks = new Map<number, AssistantBlock>()
    // Raw JSON text accumulated for each tool_use block's input.
    const inputJson = new Map<number, string>()
    const toolResults: ToolResultEvent[] = []
    let stopReason: string | null = null

    for await (const chunk of stream) {
      yield chunk

      switch (chunk.type) {
        case 'content_block_start': {
          const started = chunk.content_block
          if (started.type === 'text') {
            blocks.set(chunk.index, { type: 'text', text: started.text })
          } else if (started.type === 'tool_use') {
            blocks.set(chunk.index, {
              type: 'tool_use',
              id: started.id,
              name: started.name,
              input: {},
            })
            inputJson.set(chunk.index, '')
          }
          break
        }
        case 'content_block_delta': {
          const block = blocks.get(chunk.index)
          if (block === undefined) break
          if (chunk.delta.type === 'text_delta' && block.type === 'text') {
            block.text += chunk.delta.text
          } else if (chunk.delta.type === 'input_json_delta' && block.type === 'tool_use') {
            inputJson.set(
              chunk.index,
              (inputJson.get(chunk.index) ?? '') + chunk.delta.partial_json,
            )
          }
          break
        }
        case 'content_block_stop': {
          const block = blocks.get(chunk.index)
          if (block === undefined || block.type !== 'tool_use') break
          const outcome = await executeToolCall(block, inputJson.get(chunk.index) ?? '')
          // Keep the decoded input so the assistant turn can be replayed verbatim.
          block.input = outcome.input
          toolResults.push(outcome.event)
          yield outcome.event
          break
        }
        case 'message_delta': {
          stopReason = chunk.delta.stop_reason ?? stopReason
          break
        }
      }
    }

    // Only continue when the model stopped specifically to wait for tool output.
    if (stopReason !== 'tool_use' || toolResults.length === 0) return

    // Replay the assistant turn (dropping empty text blocks, which the API
    // rejects) followed by a user turn carrying every tool result.
    const assistantContent = [...blocks.values()].filter(
      (block) => block.type !== 'text' || block.text !== '',
    )
    messages.push(
      { role: 'assistant', content: assistantContent },
      { role: 'user', content: toolResults },
    )
  }
}
4. The Server-Sent Events route
The streaming generator is piped through a Server-Sent Events transport so the browser receives partial tokens as they arrive. The route handler also writes one row to the request log per call, so cost accounting and audit are not optional.
// app/api/chat/route.ts
import { runClaude } from '@/lib/claude/run'
import { logClaudeRequest } from '@/lib/claude/logging'
import { getServerSession } from '@/lib/auth/server'
// Presumably the Next.js route segment config selecting the Node.js runtime
// for this handler (this file is app/api/chat/route.ts) — confirm.
export const runtime = 'nodejs'
/** Returns true when `value` is a non-empty array of `{ role, content }` chat turns. */
function isConversation(
  value: unknown,
): value is { role: 'user' | 'assistant'; content: string }[] {
  return (
    Array.isArray(value) &&
    value.length > 0 &&
    value.every(
      (turn) =>
        typeof turn === 'object' &&
        turn !== null &&
        (turn.role === 'user' || turn.role === 'assistant') &&
        typeof turn.content === 'string',
    )
  )
}

/**
 * Chat endpoint: streams the model response to the client as Server-Sent
 * Events and writes one request-log row per call.
 *
 * Responses:
 * - 401 when there is no session.
 * - 400 when the body is not JSON or `conversation` is not a non-empty array
 *   of `{ role: 'user' | 'assistant', content: string }`.
 * - otherwise a `text/event-stream` whose `data:` lines are the JSON-encoded
 *   events yielded by `runClaude`. If the stream fails part-way, a final
 *   `{ "type": "error", ... }` event is sent before the stream closes.
 *
 * The log row is written whether the stream completes, fails, or the client
 * disconnects. A failure to write the log is reported on the server console
 * and does not affect the response.
 */
export async function POST(request: Request) {
  const session = await getServerSession()
  if (!session) return new Response('unauthorised', { status: 401 })

  let body: unknown
  try {
    body = await request.json()
  } catch {
    return new Response('invalid JSON body', { status: 400 })
  }
  const conversation = (body as { conversation?: unknown } | null)?.conversation
  if (!isConversation(conversation)) {
    return new Response('invalid conversation', { status: 400 })
  }

  const start = Date.now()
  const encoder = new TextEncoder()
  const send = (
    controller: ReadableStreamDefaultController<Uint8Array>,
    payload: unknown,
  ): void => {
    controller.enqueue(encoder.encode(`data: ${JSON.stringify(payload)}\n\n`))
  }
  // Set when the client goes away, so the resulting enqueue failure is not
  // reported as a stream error.
  let cancelled = false

  const stream = new ReadableStream<Uint8Array>({
    async start(controller) {
      let inputTokens = 0
      let outputTokens = 0
      let cacheHits = 0
      // Output tokens from model turns that have already finished. The
      // per-message count in `message_delta` is added on top of this so that
      // several model turns in one request are summed, not overwritten.
      let finishedOutputTokens = 0
      try {
        for await (const chunk of runClaude({
          conversation,
          tenantId: session.tenantId,
        })) {
          if (chunk.type === 'message_start') {
            finishedOutputTokens = outputTokens
            inputTokens += chunk.message.usage.input_tokens
            cacheHits += chunk.message.usage.cache_read_input_tokens ?? 0
          }
          if (chunk.type === 'message_delta') {
            outputTokens = finishedOutputTokens + chunk.usage.output_tokens
          }
          send(controller, chunk)
        }
      } catch (error: unknown) {
        if (!cancelled) {
          console.error('Claude stream failed', error)
          try {
            // Tell the client the stream ended abnormally; details stay server-side.
            send(controller, {
              type: 'error',
              error: { message: 'The response stream failed.' },
            })
          } catch {
            // The stream is no longer writable; nothing more can be sent.
          }
        }
      } finally {
        try {
          controller.close()
        } catch {
          // Already closed or cancelled by the client. Must not prevent logging.
        }
        try {
          await logClaudeRequest({
            tenantId: session.tenantId,
            userId: session.userId,
            // Must match the model `runClaude` actually requests.
            model: 'claude-sonnet-4-6',
            inputTokens,
            outputTokens,
            cacheHits,
            durationMs: Date.now() - start,
          })
        } catch (logError: unknown) {
          console.error('Failed to write Claude request log', logError)
        }
      }
    },
    cancel() {
      // The next enqueue will throw, which exits the loop above and closes
      // the generator.
      cancelled = true
    },
  })

  return new Response(stream, {
    headers: {
      'Content-Type': 'text/event-stream',
      'Cache-Control': 'no-cache',
      Connection: 'keep-alive',
    },
  })
}
5. What this buys you
Prompt caching cuts the cost of the cached prefix by roughly ninety percent, which on a prompt-heavy request can turn a five-cent call into something close to a half-cent call. Tool use turns the model from a chat surface into an operator that reads your database, calls your APIs, runs your workflows. The audit log gives finance a number to plan around and security a record to sign off on. The whole thing fits in four files because the SDK is well designed and the application stops fighting it.
This is what "AI is the team" means in code, not in the marketing slide.
Further reading