Deploy y Arquitectura AI Full Stack
Desplegar una app AI-First requiere consideraciones únicas: GPUs, latencia de inferencia, cold starts, costos variables y observabilidad de respuestas del modelo.
Arquitectura de referencia
┌─────────────────────────────────────────────┐
│ CDN / Edge │
│ (Cloudflare, Vercel) │
├─────────────────────────────────────────────┤
│ Frontend (React/Next.js) │
│ - Static assets en edge │
│ - Streaming SSR │
├─────────────────────────────────────────────┤
│ API Gateway / Load Balancer │
│ - Rate limiting │
│ - Auth (JWT) │
├──────┬──────────────────────┬───────────────┤
│ API │ AI Service │ Background │
│Server│ - LLM routing │ Workers │
│ │ - Prompt mgmt │ - Embeddings │
│ │ - Cache │ - Fine-tune │
│ │ - Guardrails │ - Batch jobs │
├──────┴──────────────────────┴───────────────┤
│ PostgreSQL │ Redis │ Vector DB │ Object Store│
│ (users, │(cache,│(Pinecone, │(S3, uploads)│
│ billing) │ rate) │ Qdrant) │ │
└─────────────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────┐
│ LLM Providers (External) │
│ OpenAI │ Anthropic │ Google │ Self-hosted │
└─────────────────────────────────────────────┘
Docker para AI Apps
Dockerfile multi-stage
# Build stage: install all deps (incl. dev) and compile the app
FROM node:20-alpine AS builder
WORKDIR /app
COPY package*.json ./
RUN npm ci
COPY . .
RUN npm run build

# Production stage: minimal runtime image
FROM node:20-alpine AS production
WORKDIR /app

# Install production dependencies only
# (--omit=dev is the current flag; --production is deprecated in npm 8+)
COPY package*.json ./
RUN npm ci --omit=dev && npm cache clean --force

# Copy build artifacts from the builder stage
COPY --from=builder /app/dist ./dist
COPY --from=builder /app/prisma ./prisma

# Default environment variables (override at runtime)
ENV NODE_ENV=production
ENV PORT=3000

# Health check — use wget, which ships with Alpine's busybox.
# curl is NOT installed in node:*-alpine, so a curl-based check
# would always fail and mark the container unhealthy.
HEALTHCHECK --interval=30s --timeout=10s --retries=3 \
  CMD wget -qO- http://localhost:3000/health || exit 1

# Run as a non-root user
RUN addgroup -g 1001 -S appgroup && \
    adduser -S appuser -u 1001 -G appgroup
USER appuser

EXPOSE 3000
CMD ["node", "dist/server.js"]
docker-compose.yml
# NOTE: the top-level `version` key is obsolete in Compose v2+ (ignored with
# a warning) but kept for compatibility with older tooling.
version: '3.8'

services:
  api:
    build: .
    ports:
      - "3000:3000"
    environment:
      # Dev-default credentials; set POSTGRES_PASSWORD in the environment
      # (or a .env file) for anything beyond local development.
      DATABASE_URL: postgresql://postgres:${POSTGRES_PASSWORD:-password}@db:5432/aiapp
      REDIS_URL: redis://redis:6379
      OPENAI_API_KEY: ${OPENAI_API_KEY}
      ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_healthy
    restart: unless-stopped

  worker:
    build: .
    command: ["node", "dist/worker.js"]
    environment:
      DATABASE_URL: postgresql://postgres:${POSTGRES_PASSWORD:-password}@db:5432/aiapp
      REDIS_URL: redis://redis:6379
      OPENAI_API_KEY: ${OPENAI_API_KEY}
    # Long form, matching `api`: the short list form only waits for the
    # containers to START, not to be healthy, so the worker could boot
    # before Postgres/Redis accept connections.
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_healthy
    restart: unless-stopped

  db:
    # PostgreSQL 16 with the pgvector extension preinstalled
    image: pgvector/pgvector:pg16
    volumes:
      - pgdata:/var/lib/postgresql/data
    environment:
      POSTGRES_DB: aiapp
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-password}
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 5

  redis:
    image: redis:7-alpine
    volumes:
      - redisdata:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 5s
      retries: 5

volumes:
  pgdata:
  redisdata:
CI/CD para apps AI
GitHub Actions
name: Deploy AI App

on:
  push:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 20
          cache: 'npm'
      - run: npm ci
      - run: npm run lint
      - run: npm run test:unit
      # Prompt tests run against a mock provider — no real API key needed
      - run: npm run test:prompts
        env:
          LLM_PROVIDER: mock

  test-integration:
    needs: test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 20
      - run: npm ci
      # Integration tests hit the real API — gated to main to limit cost
      - run: npm run test:integration
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        if: github.ref == 'refs/heads/main'

  deploy:
    needs: test-integration
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # Authenticate first — `docker push` fails without a registry login.
      - name: Log in to registry
        run: echo "${{ secrets.REGISTRY_PASSWORD }}" | docker login registry.example.com -u "${{ secrets.REGISTRY_USER }}" --password-stdin
      - name: Deploy to production
        run: |
          # Push BOTH an immutable SHA tag (traceability / rollback target)
          # and the moving `latest` tag. The previous version built a SHA
          # tag locally but never pushed it.
          docker build -t registry.example.com/myapp:${{ github.sha }} .
          docker tag registry.example.com/myapp:${{ github.sha }} registry.example.com/myapp:latest
          docker push registry.example.com/myapp:${{ github.sha }}
          docker push registry.example.com/myapp:latest
      - name: Run migrations
        # prisma migrate deploy requires the production connection string
        run: npx prisma migrate deploy
        env:
          DATABASE_URL: ${{ secrets.DATABASE_URL }}
Serverless y Edge Functions
Vercel AI SDK con Edge Runtime
// app/api/chat/route.ts (Next.js App Router)
import { openai } from '@ai-sdk/openai';
import { streamText } from 'ai';

// Edge runtime = low global latency (runs close to the user, fast cold starts)
export const runtime = 'edge';

/**
 * Streams a chat completion for the posted message history.
 * Expects a JSON body shaped like `{ messages: Message[] }`.
 * Returns 400 on malformed input instead of crashing with a 500.
 */
export async function POST(req: Request): Promise<Response> {
  let body;
  try {
    body = await req.json();
  } catch {
    return Response.json({ error: 'Invalid JSON body' }, { status: 400 });
  }

  const { messages } = body;
  if (!Array.isArray(messages)) {
    return Response.json({ error: '`messages` must be an array' }, { status: 400 });
  }

  const result = streamText({
    model: openai('gpt-4o-mini'),
    messages,
    maxTokens: 1000, // cap per-request output cost
  });

  return result.toDataStreamResponse();
}
Cloudflare Workers AI
// worker.ts
export default {
async fetch(request: Request, env: Env): Promise<Response> {
const { message } = await request.json();
// Modelo ejecutándose en la edge de Cloudflare
const response = await env.AI.run('@cf/meta/llama-3-8b-instruct', {
messages: [
{ role: 'system', content: 'Eres un asistente útil.' },
{ role: 'user', content: message },
],
stream: true,
});
return new Response(response, {
headers: { 'content-type': 'text/event-stream' },
});
},
};
Monitoring y Observabilidad
Health checks
// health.ts

/** Aggregated service health, served at /health for orchestrator probes. */
interface HealthStatus {
  status: 'healthy' | 'degraded' | 'unhealthy';
  checks: Record<string, { status: string; latency: number }>;
  timestamp: string;
}

/**
 * Pings each hard dependency (Postgres, Redis, LLM provider) and reports
 * per-dependency status plus round-trip latency in milliseconds.
 *
 * The probes are independent, so they run concurrently via Promise.all —
 * worst-case handler latency is the max of the probes, not their sum as
 * in the previous sequential version. Per-probe results are unchanged.
 */
async function healthCheck(): Promise<HealthStatus> {
  // Shared probe helper: time `fn` and map success/failure to a check entry.
  // Replaces three copy-pasted try/catch blocks.
  const probe = async (fn: () => Promise<unknown>) => {
    const start = Date.now();
    try {
      await fn();
      return { status: 'ok', latency: Date.now() - start };
    } catch {
      return { status: 'error', latency: Date.now() - start };
    }
  };

  const [database, redisStatus, llm] = await Promise.all([
    probe(() => prisma.$queryRaw`SELECT 1`),
    probe(() => redis.ping()),
    // models.list() is a cheap authenticated call: verifies key + connectivity
    probe(() => openai.models.list()),
  ]);

  const checks: Record<string, { status: string; latency: number }> = {
    database,
    redis: redisStatus,
    llm,
  };

  const statuses = Object.values(checks);
  const allOk = statuses.every(c => c.status === 'ok');
  const anyError = statuses.some(c => c.status === 'error');

  return {
    status: allOk ? 'healthy' : anyError ? 'unhealthy' : 'degraded',
    checks,
    timestamp: new Date().toISOString(),
  };
}
Métricas AI específicas
// metrics.ts — Prometheus collectors for AI-specific observability
import { Counter, Histogram, Gauge } from 'prom-client';

// Token usage per request; the `type` label separates input vs output tokens
const tokenCounter = new Counter({
  name: 'ai_tokens_total',
  help: 'Total tokens used',
  labelNames: ['model', 'type', 'endpoint'], // type: input/output
});

// End-to-end LLM call latency; buckets chosen for p50/p95/p99 over the
// typical 0.1s–30s range of model responses
const llmLatency = new Histogram({
  name: 'ai_llm_latency_seconds',
  help: 'LLM response latency',
  labelNames: ['model', 'endpoint'],
  buckets: [0.1, 0.5, 1, 2, 5, 10, 30],
});

// Estimated spend in USD per model (gauge: set from pricing tables elsewhere)
const costGauge = new Gauge({
  name: 'ai_cost_usd',
  help: 'Estimated cost in USD',
  labelNames: ['model'],
});

// Guardrail violations, labeled by which guardrail fired and the action taken
const guardrailCounter = new Counter({
  name: 'ai_guardrail_violations_total',
  help: 'Guardrail violations count',
  labelNames: ['guardrail', 'action'],
});
// Wrapper para tracking automático
/**
 * Wraps an LLM call so latency and token usage are recorded automatically.
 *
 * @param model    - model identifier, used as a metric label
 * @param endpoint - logical endpoint name, used as a metric label
 * @param fn       - the actual LLM call; must resolve with text + token usage
 * @returns the result of `fn` unchanged
 * @throws rethrows whatever `fn` throws (after recording latency)
 */
async function trackedLLMCall(
  model: string,
  endpoint: string,
  fn: () => Promise<{ text: string; usage: { input: number; output: number } }>
) {
  const timer = llmLatency.startTimer({ model, endpoint });
  try {
    const result = await fn();
    // Token counters only advance on success — a failed call has no usage.
    tokenCounter.inc({ model, type: 'input', endpoint }, result.usage.input);
    tokenCounter.inc({ model, type: 'output', endpoint }, result.usage.output);
    return result;
  } finally {
    // Stop the histogram timer on both success and failure; replaces the
    // previous duplicated timer() call in the try and catch branches.
    timer();
  }
}
Escalado
Horizontal scaling con queue
// Para operaciones costosas (embeddings, batch processing)
import { Queue, Worker } from 'bullmq';
// Single source of truth for the Redis connection used by both the queue
// producer and the worker — previously duplicated inline in each.
const queueConnection = { host: 'redis', port: 6379 };

const embeddingQueue = new Queue('embeddings', {
  connection: queueConnection,
});

// Producer: enqueue the job instead of blocking the request path
await embeddingQueue.add('embed-document', {
  documentId: 'doc-123',
  content: 'texto del documento...',
});

// Worker: processes jobs in a distributed fashion; scale horizontally by
// running more worker processes against the same queue
const worker = new Worker('embeddings', async (job) => {
  const { content } = job.data;
  const embedding = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: content,
  });
  await db.storeEmbedding(job.data.documentId, embedding.data[0].embedding);
}, {
  connection: queueConnection,
  concurrency: 5, // jobs processed in parallel per worker process
});
Auto-scaling basado en métricas
# Kubernetes HPA: scales the API deployment on a custom per-pod metric
# (in-flight AI requests) rather than CPU, since LLM-bound workloads are
# I/O-heavy and CPU is a poor load signal for them.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ai-api
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ai-api
  minReplicas: 2    # keep at least 2 pods for availability
  maxReplicas: 10
  metrics:
    - type: Pods
      pods:
        metric:
          # custom metric the app must export (e.g. via Prometheus adapter)
          name: ai_active_requests
        target:
          type: AverageValue
          averageValue: "10"  # add pods when avg in-flight requests/pod > 10
Checklist de deploy
- Variables de entorno para todas las API keys
- Health checks configurados
- Métricas y logging operacional
- Rate limiting en producción
- CORS configurado correctamente
- HTTPS forzado
- Backups de base de datos automatizados
- Alertas para errores de LLM y costos
- Fallback cuando el LLM provider está caído
- Documentación de runbook para incidentes