Deploy y Arquitectura AI Full Stack
Desplegar una app AI-First requiere consideraciones únicas: GPUs, latencia de inferencia, cold starts, costos variables y observabilidad de respuestas del modelo.
Arquitectura de referencia
┌─────────────────────────────────────────────┐
│ CDN / Edge │
│ (Cloudflare, Vercel) │
├─────────────────────────────────────────────┤
│ Frontend (React/Next.js) │
│ - Static assets en edge │
│ - Streaming SSR │
├─────────────────────────────────────────────┤
│ API Gateway / Load Balancer │
│ - Rate limiting │
│ - Auth (JWT) │
├──────┬──────────────────────┬───────────────┤
│ API │ AI Service │ Background │
│Server│ - LLM routing │ Workers │
│ │ - Prompt mgmt │ - Embeddings │
│ │ - Cache │ - Fine-tune │
│ │ - Guardrails │ - Batch jobs │
├──────┴──────────────────────┴───────────────┤
│ PostgreSQL │ Redis │ Vector DB │ Object Store│
│ (users, │(cache,│(Pinecone, │(S3, uploads)│
│ billing) │ rate) │ Qdrant) │ │
└─────────────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────┐
│ LLM Providers (External) │
│ OpenAI │ Anthropic │ Google │ Self-hosted │
└─────────────────────────────────────────────┘
Docker para AI Apps
Dockerfile multi-stage
# Build stage: install all deps (incl. dev) and compile the app
FROM node:20-alpine AS builder
WORKDIR /app
COPY package*.json ./
RUN npm ci
COPY . .
RUN npm run build

# Production stage: minimal runtime image
FROM node:20-alpine AS production
WORKDIR /app

# Install production dependencies only
# (--omit=dev is the current flag; --production is deprecated in npm 8+)
COPY package*.json ./
RUN npm ci --omit=dev && npm cache clean --force

# Copy build artifacts from the builder stage
COPY --from=builder /app/dist ./dist
COPY --from=builder /app/prisma ./prisma

# Default environment variables (override at runtime)
ENV NODE_ENV=production
ENV PORT=3000

# Health check — use wget, which ships with Alpine's busybox.
# curl is NOT installed in node:*-alpine, so a curl-based check
# would always fail and mark the container unhealthy.
HEALTHCHECK --interval=30s --timeout=10s --retries=3 \
  CMD wget -qO- http://localhost:3000/health || exit 1

# Run as a non-root user
RUN addgroup -g 1001 -S appgroup && \
    adduser -S appuser -u 1001 -G appgroup
USER appuser

EXPOSE 3000
CMD ["node", "dist/server.js"]
docker-compose.yml
# NOTE: the top-level `version` key is obsolete in Compose v2+ (ignored with
# a warning) but kept for compatibility with older tooling.
version: '3.8'

services:
  api:
    build: .
    ports:
      - "3000:3000"
    environment:
      # Dev-default credentials; set POSTGRES_PASSWORD in the environment
      # (or a .env file) for anything beyond local development.
      DATABASE_URL: postgresql://postgres:${POSTGRES_PASSWORD:-password}@db:5432/aiapp
      REDIS_URL: redis://redis:6379
      OPENAI_API_KEY: ${OPENAI_API_KEY}
      ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_healthy
    restart: unless-stopped

  worker:
    build: .
    command: ["node", "dist/worker.js"]
    environment:
      DATABASE_URL: postgresql://postgres:${POSTGRES_PASSWORD:-password}@db:5432/aiapp
      REDIS_URL: redis://redis:6379
      OPENAI_API_KEY: ${OPENAI_API_KEY}
    # Long form, matching `api`: the short list form only waits for the
    # containers to START, not to be healthy, so the worker could boot
    # before Postgres/Redis accept connections.
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_healthy
    restart: unless-stopped

  db:
    # PostgreSQL 16 with the pgvector extension preinstalled
    image: pgvector/pgvector:pg16
    volumes:
      - pgdata:/var/lib/postgresql/data
    environment:
      POSTGRES_DB: aiapp
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-password}
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 5

  redis:
    image: redis:7-alpine
    volumes:
      - redisdata:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 5s
      retries: 5

volumes:
  pgdata:
  redisdata:
CI/CD para apps AI
GitHub Actions
name: Deploy AI App

on:
  push:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 20
          cache: 'npm'
      - run: npm ci
      - run: npm run lint
      - run: npm run test:unit
      # Prompt tests run against a mock provider — no real API key needed
      - run: npm run test:prompts
        env:
          LLM_PROVIDER: mock

  test-integration:
    needs: test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 20
      - run: npm ci
      # Integration tests hit the real API — gated to main to limit cost
      - run: npm run test:integration
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        if: github.ref == 'refs/heads/main'

  deploy:
    needs: test-integration
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # Authenticate first — `docker push` fails without a registry login.
      - name: Log in to registry
        run: echo "${{ secrets.REGISTRY_PASSWORD }}" | docker login registry.example.com -u "${{ secrets.REGISTRY_USER }}" --password-stdin
      - name: Deploy to production
        run: |
          # Push BOTH an immutable SHA tag (traceability / rollback target)
          # and the moving `latest` tag. The previous version built a SHA
          # tag locally but never pushed it.
          docker build -t registry.example.com/myapp:${{ github.sha }} .
          docker tag registry.example.com/myapp:${{ github.sha }} registry.example.com/myapp:latest
          docker push registry.example.com/myapp:${{ github.sha }}
          docker push registry.example.com/myapp:latest
      - name: Run migrations
        # prisma migrate deploy requires the production connection string
        run: npx prisma migrate deploy
        env:
          DATABASE_URL: ${{ secrets.DATABASE_URL }}
Serverless y Edge Functions
Vercel AI SDK con Edge Runtime
// app/api/chat/route.ts (Next.js App Router)
import { openai } from '@ai-sdk/openai';
import { streamText } from 'ai';

// Edge runtime = low global latency (runs close to the user, fast cold starts)
export const runtime = 'edge';

/**
 * Streams a chat completion for the posted message history.
 * Expects a JSON body shaped like `{ messages: Message[] }`.
 * Returns 400 on malformed input instead of crashing with a 500.
 */
export async function POST(req: Request): Promise<Response> {
  let body;
  try {
    body = await req.json();
  } catch {
    return Response.json({ error: 'Invalid JSON body' }, { status: 400 });
  }

  const { messages } = body;
  if (!Array.isArray(messages)) {
    return Response.json({ error: '`messages` must be an array' }, { status: 400 });
  }

  const result = streamText({
    model: openai('gpt-4o-mini'),
    messages,
    maxTokens: 1000, // cap per-request output cost
  });

  return result.toDataStreamResponse();
}
Cloudflare Workers AI
// worker.ts
export default {
async fetch(request: Request, env: Env): Promise<Response> {
const { message } = await request.json();
// Modelo ejecutándose en la edge de Cloudflare
const response = await env.AI.run('@cf/meta/llama-3-8b-instruct', {
messages: [
{ role: 'system', content: 'Eres un asistente útil.' },
{ role: 'user', content: message },
],
stream: true,
});
return new Response(response, {
headers: { 'content-type': 'text/event-stream' },
});
},
};
Monitoring y Observabilidad
Health checks
// health.ts

/** Aggregated service health, served at /health for orchestrator probes. */
interface HealthStatus {
  status: 'healthy' | 'degraded' | 'unhealthy';
  checks: Record<string, { status: string; latency: number }>;
  timestamp: string;
}

/**
 * Pings each hard dependency (Postgres, Redis, LLM provider) and reports
 * per-dependency status plus round-trip latency in milliseconds.
 *
 * The probes are independent, so they run concurrently via Promise.all —
 * worst-case handler latency is the max of the probes, not their sum as
 * in the previous sequential version. Per-probe results are unchanged.
 */
async function healthCheck(): Promise<HealthStatus> {
  // Shared probe helper: time `fn` and map success/failure to a check entry.
  // Replaces three copy-pasted try/catch blocks.
  const probe = async (fn: () => Promise<unknown>) => {
    const start = Date.now();
    try {
      await fn();
      return { status: 'ok', latency: Date.now() - start };
    } catch {
      return { status: 'error', latency: Date.now() - start };
    }
  };

  const [database, redisStatus, llm] = await Promise.all([
    probe(() => prisma.$queryRaw`SELECT 1`),
    probe(() => redis.ping()),
    // models.list() is a cheap authenticated call: verifies key + connectivity
    probe(() => openai.models.list()),
  ]);

  const checks: Record<string, { status: string; latency: number }> = {
    database,
    redis: redisStatus,
    llm,
  };

  const statuses = Object.values(checks);
  const allOk = statuses.every(c => c.status === 'ok');
  const anyError = statuses.some(c => c.status === 'error');

  return {
    status: allOk ? 'healthy' : anyError ? 'unhealthy' : 'degraded',
    checks,
    timestamp: new Date().toISOString(),
  };
}
Métricas AI específicas
// metrics.ts — Prometheus collectors for AI-specific observability
import { Counter, Histogram, Gauge } from 'prom-client';

// Token usage per request; the `type` label separates input vs output tokens
const tokenCounter = new Counter({
  name: 'ai_tokens_total',
  help: 'Total tokens used',
  labelNames: ['model', 'type', 'endpoint'], // type: input/output
});

// End-to-end LLM call latency; buckets chosen for p50/p95/p99 over the
// typical 0.1s–30s range of model responses
const llmLatency = new Histogram({
  name: 'ai_llm_latency_seconds',
  help: 'LLM response latency',
  labelNames: ['model', 'endpoint'],
  buckets: [0.1, 0.5, 1, 2, 5, 10, 30],
});

// Estimated spend in USD per model (gauge: set from pricing tables elsewhere)
const costGauge = new Gauge({
  name: 'ai_cost_usd',
  help: 'Estimated cost in USD',
  labelNames: ['model'],
});

// Guardrail violations, labeled by which guardrail fired and the action taken
const guardrailCounter = new Counter({
  name: 'ai_guardrail_violations_total',
  help: 'Guardrail violations count',
  labelNames: ['guardrail', 'action'],
});
// Wrapper para tracking automático
/**
 * Wraps an LLM call so latency and token usage are recorded automatically.
 *
 * @param model    - model identifier, used as a metric label
 * @param endpoint - logical endpoint name, used as a metric label
 * @param fn       - the actual LLM call; must resolve with text + token usage
 * @returns the result of `fn` unchanged
 * @throws rethrows whatever `fn` throws (after recording latency)
 */
async function trackedLLMCall(
  model: string,
  endpoint: string,
  fn: () => Promise<{ text: string; usage: { input: number; output: number } }>
) {
  const timer = llmLatency.startTimer({ model, endpoint });
  try {
    const result = await fn();
    // Token counters only advance on success — a failed call has no usage.
    tokenCounter.inc({ model, type: 'input', endpoint }, result.usage.input);
    tokenCounter.inc({ model, type: 'output', endpoint }, result.usage.output);
    return result;
  } finally {
    // Stop the histogram timer on both success and failure; replaces the
    // previous duplicated timer() call in the try and catch branches.
    timer();
  }
}
Escalado
Horizontal scaling con queue
// Para operaciones costosas (embeddings, batch processing)
import { Queue, Worker } from 'bullmq';
// Single source of truth for the Redis connection used by both the queue
// producer and the worker — previously duplicated inline in each.
const queueConnection = { host: 'redis', port: 6379 };

const embeddingQueue = new Queue('embeddings', {
  connection: queueConnection,
});

// Producer: enqueue the job instead of blocking the request path
await embeddingQueue.add('embed-document', {
  documentId: 'doc-123',
  content: 'texto del documento...',
});

// Worker: processes jobs in a distributed fashion; scale horizontally by
// running more worker processes against the same queue
const worker = new Worker('embeddings', async (job) => {
  const { content } = job.data;
  const embedding = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: content,
  });
  await db.storeEmbedding(job.data.documentId, embedding.data[0].embedding);
}, {
  connection: queueConnection,
  concurrency: 5, // jobs processed in parallel per worker process
});
Auto-scaling basado en métricas
# Kubernetes HPA: scales the API deployment on a custom per-pod metric
# (in-flight AI requests) rather than CPU, since LLM-bound workloads are
# I/O-heavy and CPU is a poor load signal for them.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ai-api
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ai-api
  minReplicas: 2    # keep at least 2 pods for availability
  maxReplicas: 10
  metrics:
    - type: Pods
      pods:
        metric:
          # custom metric the app must export (e.g. via Prometheus adapter)
          name: ai_active_requests
        target:
          type: AverageValue
          averageValue: "10"  # add pods when avg in-flight requests/pod > 10
Checklist de deploy
- Variables de entorno para todas las API keys
- Health checks configurados
- Métricas y logging operacional
- Rate limiting en producción
- CORS configurado correctamente
- HTTPS forzado
- Backups de base de datos automatizados
- Alertas para errores de LLM y costos
- Fallback cuando el LLM provider está caído
- Documentación de runbook para incidentes