Deploying LLMs
Deployment Options
┌──────────────────────────────────────────────────────────┐
│                   DEPLOYMENT SPECTRUM                      │
│                                                            │
│  External API  ←────────────────────────→  Self-hosted    │
│                                                            │
│  OpenAI        Serverless       GPU Cloud     On-premise   │
│  Anthropic     (Modal,          (AWS, GCP)    (your own    │
│  Google        Replicate)                     GPU infra)   │
│                                                            │
│  Less     ←─────────── Control ───────────→  More          │
│  control                                      control      │
│  Lower    ←────────── Fixed cost ─────────→  Higher        │
│  fixed cost                                   fixed cost   │
│  More     ←────────── Dependency ─────────→  Less          │
│  dependency                                   dependency   │
└──────────────────────────────────────────────────────────┘
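The fixed-cost versus pay-per-token trade-off can be made concrete with a back-of-the-envelope comparison. The sketch below is a minimal estimate under illustrative assumptions (a blended API rate per million tokens and an hourly rate for a dedicated GPU instance); substitute your provider's real pricing.

# Back-of-the-envelope comparison: external API (pay per token) vs. a
# self-hosted GPU (fixed cost). All prices are illustrative assumptions.

def monthly_api_cost(tokens_per_month: int, usd_per_million_tokens: float) -> float:
    # Pay-per-use: cost scales linearly with traffic
    return tokens_per_month / 1_000_000 * usd_per_million_tokens

def monthly_gpu_cost(usd_per_hour: float, hours_per_month: int = 730) -> float:
    # Fixed: one GPU instance running around the clock
    return usd_per_hour * hours_per_month

tokens = 500_000_000                       # assumed traffic: 500M tokens/month
api_cost = monthly_api_cost(tokens, 0.60)  # assumed blended rate per 1M tokens
gpu_cost = monthly_gpu_cost(1.20)          # assumed hourly rate for one GPU

print(f"External API: ${api_cost:,.0f}/month")
print(f"Self-hosted:  ${gpu_cost:,.0f}/month")

At low volume the external API usually wins; the balance shifts toward self-hosting as traffic grows, or when latency and data-privacy requirements rule out sending data to a third party.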
Deployment with an API Gateway
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from openai import OpenAI
import uvicorn

app = FastAPI(title="LLM API Gateway")
client = OpenAI()  # reads OPENAI_API_KEY from the environment


class ChatRequest(BaseModel):
    message: str
    model: str = "gpt-4o-mini"
    temperature: float = 0.7
    max_tokens: int = 1000


class ChatResponse(BaseModel):
    response: str
    model: str
    tokens_used: int
    cost_usd: float


def calculate_cost(model: str, usage) -> float:
    # Illustrative rates in USD per 1M input/output tokens;
    # replace with your provider's current pricing.
    prices = {"gpt-4o-mini": (0.15, 0.60)}
    input_price, output_price = prices.get(model, (0.0, 0.0))
    return (usage.prompt_tokens * input_price
            + usage.completion_tokens * output_price) / 1_000_000


@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    try:
        response = client.chat.completions.create(
            model=request.model,
            messages=[{"role": "user", "content": request.message}],
            temperature=request.temperature,
            max_tokens=request.max_tokens,
        )
        tokens = response.usage.total_tokens
        cost = calculate_cost(request.model, response.usage)
        return ChatResponse(
            response=response.choices[0].message.content,
            model=request.model,
            tokens_used=tokens,
            cost_usd=cost,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/health")
async def health():
    return {"status": "healthy"}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
Deploying Open-Source Models
vLLM (Production)
# Install
pip install vllm

# Serve the model with an OpenAI-compatible server
python -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --port 8000 \
    --tensor-parallel-size 1 \
    --max-model-len 8192 \
    --gpu-memory-utilization 0.9
# Client: the server exposes an OpenAI-compatible API
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="not-needed",
)

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Hello"}],
)
Ollama (Local Development)
# Install and run
ollama run llama3

# REST API
curl http://localhost:11434/api/generate -d '{
  "model": "llama3",
  "prompt": "Explain Docker in 3 sentences"
}'

# Python
import requests

response = requests.post("http://localhost:11434/api/generate", json={
    "model": "llama3",
    "prompt": "Explain Docker",
    "stream": False,
})
print(response.json()["response"])
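Ollama also exposes an OpenAI-compatible endpoint, so the same client pattern shown for vLLM works against a local Ollama instance. A minimal sketch, assuming the default port 11434:

from openai import OpenAI

# Ollama's OpenAI-compatible endpoint; the API key is ignored but required by the client
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

response = client.chat.completions.create(
    model="llama3",
    messages=[{"role": "user", "content": "Explain Docker in 3 sentences"}],
)
print(response.choices[0].message.content)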
Containerization with Docker
# Dockerfile for the API gateway
FROM python:3.11-slim

# curl is needed by the docker-compose healthcheck defined below
RUN apt-get update && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .

EXPOSE 8000
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"]
# docker-compose.yml
version: '3.8'

services:
  api-gateway:
    build: .
    ports:
      - "8000:8000"
    environment:
      - OPENAI_API_KEY=${OPENAI_API_KEY}
      - LOG_LEVEL=info
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  chromadb:
    image: chromadb/chroma:latest
    ports:
      - "8001:8000"
    volumes:
      - chroma_data:/chroma/chroma
    restart: unless-stopped

  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    restart: unless-stopped

volumes:
  chroma_data:
Serverless Deployment
Modal
import modal

app = modal.App("llm-service")

@app.function(
    gpu="A100",
    image=modal.Image.debian_slim().pip_install("vllm", "torch"),
    timeout=300,
)
def generate(prompt: str) -> str:
    from vllm import LLM, SamplingParams

    # The model is loaded inside the function, on the remote GPU container
    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
    params = SamplingParams(temperature=0.7, max_tokens=512)
    outputs = llm.generate([prompt], params)
    return outputs[0].outputs[0].text

@app.local_entrypoint()
def main():
    result = generate.remote("Explain microservices")
    print(result)
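Running this file with modal run executes main locally while generate runs in a remote GPU container; modal deploy publishes the function as a persistent deployment that can be invoked without keeping a local process alive.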
GPU Cloud Deployment
# Configuration for AWS SageMaker
import sagemaker
from sagemaker.huggingface import HuggingFaceModel

role = sagemaker.get_execution_role()

model = HuggingFaceModel(
    model_data="s3://my-bucket/llama-3-8b/model.tar.gz",
    role=role,
    transformers_version="4.37",
    pytorch_version="2.1",
    py_version="py310",
    model_server_workers=1,
)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",  # NVIDIA A10G GPU
    endpoint_name="llama-3-8b-endpoint",
)

# Invoke the endpoint
result = predictor.predict({
    "inputs": "Explain Docker",
    "parameters": {"max_new_tokens": 256, "temperature": 0.7}
})
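SageMaker bills for the endpoint as long as the instance is running, so tear it down when it is no longer needed:

# Stop paying for the GPU instance once the endpoint is no longer needed
predictor.delete_endpoint()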
Health Checks and Readiness
import time
from contextlib import asynccontextmanager

from fastapi import FastAPI
from fastapi.responses import JSONResponse

# 'client' is the OpenAI client defined in the gateway above

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup: verify connectivity with the LLM provider
    print("Checking LLM connectivity...")
    try:
        client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "ping"}],
            max_tokens=5,
        )
        print("✓ LLM connected")
    except Exception as e:
        print(f"⚠️ LLM unavailable: {e}")
    yield
    # Shutdown
    print("Closing connections...")

app = FastAPI(lifespan=lifespan)

@app.get("/health")
async def health():
    # Liveness: the process is up
    return {"status": "healthy", "timestamp": time.time()}

@app.get("/ready")
async def ready():
    # Readiness: the upstream LLM actually responds
    try:
        client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "1+1"}],
            max_tokens=5,
            timeout=5,
        )
        return {"ready": True}
    except Exception:
        return JSONResponse(status_code=503, content={"ready": False})
Summary
Deploying LLMs ranges from external APIs (the simplest option) to self-hosting on your own GPUs (the most control). For production you need an API gateway built with FastAPI, containerization with Docker, health checks, and the strategic decision between external APIs and self-hosted models based on cost, latency, and data privacy.