Observability and Monitoring
Why Observability for LLMs?
Unlike traditional software, LLM applications can fail silently: a response may be wrong, a hallucination, or degraded without any technical error being raised. Observability is the only way to detect these problems.
The Three Pillars + LLM
Traditional observability:
1. Logs → What happened?
2. Metrics → How much / when?
3. Traces → How does a request flow?
LLM observability (in addition):
4. Prompt/completion logs → What was asked and what was answered?
5. Quality metrics → Was the response good?
6. Cost tracking → How much did it cost? (see the sketch after this list)
7. Token analytics → How are tokens being consumed?
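As a minimal illustration of pillars 6 and 7, the sketch below turns token usage into an estimated cost. The per-million-token prices are placeholder assumptions, not actual provider pricing; check your provider's current price list.

```python
# Cost-tracking sketch: estimate the USD cost of a request from token usage.
# Prices are placeholder assumptions (USD per 1M tokens), not real pricing.
PRICES_PER_1M = {
    "gpt-4o": {"input": 2.50, "output": 10.00},
}

def estimate_cost_usd(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    prices = PRICES_PER_1M[model]
    return (
        prompt_tokens * prices["input"] / 1_000_000
        + completion_tokens * prices["output"] / 1_000_000
    )

print(f"${estimate_cost_usd('gpt-4o', 800, 50):.4f}")  # → $0.0025
```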
Structured Logging
```python
import logging
import json
from datetime import datetime

class LLMLogger:
    def __init__(self):
        self.logger = logging.getLogger("llm_ops")
        handler = logging.FileHandler("llm_interactions.jsonl")
        self.logger.addHandler(handler)
        self.logger.setLevel(logging.INFO)

    def log_interaction(self, request_id: str, data: dict):
        log_entry = {
            "timestamp": datetime.utcnow().isoformat(),
            "request_id": request_id,
            "model": data.get("model"),
            "prompt_tokens": data.get("prompt_tokens"),
            "completion_tokens": data.get("completion_tokens"),
            "total_tokens": data.get("total_tokens"),
            "latency_ms": data.get("latency_ms"),
            "temperature": data.get("temperature"),
            "status": data.get("status", "success"),
            "error": data.get("error"),
            # Do NOT log prompt/completion content by default (PII)
            "prompt_hash": data.get("prompt_hash"),
            "has_tool_calls": data.get("has_tool_calls", False),
        }
        self.logger.info(json.dumps(log_entry))

    def log_quality(self, request_id: str, scores: dict):
        log_entry = {
            "timestamp": datetime.utcnow().isoformat(),
            "request_id": request_id,
            "type": "quality",
            "scores": scores,
        }
        self.logger.info(json.dumps(log_entry))
```
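A brief usage sketch, following the PII note above: the prompt is hashed rather than stored, so interactions can still be correlated across logs. The `hash_prompt` helper and the request ID are illustrative, not part of any library.

```python
import hashlib

def hash_prompt(prompt: str) -> str:
    # Hash the prompt so it can be correlated across logs without storing PII.
    return hashlib.sha256(prompt.encode("utf-8")).hexdigest()[:16]

logger = LLMLogger()
logger.log_interaction("req-001", {
    "model": "gpt-4o",
    "prompt_tokens": 820,
    "completion_tokens": 45,
    "total_tokens": 865,
    "latency_ms": 1240,
    "temperature": 0.2,
    "prompt_hash": hash_prompt("What is our refund policy?"),
})
```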
Tracing
Tracing captures the complete chain of operations in an LLM request.
```python
import uuid
import time

class LLMTracer:
    def __init__(self):
        self.traces = {}

    def start_trace(self, name: str, metadata: dict = None) -> str:
        trace_id = str(uuid.uuid4())[:8]
        self.traces[trace_id] = {
            "name": name,
            "trace_id": trace_id,
            "start_time": time.time(),
            "spans": [],
            "metadata": metadata or {},
        }
        return trace_id

    def add_span(self, trace_id: str, name: str, data: dict):
        span = {
            "name": name,
            "timestamp": time.time(),
            "data": data,
        }
        self.traces[trace_id]["spans"].append(span)

    def end_trace(self, trace_id: str) -> dict:
        trace = self.traces[trace_id]
        trace["end_time"] = time.time()
        trace["duration_ms"] = (trace["end_time"] - trace["start_time"]) * 1000
        return trace

# Usage in a RAG pipeline (embed_query, vector_search, and generate_answer
# are the pipeline's own functions, assumed to be defined elsewhere)
tracer = LLMTracer()

def rag_query_traced(question: str):
    trace_id = tracer.start_trace("rag_query", {"question": question})

    # Span 1: Embedding
    t0 = time.time()
    query_emb = embed_query(question)
    tracer.add_span(trace_id, "embedding", {
        "model": "text-embedding-3-small",
        "latency_ms": (time.time() - t0) * 1000,
    })

    # Span 2: Retrieval
    t0 = time.time()
    docs = vector_search(query_emb, top_k=5)
    tracer.add_span(trace_id, "retrieval", {
        "docs_found": len(docs),
        "top_score": docs[0]["score"] if docs else 0,
        "latency_ms": (time.time() - t0) * 1000,
    })

    # Span 3: LLM generation
    t0 = time.time()
    response = generate_answer(question, docs)
    tracer.add_span(trace_id, "llm_generation", {
        "model": "gpt-4o",
        "prompt_tokens": response.usage.prompt_tokens,
        "completion_tokens": response.usage.completion_tokens,
        "latency_ms": (time.time() - t0) * 1000,
    })

    trace = tracer.end_trace(trace_id)
    print(f"Total: {trace['duration_ms']:.0f}ms")
    return response
```
Key Metrics
```python
import time
import statistics
from collections import defaultdict

class LLMMetrics:
    def __init__(self):
        self.data = defaultdict(list)

    def record(self, metric: str, value: float, tags: dict = None):
        self.data[metric].append({
            "value": value,
            "timestamp": time.time(),
            "tags": tags or {},
        })

    def get_stats(self, metric: str) -> dict:
        values = [d["value"] for d in self.data[metric]]
        if not values:
            return {}
        return {
            "count": len(values),
            "mean": statistics.mean(values),
            "median": statistics.median(values),
            "p95": sorted(values)[int(len(values) * 0.95)],
            "p99": sorted(values)[int(len(values) * 0.99)] if len(values) >= 100 else None,
            "min": min(values),
            "max": max(values),
        }

metrics = LLMMetrics()

# Metrics to track:
metrics.record("latency_ms", 1250, {"model": "gpt-4o", "endpoint": "/chat"})
metrics.record("tokens_total", 850, {"model": "gpt-4o"})
metrics.record("cost_usd", 0.0085, {"model": "gpt-4o"})
metrics.record("quality_score", 0.92, {"evaluator": "llm_judge"})
metrics.record("retrieval_precision", 0.85, {"pipeline": "rag_v2"})
```
Metrics Dashboard
| Metric | Target | Alert if |
|---|---|---|
| P95 latency | < 3s | > 5s |
| Error rate | < 1% | > 5% |
| Cost per request | < $0.01 | > $0.05 |
| Daily token usage | < 500K | > 1M |
| Quality score | > 0.85 | < 0.70 |
| Hallucination rate | < 5% | > 15% |
LLM Observability Tools
LangSmith (LangChain)
```python
import os

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = "..."
os.environ["LANGCHAIN_PROJECT"] = "mi-proyecto"

# Automatically traces all LangChain calls
```
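With those variables set, LangChain calls are traced without further code changes. A minimal sketch, assuming the `langchain-openai` package is installed and `OPENAI_API_KEY` is configured:

```python
from langchain_openai import ChatOpenAI

# Each invocation appears as a trace under the configured LangSmith project.
llm = ChatOpenAI(model="gpt-4o")
response = llm.invoke("Summarize the benefits of LLM tracing in one sentence.")
print(response.content)
```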
Langfuse (Open-Source)
```python
from langfuse import Langfuse

langfuse = Langfuse(
    public_key="pk-...",
    secret_key="sk-...",
    host="https://cloud.langfuse.com"
)

# Create a trace
trace = langfuse.trace(name="chat-request", user_id="user-123")

# Create a generation span
generation = trace.generation(
    name="chat-completion",
    model="gpt-4o",
    input=[{"role": "user", "content": "Hello"}],
    output="Hello! How can I help you?",
    usage={"input": 10, "output": 15},
)

# Record a score
trace.score(name="quality", value=0.9, comment="Relevant response")
```
Alerts
```python
class AlertSystem:
    def __init__(self, thresholds: dict):
        self.thresholds = thresholds

    def check(self, metrics: LLMMetrics):
        alerts = []
        for metric_name, threshold in self.thresholds.items():
            stats = metrics.get_stats(metric_name)
            if not stats:
                continue
            if "max" in threshold and stats["mean"] > threshold["max"]:
                alerts.append({
                    "severity": "critical",
                    "metric": metric_name,
                    "message": f"{metric_name} = {stats['mean']:.2f} > {threshold['max']}",
                })
            if "min" in threshold and stats["mean"] < threshold["min"]:
                alerts.append({
                    "severity": "warning",
                    "metric": metric_name,
                    "message": f"{metric_name} = {stats['mean']:.2f} < {threshold['min']}",
                })
        return alerts

alert_system = AlertSystem({
    "latency_ms": {"max": 5000},
    "quality_score": {"min": 0.70},
    "cost_usd": {"max": 0.05},
})
```
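Running the checks against the metrics recorded earlier; each alert carries a severity, the metric name, and a human-readable message:

```python
for alert in alert_system.check(metrics):
    print(f"[{alert['severity'].upper()}] {alert['message']}")
```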
Summary
Observability in LLMOps goes beyond technical monitoring: you need prompt tracing, quality metrics, cost tracking, and alerts on degradation. Tools like LangSmith and Langfuse simplify the implementation, but the principles are universal.