Gestión de Costos
El Factor Costo en LLMOps
Los costos de LLMs son variables y proporcionales al uso. A diferencia del software tradicional donde el costo de compute es mayormente fijo, cada request a un LLM tiene un costo directo en tokens.
Estructura de Costos
┌──────────────────────────────────────────────┐
│ COSTOS DE UNA APP LLM │
│ │
│ ┌─────────────────────────────────────┐ │
│ │ Tokens LLM (60-80% del costo) │ │
│ │ Input tokens + Output tokens │ │
│ │ Variable por request │ │
│ └─────────────────────────────────────┘ │
│ ┌─────────────────────────────────────┐ │
│ │ Embeddings (5-15%) │ │
│ │ Indexación + queries │ │
│ └─────────────────────────────────────┘ │
│ ┌─────────────────────────────────────┐ │
│ │ Infraestructura (10-20%) │ │
│ │ Vector DB + Compute + Storage │ │
│ └─────────────────────────────────────┘ │
│ ┌─────────────────────────────────────┐ │
│ │ Otros (5-10%) │ │
│ │ Observabilidad, fine-tuning, etc. │ │
│ └─────────────────────────────────────┘ │
└──────────────────────────────────────────────┘
Calculadora de Costos
class CostCalculator:
    """Estimate LLM API costs in USD from token counts.

    Prices are USD per 1M tokens (snapshot of published provider rates);
    keep PRICING in sync with the providers' pricing pages.
    """

    # USD per 1M tokens. Embedding models have no output cost.
    PRICING = {
        "gpt-4o": {"input": 2.50, "output": 10.00},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "claude-3-5-sonnet": {"input": 3.00, "output": 15.00},
        "claude-3-5-haiku": {"input": 0.80, "output": 4.00},
        "text-embedding-3-small": {"input": 0.02, "output": 0},
        "text-embedding-3-large": {"input": 0.13, "output": 0},
    }

    def calculate_request_cost(self, model, input_tokens, output_tokens):
        """Return input/output/total USD cost of one request.

        Unknown models are priced at $0 (best-effort behavior kept for
        backward compatibility) — callers should validate model names.
        """
        pricing = self.PRICING.get(model, {"input": 0, "output": 0})
        input_cost = (input_tokens / 1_000_000) * pricing["input"]
        output_cost = (output_tokens / 1_000_000) * pricing["output"]
        return {
            "input_cost": input_cost,
            "output_cost": output_cost,
            "total_cost": input_cost + output_cost,
        }

    def estimate_monthly(self, model, avg_input, avg_output, requests_per_day):
        """Project per-request cost to daily/monthly/yearly totals.

        Fix: "yearly" now uses 365 days; the original computed
        monthly * 12 (= 360 days), under-estimating the year by ~1.4%.
        """
        per_request = self.calculate_request_cost(model, avg_input, avg_output)
        daily = per_request["total_cost"] * requests_per_day
        monthly = daily * 30
        return {
            "per_request": per_request["total_cost"],
            "daily": daily,
            "monthly": monthly,
            "yearly": daily * 365,  # was monthly * 12 == 360 days
        }
# Example: project monthly spend for a GPT-4o workload at 10k requests/day.
cost_calculator = CostCalculator()
monthly_estimate = cost_calculator.estimate_monthly(
    model="gpt-4o",
    avg_input=500,   # average input tokens per request
    avg_output=300,  # average output tokens per request
    requests_per_day=10000,
)
print(f"Costo mensual estimado: ${monthly_estimate['monthly']:.2f}")
Estrategias de Optimización
1. Model Routing
class ModelRouter:
    """Route requests to the most cost-efficient model for the task."""

    # (task, complexity) -> model. Hoisted to a class attribute so the
    # table is built once, not on every select_model() call (the original
    # rebuilt the dict per call). Only high-complexity generation/code/
    # reasoning justify gpt-4o; everything else — including all
    # classification — goes to the cheaper mini model.
    _ROUTING = {
        ("classification", "low"): "gpt-4o-mini",
        ("classification", "high"): "gpt-4o-mini",
        ("generation", "low"): "gpt-4o-mini",
        ("generation", "high"): "gpt-4o",
        ("code", "low"): "gpt-4o-mini",
        ("code", "high"): "gpt-4o",
        ("reasoning", "low"): "gpt-4o-mini",
        ("reasoning", "high"): "gpt-4o",
    }

    def select_model(self, task: str, complexity: str) -> str:
        """Return the model for (task, complexity); unknown pairs fall
        back to the cheapest model."""
        return self._ROUTING.get((task, complexity), "gpt-4o-mini")

    def classify_complexity(self, prompt: str) -> str:
        """Heuristically label a prompt 'low' or 'high' complexity.

        Cheap keyword/length heuristic (could be replaced by an
        inexpensive LLM). Keyword matching is case-sensitive, as in the
        original heuristic.
        """
        word_count = len(prompt.split())
        has_code = any(kw in prompt for kw in ["código", "function", "class", "def"])
        has_reasoning = any(kw in prompt for kw in ["por qué", "analiza", "compara", "explica"])
        if word_count > 200 or (has_code and has_reasoning):
            return "high"
        return "low"
2. Prompt Caching
import hashlib
import json
import re
import time
from datetime import datetime, timezone
class PromptCache:
    """In-memory, TTL-bounded cache for deterministic LLM responses.

    Only temperature == 0 requests are cached: caching a sampled
    (non-deterministic) response would return one arbitrary sample
    forever. Requires the module-level ``time`` import (missing in the
    original file).
    """

    def __init__(self, ttl_seconds=3600):
        self.cache = {}          # key -> {"response": ..., "timestamp": ...}
        self.ttl = ttl_seconds   # seconds before an entry expires
        self.stats = {"hits": 0, "misses": 0}

    def _key(self, model, messages, temperature):
        """Stable cache key for (model, messages, temperature)."""
        # sort_keys makes the key independent of dict insertion order,
        # so logically-equal payloads always hash to the same key.
        data = json.dumps(
            {"model": model, "messages": messages, "temp": temperature},
            sort_keys=True,
        )
        return hashlib.sha256(data.encode()).hexdigest()

    def get(self, model, messages, temperature=0):
        """Return the cached response, or None on miss/expiry/temp > 0."""
        if temperature > 0:
            return None  # non-deterministic request: bypass cache entirely
        key = self._key(model, messages, temperature)
        entry = self.cache.get(key)
        if entry:
            if time.time() - entry["timestamp"] < self.ttl:
                self.stats["hits"] += 1
                return entry["response"]
            # Evict the expired entry so the cache doesn't grow without
            # bound holding dead data (the original kept them forever).
            del self.cache[key]
        self.stats["misses"] += 1
        return None

    def set(self, model, messages, temperature, response):
        """Store *response* for the request, unless it was sampled (temp > 0)."""
        if temperature > 0:
            return
        key = self._key(model, messages, temperature)
        self.cache[key] = {
            "response": response,
            "timestamp": time.time(),
        }

    def savings_report(self):
        """Return hit rate and number of LLM calls avoided."""
        total = self.stats["hits"] + self.stats["misses"]
        hit_rate = self.stats["hits"] / total if total > 0 else 0
        return {
            "hit_rate": f"{hit_rate:.1%}",
            "requests_saved": self.stats["hits"],
        }
3. Prompt Optimization
def optimize_prompt(original_prompt: str) -> dict:
    """Reduce prompt tokens without losing effectiveness.

    Returns a dict with original/optimized token counts, tokens saved,
    the relative cost reduction, and the optimized prompt text.
    Requires the third-party ``tiktoken`` package (imported locally so
    it stays an optional dependency).
    """
    import tiktoken

    encoder = tiktoken.encoding_for_model("gpt-4o")
    original_tokens = len(encoder.encode(original_prompt))

    optimized = original_prompt

    # 1. Drop filler phrases. Longest variants come first: in the
    #    original, "por favor" was replaced before "por favor,", so the
    #    comma variant never matched and left a dangling comma behind.
    replacements = [
        ("por favor,", ""),
        ("por favor", ""),
        ("a continuación", ""),
        ("de acuerdo con", "según"),
    ]
    for old, new in replacements:
        optimized = optimized.replace(old, new)

    # 2. Collapse redundant whitespace AFTER the phrase removals, so the
    #    double spaces/newlines they leave behind get cleaned up too
    #    (the original collapsed whitespace first, then reintroduced
    #    double spaces by removing phrases).
    optimized = re.sub(r'\n{3,}', '\n\n', optimized)
    optimized = re.sub(r' {2,}', ' ', optimized)

    optimized_tokens = len(encoder.encode(optimized))
    savings = original_tokens - optimized_tokens
    return {
        "original_tokens": original_tokens,
        "optimized_tokens": optimized_tokens,
        "tokens_saved": savings,
        "cost_reduction": f"{savings/original_tokens:.1%}" if original_tokens > 0 else "0%",
        "optimized_prompt": optimized,
    }
4. Streaming + Early Stop
async def stream_with_budget(prompt, max_cost_usd=0.01):
    """Stream a completion, stopping once the output budget is spent.

    The budget is converted to an output-token cap using GPT-4o's output
    price and enforced twice: server-side via ``max_tokens`` and
    client-side by breaking out of the stream (the original counted
    tokens but never actually stopped early, despite its docstring).

    NOTE(review): each streamed chunk is counted as one token — an
    approximation; chunks usually carry one token but may carry more.
    Assumes a module-level ``client`` (OpenAI-style) exists — confirm.
    """
    OUTPUT_PRICE_PER_TOKEN = 10.00 / 1_000_000  # GPT-4o output price, USD/token
    max_tokens = int(max_cost_usd / OUTPUT_PRICE_PER_TOKEN)

    stream = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
        max_tokens=min(max_tokens, 4096),
    )
    result = []
    tokens_generated = 0
    for chunk in stream:
        content = chunk.choices[0].delta.content
        if content:
            result.append(content)
            tokens_generated += 1
            if tokens_generated >= max_tokens:
                break  # budget exhausted: stop consuming the stream
    return "".join(result)
Monitoreo de Costos en Tiempo Real
class CostMonitor:
    """Track daily spend against a budget and flag near/over-budget states."""

    def __init__(self, daily_budget: float = 50.0):
        self.daily_budget = daily_budget  # USD allowed per UTC day
        self.daily_spend = 0.0            # USD spent so far today
        # Timezone-aware now() replaces the deprecated datetime.utcnow();
        # .date() yields the same UTC calendar date.
        self.last_reset = datetime.now(timezone.utc).date()

    def record_cost(self, cost: float) -> dict:
        """Add *cost* to today's spend and return a status snapshot.

        The counter resets automatically on the first call of a new UTC
        day. The returned dict always has daily_spend/remaining/
        utilization, plus an "alert" key when under 10% of budget
        remains or the budget is exceeded.
        """
        today = datetime.now(timezone.utc).date()
        if today != self.last_reset:
            self.daily_spend = 0.0
            self.last_reset = today
        self.daily_spend += cost
        remaining = self.daily_budget - self.daily_spend
        result = {
            "daily_spend": self.daily_spend,
            "remaining": remaining,
            "utilization": self.daily_spend / self.daily_budget,
        }
        # The over-budget alert deliberately overwrites the low-budget one.
        if remaining < self.daily_budget * 0.1:
            result["alert"] = "⚠️ Menos del 10% del presupuesto diario"
        if remaining <= 0:
            result["alert"] = "🚨 Presupuesto diario excedido"
        return result
Resumen
Los costos en LLMOps son variables y pueden escalar rápidamente. Las estrategias clave son: model routing (usar el modelo más barato que funcione), caching (evitar llamadas repetidas), prompt optimization (menos tokens = menos costo), y monitoreo continuo con alertas y presupuestos.