Gestión de Costos
El Factor Costo en LLMOps
Los costos de LLMs son variables y proporcionales al uso. A diferencia del software tradicional donde el costo de compute es mayormente fijo, cada request a un LLM tiene un costo directo en tokens.
Estructura de Costos
┌──────────────────────────────────────────────┐
│ COSTOS DE UNA APP LLM │
│ │
│ ┌─────────────────────────────────────┐ │
│ │ Tokens LLM (60-80% del costo) │ │
│ │ Input tokens + Output tokens │ │
│ │ Variable por request │ │
│ └─────────────────────────────────────┘ │
│ ┌─────────────────────────────────────┐ │
│ │ Embeddings (5-15%) │ │
│ │ Indexación + queries │ │
│ └─────────────────────────────────────┘ │
│ ┌─────────────────────────────────────┐ │
│ │ Infraestructura (10-20%) │ │
│ │ Vector DB + Compute + Storage │ │
│ └─────────────────────────────────────┘ │
│ ┌─────────────────────────────────────┐ │
│ │ Otros (5-10%) │ │
│ │ Observabilidad, fine-tuning, etc. │ │
│ └─────────────────────────────────────┘ │
└──────────────────────────────────────────────┘
Calculadora de Costos
class CostCalculator:
    """Estimate LLM API costs in USD from token counts.

    Prices are USD per 1M tokens (snapshot of published provider rates);
    keep PRICING in sync with the providers' pricing pages.
    """

    # USD per 1M tokens. Embedding models have no output cost.
    PRICING = {
        "gpt-4o": {"input": 2.50, "output": 10.00},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "claude-3-5-sonnet": {"input": 3.00, "output": 15.00},
        "claude-3-5-haiku": {"input": 0.80, "output": 4.00},
        "text-embedding-3-small": {"input": 0.02, "output": 0},
        "text-embedding-3-large": {"input": 0.13, "output": 0},
    }

    def calculate_request_cost(self, model, input_tokens, output_tokens):
        """Return input/output/total USD cost of one request.

        Unknown models are priced at $0 (best-effort behavior kept for
        backward compatibility) — callers should validate model names.
        """
        pricing = self.PRICING.get(model, {"input": 0, "output": 0})
        input_cost = (input_tokens / 1_000_000) * pricing["input"]
        output_cost = (output_tokens / 1_000_000) * pricing["output"]
        return {
            "input_cost": input_cost,
            "output_cost": output_cost,
            "total_cost": input_cost + output_cost,
        }

    def estimate_monthly(self, model, avg_input, avg_output, requests_per_day):
        """Project per-request cost to daily/monthly/yearly totals.

        Fix: "yearly" now uses 365 days; the original computed
        monthly * 12 (= 360 days), under-estimating the year by ~1.4%.
        """
        per_request = self.calculate_request_cost(model, avg_input, avg_output)
        daily = per_request["total_cost"] * requests_per_day
        monthly = daily * 30
        return {
            "per_request": per_request["total_cost"],
            "daily": daily,
            "monthly": monthly,
            "yearly": daily * 365,  # was monthly * 12 == 360 days
        }
# Example: project monthly spend for a GPT-4o workload at 10k requests/day.
cost_calculator = CostCalculator()
monthly_estimate = cost_calculator.estimate_monthly(
    model="gpt-4o",
    avg_input=500,   # average input tokens per request
    avg_output=300,  # average output tokens per request
    requests_per_day=10000,
)
print(f"Costo mensual estimado: ${monthly_estimate['monthly']:.2f}")
Estrategias de Optimización
1. Model Routing
class ModelRouter:
    """Route requests to the most cost-efficient model for the task."""

    # (task, complexity) -> model. Hoisted to a class attribute so the
    # table is built once, not on every select_model() call (the original
    # rebuilt the dict per call). Only high-complexity generation/code/
    # reasoning justify gpt-4o; everything else — including all
    # classification — goes to the cheaper mini model.
    _ROUTING = {
        ("classification", "low"): "gpt-4o-mini",
        ("classification", "high"): "gpt-4o-mini",
        ("generation", "low"): "gpt-4o-mini",
        ("generation", "high"): "gpt-4o",
        ("code", "low"): "gpt-4o-mini",
        ("code", "high"): "gpt-4o",
        ("reasoning", "low"): "gpt-4o-mini",
        ("reasoning", "high"): "gpt-4o",
    }

    def select_model(self, task: str, complexity: str) -> str:
        """Return the model for (task, complexity); unknown pairs fall
        back to the cheapest model."""
        return self._ROUTING.get((task, complexity), "gpt-4o-mini")

    def classify_complexity(self, prompt: str) -> str:
        """Heuristically label a prompt 'low' or 'high' complexity.

        Cheap keyword/length heuristic (could be replaced by an
        inexpensive LLM). Keyword matching is case-sensitive, as in the
        original heuristic.
        """
        word_count = len(prompt.split())
        has_code = any(kw in prompt for kw in ["código", "function", "class", "def"])
        has_reasoning = any(kw in prompt for kw in ["por qué", "analiza", "compara", "explica"])
        if word_count > 200 or (has_code and has_reasoning):
            return "high"
        return "low"
2. Prompt Caching
import hashlib
import json
import re
import time
from datetime import datetime, timezone
class PromptCache:
    """In-memory, TTL-bounded cache for deterministic LLM responses.

    Only temperature == 0 requests are cached: caching a sampled
    (non-deterministic) response would return one arbitrary sample
    forever. Requires the module-level ``time`` import (missing in the
    original file).
    """

    def __init__(self, ttl_seconds=3600):
        self.cache = {}          # key -> {"response": ..., "timestamp": ...}
        self.ttl = ttl_seconds   # seconds before an entry expires
        self.stats = {"hits": 0, "misses": 0}

    def _key(self, model, messages, temperature):
        """Stable cache key for (model, messages, temperature)."""
        # sort_keys makes the key independent of dict insertion order,
        # so logically-equal payloads always hash to the same key.
        data = json.dumps(
            {"model": model, "messages": messages, "temp": temperature},
            sort_keys=True,
        )
        return hashlib.sha256(data.encode()).hexdigest()

    def get(self, model, messages, temperature=0):
        """Return the cached response, or None on miss/expiry/temp > 0."""
        if temperature > 0:
            return None  # non-deterministic request: bypass cache entirely
        key = self._key(model, messages, temperature)
        entry = self.cache.get(key)
        if entry:
            if time.time() - entry["timestamp"] < self.ttl:
                self.stats["hits"] += 1
                return entry["response"]
            # Evict the expired entry so the cache doesn't grow without
            # bound holding dead data (the original kept them forever).
            del self.cache[key]
        self.stats["misses"] += 1
        return None

    def set(self, model, messages, temperature, response):
        """Store *response* for the request, unless it was sampled (temp > 0)."""
        if temperature > 0:
            return
        key = self._key(model, messages, temperature)
        self.cache[key] = {
            "response": response,
            "timestamp": time.time(),
        }

    def savings_report(self):
        """Return hit rate and number of LLM calls avoided."""
        total = self.stats["hits"] + self.stats["misses"]
        hit_rate = self.stats["hits"] / total if total > 0 else 0
        return {
            "hit_rate": f"{hit_rate:.1%}",
            "requests_saved": self.stats["hits"],
        }
3. Prompt Optimization
def optimize_prompt(original_prompt: str) -> dict:
    """Reduce prompt tokens without losing effectiveness.

    Returns a dict with original/optimized token counts, tokens saved,
    the relative cost reduction, and the optimized prompt text.
    Requires the third-party ``tiktoken`` package (imported locally so
    it stays an optional dependency).
    """
    import tiktoken

    encoder = tiktoken.encoding_for_model("gpt-4o")
    original_tokens = len(encoder.encode(original_prompt))

    optimized = original_prompt

    # 1. Drop filler phrases. Longest variants come first: in the
    #    original, "por favor" was replaced before "por favor,", so the
    #    comma variant never matched and left a dangling comma behind.
    replacements = [
        ("por favor,", ""),
        ("por favor", ""),
        ("a continuación", ""),
        ("de acuerdo con", "según"),
    ]
    for old, new in replacements:
        optimized = optimized.replace(old, new)

    # 2. Collapse redundant whitespace AFTER the phrase removals, so the
    #    double spaces/newlines they leave behind get cleaned up too
    #    (the original collapsed whitespace first, then reintroduced
    #    double spaces by removing phrases).
    optimized = re.sub(r'\n{3,}', '\n\n', optimized)
    optimized = re.sub(r' {2,}', ' ', optimized)

    optimized_tokens = len(encoder.encode(optimized))
    savings = original_tokens - optimized_tokens
    return {
        "original_tokens": original_tokens,
        "optimized_tokens": optimized_tokens,
        "tokens_saved": savings,
        "cost_reduction": f"{savings/original_tokens:.1%}" if original_tokens > 0 else "0%",
        "optimized_prompt": optimized,
    }
4. Streaming + Early Stop
async def stream_with_budget(prompt, max_cost_usd=0.01):
    """Stream a completion, stopping once the output budget is spent.

    The budget is converted to an output-token cap using GPT-4o's output
    price and enforced twice: server-side via ``max_tokens`` and
    client-side by breaking out of the stream (the original counted
    tokens but never actually stopped early, despite its docstring).

    NOTE(review): each streamed chunk is counted as one token — an
    approximation; chunks usually carry one token but may carry more.
    Assumes a module-level ``client`` (OpenAI-style) exists — confirm.
    """
    OUTPUT_PRICE_PER_TOKEN = 10.00 / 1_000_000  # GPT-4o output price, USD/token
    max_tokens = int(max_cost_usd / OUTPUT_PRICE_PER_TOKEN)

    stream = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
        max_tokens=min(max_tokens, 4096),
    )
    result = []
    tokens_generated = 0
    for chunk in stream:
        content = chunk.choices[0].delta.content
        if content:
            result.append(content)
            tokens_generated += 1
            if tokens_generated >= max_tokens:
                break  # budget exhausted: stop consuming the stream
    return "".join(result)
Monitoreo de Costos en Tiempo Real
class CostMonitor:
    """Track daily spend against a budget and flag near/over-budget states."""

    def __init__(self, daily_budget: float = 50.0):
        self.daily_budget = daily_budget  # USD allowed per UTC day
        self.daily_spend = 0.0            # USD spent so far today
        # Timezone-aware now() replaces the deprecated datetime.utcnow();
        # .date() yields the same UTC calendar date.
        self.last_reset = datetime.now(timezone.utc).date()

    def record_cost(self, cost: float) -> dict:
        """Add *cost* to today's spend and return a status snapshot.

        The counter resets automatically on the first call of a new UTC
        day. The returned dict always has daily_spend/remaining/
        utilization, plus an "alert" key when under 10% of budget
        remains or the budget is exceeded.
        """
        today = datetime.now(timezone.utc).date()
        if today != self.last_reset:
            self.daily_spend = 0.0
            self.last_reset = today
        self.daily_spend += cost
        remaining = self.daily_budget - self.daily_spend
        result = {
            "daily_spend": self.daily_spend,
            "remaining": remaining,
            "utilization": self.daily_spend / self.daily_budget,
        }
        # The over-budget alert deliberately overwrites the low-budget one.
        if remaining < self.daily_budget * 0.1:
            result["alert"] = "⚠️ Menos del 10% del presupuesto diario"
        if remaining <= 0:
            result["alert"] = "🚨 Presupuesto diario excedido"
        return result
Resumen
Los costos en LLMOps son variables y pueden escalar rápidamente. Las estrategias clave son: model routing (usar el modelo más barato que funcione), caching (evitar llamadas repetidas), prompt optimization (menos tokens = menos costo), y monitoreo continuo con alertas y presupuestos.