LLM APIs
Working with Model APIs
In LLMOps, interaction with LLMs usually happens over HTTP APIs. Mastering these APIs is fundamental to building robust applications.
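Under the hood these are plain HTTP calls. A minimal sketch of the raw request the SDKs wrap (endpoint and payload per OpenAI's chat completions API; assumes OPENAI_API_KEY is set in the environment):

import os

import requests

resp = requests.post(
    "https://api.openai.com/v1/chat/completions",
    headers={"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"},
    json={
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": "Hello"}],
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])

The official SDKs add typed responses, retries, and streaming on top of this, which is why the rest of this section uses them.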
OpenAI API
Chat Completion
from openai import OpenAI

client = OpenAI(api_key="sk-...")

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a Python expert."},
        {"role": "user", "content": "What is a decorator?"}
    ],
    temperature=0.7,
    max_tokens=500,
    top_p=0.9,
)

print(response.choices[0].message.content)
print(f"Tokens used: {response.usage.total_tokens}")
# Assumes gpt-4o list pricing ($2.50/M input, $10/M output); check current rates
print(f"Cost: ${response.usage.prompt_tokens * 0.0000025 + response.usage.completion_tokens * 0.00001:.6f}")
Streaming
stream = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Explain microservices"}],
    stream=True,
)

for chunk in stream:
    # The final chunk can arrive with no choices (e.g. when usage is included)
    if not chunk.choices:
        continue
    content = chunk.choices[0].delta.content
    if content:
        print(content, end="", flush=True)
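For async services (e.g. FastAPI handlers), the SDK also ships an async client with the same streaming interface; a minimal sketch:

import asyncio

from openai import AsyncOpenAI

async def stream_reply(prompt: str) -> None:
    client = AsyncOpenAI()
    stream = await client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )
    async for chunk in stream:
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)

asyncio.run(stream_reply("Explain microservices"))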
Anthropic API (Claude)
import anthropic

client = anthropic.Anthropic(api_key="sk-ant-...")

message = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    system="You are a senior software architect.",
    messages=[
        {"role": "user", "content": "Design a microservices architecture for an e-commerce site."}
    ]
)

print(message.content[0].text)
print(f"Input tokens: {message.usage.input_tokens}")
print(f"Output tokens: {message.usage.output_tokens}")
Production Patterns with APIs
1. Retry with Exponential Backoff
import time

from openai import OpenAI, RateLimitError, APIError

client = OpenAI()

def call_llm_with_retry(messages, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=messages,
                timeout=30,
            )
            return response.choices[0].message.content
        except RateLimitError:
            if attempt == max_retries - 1:
                raise
            wait = 2 ** attempt  # 1s, 2s, 4s...
            print(f"Rate limited. Waiting {wait}s...")
            time.sleep(wait)
        except APIError:
            if attempt == max_retries - 1:
                raise
            time.sleep(1)
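Note that the openai SDK already retries rate-limit and transient server errors with exponential backoff; for many cases configuring the client is enough, and a hand-rolled loop like the one above is only needed for custom policies (logging, metrics, custom waits):

# Built-in retries, configurable per client (or per request via with_options)
client = OpenAI(max_retries=5, timeout=30)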
2. Fallback between Models
class LLMRouter:
    def __init__(self):
        self.openai = OpenAI()
        self.anthropic = anthropic.Anthropic()

    def complete(self, prompt, **kwargs):
        """Try OpenAI first; fall back to Anthropic."""
        try:
            resp = self.openai.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}],
                timeout=15,
                **kwargs,
            )
            return {"text": resp.choices[0].message.content, "model": "gpt-4o"}
        except Exception as e:
            print(f"OpenAI failed: {e}. Falling back to Claude...")
            msg = self.anthropic.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=1024,
                messages=[{"role": "user", "content": prompt}],
            )
            return {"text": msg.content[0].text, "model": "claude-sonnet"}
3. Rate Limiting
import asyncio
import time

class TokenBucket:
    def __init__(self, tokens_per_minute=60000):
        self.capacity = tokens_per_minute
        self.tokens = tokens_per_minute
        self.last_refill = time.time()

    async def acquire(self, tokens_needed):
        while True:
            # Refill proportionally to the time elapsed since the last check
            now = time.time()
            elapsed = now - self.last_refill
            self.tokens = min(
                self.capacity,
                self.tokens + elapsed * (self.capacity / 60),
            )
            self.last_refill = now
            if self.tokens >= tokens_needed:
                self.tokens -= tokens_needed
                return
            await asyncio.sleep(0.1)
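A sketch of how the bucket guards calls; guarded_call and the 1,500-token figure are illustrative (a common heuristic is estimated prompt tokens plus max_tokens):

bucket = TokenBucket(tokens_per_minute=60000)

async def guarded_call(messages, estimated_tokens=1500):
    # Reserve the estimated budget before the request goes out
    await bucket.acquire(estimated_tokens)
    # The sync client would block the event loop, so run it in a thread
    return await asyncio.to_thread(
        client.chat.completions.create, model="gpt-4o", messages=messages
    )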
Structured Outputs (JSON Mode)
from pydantic import BaseModel

class ProductReview(BaseModel):
    sentiment: str  # "positive", "negative", "neutral"
    score: float  # 0.0 to 1.0
    keywords: list[str]
    summary: str

response = client.beta.chat.completions.parse(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "Analyze the following review."},
        {"role": "user", "content": "The product is amazing, great quality."}
    ],
    response_format=ProductReview,
)

review = response.choices[0].message.parsed
print(f"Sentiment: {review.sentiment}, Score: {review.score}")
Function Calling / Tool Use
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string", "description": "City name"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["city"],
            },
        },
    }
]

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "What's the weather in Madrid?"}],
    tools=tools,
    tool_choice="auto",
)

tool_call = response.choices[0].message.tool_calls[0]
print(f"Function: {tool_call.function.name}")
print(f"Args: {tool_call.function.arguments}")
API Key Management
import os
from dotenv import load_dotenv

load_dotenv()

# NEVER hardcode API keys
# ✗ client = OpenAI(api_key="sk-abc123...")
# ✓ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Required environment variables:
# OPENAI_API_KEY=sk-...
# ANTHROPIC_API_KEY=sk-ant-...

# In production: use cloud-managed secrets (AWS Secrets Manager, etc.)
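A sketch of loading the key from AWS Secrets Manager at startup; the secret name and JSON layout here are illustrative:

import json

import boto3

def load_openai_key(secret_id="prod/llm/api-keys"):  # hypothetical secret name
    sm = boto3.client("secretsmanager")
    payload = sm.get_secret_value(SecretId=secret_id)["SecretString"]
    return json.loads(payload)["OPENAI_API_KEY"]

client = OpenAI(api_key=load_openai_key())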
Summary
Mastering LLM APIs (retries, fallbacks, rate limiting, structured outputs, and function calling) is the foundation of any production LLMOps application. The robustness of the code wrapping these APIs determines the reliability of the whole application.