CI/CD Pipelines for LLMs
The LLMOps Pipeline
┌──────────────────────────────────────────────────┐
│              LLMOps CI/CD PIPELINE               │
│                                                  │
│   Code/Prompts → Tests → Build → Stage → Prod    │
│                                                  │
│  ┌─────────┐    ┌──────────┐    ┌─────────────┐  │
│  │ Commit  │ →  │ CI Tests │ →  │   Deploy    │  │
│  │         │    │          │    │             │  │
│  │ - Code  │    │ - Unit   │    │ - Canary    │  │
│  │ - Prompt│    │ - Eval   │    │ - A/B test  │  │
│  │ - Config│    │ - Cost   │    │ - Rollback  │  │
│  └─────────┘    └──────────┘    └─────────────┘  │
└──────────────────────────────────────────────────┘
Project Structure for CI/CD
llm-project/
├── .github/
│   └── workflows/
│       ├── ci.yml
│       ├── eval.yml
│       └── deploy.yml
├── src/
│   ├── chains/
│   ├── prompts/
│   └── tools/
├── prompts/
│   ├── system/
│   │   └── assistant_v3.yaml
│   └── templates/
│       └── summarize_v2.yaml
├── evals/
│   ├── datasets/
│   │   └── golden_set.jsonl
│   ├── test_accuracy.py
│   └── test_cost.py
├── tests/
│   ├── test_chains.py
│   ├── test_eval.py
│   └── test_prompts.py
├── scripts/
│   └── validate_prompts.py
├── config/
│   ├── models.yaml
│   └── guardrails.yaml
└── Dockerfile
GitHub Actions: CI Pipeline
# .github/workflows/ci.yml
name: LLM CI Pipeline

on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main]

jobs:
  lint-and-test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install dependencies
        run: pip install -r requirements.txt
      - name: Lint
        run: |
          ruff check src/
          mypy src/
      - name: Unit tests (no LLM calls)
        run: pytest tests/ -m "not llm" -v
      - name: Validate prompts
        run: python scripts/validate_prompts.py

  prompt-eval:
    needs: lint-and-test
    runs-on: ubuntu-latest
    if: github.event_name == 'pull_request'
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install dependencies
        run: pip install -r requirements.txt
      - name: Eval against golden dataset
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          python evals/test_accuracy.py \
            --dataset evals/datasets/golden_set.jsonl \
            --threshold 0.85 \
            --max-cost 5.0
      - name: Compare against baseline
        run: |
          python evals/compare_baseline.py \
            --current-results eval_results.json \
            --baseline evals/baseline.json
      - name: Publish results
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const results = JSON.parse(fs.readFileSync('eval_results.json'));
            const body = `## Eval Results
            | Metric | Score | Threshold |
            |--------|-------|-----------|
            | Accuracy | ${results.accuracy} | 0.85 |
            | Latency p50 | ${results.latency_p50}ms | 2000ms |
            | Cost | $${results.total_cost} | $5.00 |`;
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: body,
            });
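The workflow calls scripts/validate_prompts.py, which is never shown in the rest of the section. Below is a minimal sketch of what that script might do, assuming it only enforces the structural rules that the prompt tests in the next part also check; the exact fields and exit behavior are assumptions, not the original script.

# scripts/validate_prompts.py (illustrative sketch, not the original script)
import sys
import yaml
from pathlib import Path

REQUIRED_FIELDS = {"version", "variables"}


def validate(path: Path) -> list[str]:
    """Return a list of problems found in one prompt file."""
    try:
        prompt = yaml.safe_load(path.read_text())
    except yaml.YAMLError as e:
        return [f"{path}: invalid YAML ({e})"]
    if not isinstance(prompt, dict):
        return [f"{path}: expected a mapping at the top level"]
    errors = []
    missing = REQUIRED_FIELDS - prompt.keys()
    if missing:
        errors.append(f"{path}: missing fields {sorted(missing)}")
    # System prompts declare system_message, templates declare template.
    if "system_message" not in prompt and "template" not in prompt:
        errors.append(f"{path}: needs either 'system_message' or 'template'")
    return errors


if __name__ == "__main__":
    all_errors = []
    for prompt_file in Path("prompts").rglob("*.yaml"):
        all_errors.extend(validate(prompt_file))
    for err in all_errors:
        print(err)
    sys.exit(1 if all_errors else 0)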
Prompt Testing
# tests/test_prompts.py
import yaml
import pytest
from pathlib import Path


def load_prompt(name: str) -> dict:
    path = Path(f"prompts/{name}.yaml")
    with open(path) as f:
        return yaml.safe_load(f)


class TestPromptValidation:
    """Tests that do NOT require calling the LLM."""

    def test_prompt_has_required_fields(self):
        prompt = load_prompt("system/assistant_v3")
        assert "version" in prompt
        assert "system_message" in prompt
        assert "variables" in prompt

    def test_prompt_variables_documented(self):
        prompt = load_prompt("templates/summarize_v2")
        for var in prompt.get("variables", []):
            assert "name" in var
            assert "description" in var
            assert "example" in var

    def test_prompt_under_token_limit(self):
        import tiktoken
        enc = tiktoken.encoding_for_model("gpt-4o")
        prompt = load_prompt("system/assistant_v3")
        tokens = len(enc.encode(prompt["system_message"]))
        assert tokens < 4000, f"System prompt has {tokens} tokens (max: 4000)"

    def test_no_hardcoded_values(self):
        prompt = load_prompt("templates/summarize_v2")
        # Verify that the template uses variables instead of hardcoded values
        template = prompt["template"]
        assert "{{" in template, "Template must use variables"
# tests/test_eval.py
import pytest


@pytest.mark.llm
class TestLLMEvaluation:
    """Tests that DO require calling the LLM."""

    def test_summarization_quality(self):
        dataset = load_golden_set("evals/datasets/golden_set.jsonl")
        results = []
        for sample in dataset:
            response = call_llm(sample["input"])
            score = evaluate_response(response, sample["expected"])
            results.append(score)
        avg_score = sum(results) / len(results)
        assert avg_score >= 0.85, f"Average score {avg_score:.2f} < 0.85"

    def test_no_hallucination(self):
        test_cases = [
            {
                "context": "The company was founded in 2020.",
                "question": "When was the company founded?",
                "must_contain": "2020",
                "must_not_contain": ["2019", "2021"],
            }
        ]
        for case in test_cases:
            response = call_rag(case["context"], case["question"])
            assert case["must_contain"] in response
            for forbidden in case["must_not_contain"]:
                assert forbidden not in response
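The eval tests above rely on helpers (load_golden_set, call_llm, evaluate_response, call_rag) that the section never defines. A minimal sketch follows, assuming the OpenAI Python SDK, a gpt-4o-mini default model, and a naive word-overlap scorer; a production eval would more likely use an LLM-as-judge, so treat this as illustrative only.

# evals/helpers.py (hypothetical module the tests above assume)
import json
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def load_golden_set(path: str) -> list[dict]:
    """Load one JSON object per line (JSONL)."""
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]


def call_llm(user_input: str, model: str = "gpt-4o-mini") -> str:
    """Single-turn completion; the model choice is an assumption."""
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": user_input}],
    )
    return resp.choices[0].message.content


def call_rag(context: str, question: str) -> str:
    """Toy RAG call: the retrieved context is passed in directly."""
    prompt = f"Answer using ONLY this context:\n{context}\n\nQuestion: {question}"
    return call_llm(prompt)


def evaluate_response(response: str, expected: str) -> float:
    """Naive word-overlap score in [0, 1]; a real eval would be stricter."""
    expected_words = set(expected.lower().split())
    if not expected_words:
        return 0.0
    found = sum(1 for w in expected_words if w in response.lower())
    return found / len(expected_words)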
Prompt Versioning
# prompts/system/assistant_v3.yaml
version: "3.0.1"
created: "2024-01-15"
author: "equipo-llm"
model_target: "gpt-4o"
description: |
  System prompt for the main assistant.
  v3 change: added language and output-format handling.

system_message: |
  You are an expert assistant. Follow these rules:
  1. Always answer in {{language}}
  2. Be concise but complete
  3. If you do not know something, say so clearly
  4. Use {{output_format}} formatting

variables:
  - name: language
    description: "Response language"
    example: "Spanish"
    default: "Spanish"
  - name: output_format
    description: "Output format"
    example: "markdown"
    default: "plain text"

evaluation:
  golden_set: "evals/datasets/assistant_v3.jsonl"
  min_accuracy: 0.90
  max_latency_p95_ms: 3000
# src/prompt_manager.py
import re
import yaml
from pathlib import Path
from functools import lru_cache


class PromptManager:
    def __init__(self, prompts_dir: str = "prompts"):
        self.dir = Path(prompts_dir)

    @lru_cache(maxsize=100)
    def get(self, name: str, version: str | None = None) -> dict:
        if version:
            path = self.dir / f"{name}_v{version}.yaml"
        else:
            # No explicit version: the name already carries it
            # (e.g. "system/assistant_v3" -> prompts/system/assistant_v3.yaml)
            path = self.dir / f"{name}.yaml"
        with open(path) as f:
            return yaml.safe_load(f)

    def render(self, name: str, **variables) -> str:
        prompt = self.get(name)
        template = prompt["system_message"]
        for key, value in variables.items():
            template = template.replace(f"{{{{{key}}}}}", str(value))
        # Check for unresolved variables and fall back to their declared defaults
        unresolved = re.findall(r"\{\{(\w+)\}\}", template)
        if unresolved:
            for var_def in prompt.get("variables", []):
                if var_def["name"] in unresolved and "default" in var_def:
                    template = template.replace(
                        f"{{{{{var_def['name']}}}}}",
                        var_def["default"],
                    )
        return template


# Usage
pm = PromptManager()
system_msg = pm.render(
    "system/assistant_v3",
    language="Spanish",
    output_format="markdown",
)
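The default-fallback path in render can be covered by a cheap unit test that runs in the "no LLM" CI job. A sketch follows; the file name, import path, and assertions are illustrative and assume src/ is importable and the YAML above is in place.

# tests/test_prompt_manager.py (illustrative unit test for the default fallback)
from src.prompt_manager import PromptManager


def test_render_falls_back_to_defaults():
    pm = PromptManager()
    # Only `language` is passed; {{output_format}} should resolve to its default.
    msg = pm.render("system/assistant_v3", language="English")
    assert "{{" not in msg       # no unresolved placeholders remain
    assert "plain text" in msg   # default declared in assistant_v3.yaml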
Canary Deploys and A/B Testing
# src/ab_testing.py
import hashlib
from dataclasses import dataclass


@dataclass
class Variant:
    name: str
    prompt_version: str
    model: str
    weight: float  # 0.0 - 1.0


class ABTester:
    def __init__(self, variants: list[Variant]):
        self.variants = variants
        assert abs(sum(v.weight for v in variants) - 1.0) < 0.01

    def assign_variant(self, user_id: str) -> Variant:
        """Deterministic per-user assignment."""
        hash_val = int(hashlib.md5(user_id.encode()).hexdigest(), 16)
        normalized = (hash_val % 1000) / 1000
        cumulative = 0.0
        for variant in self.variants:
            cumulative += variant.weight
            if normalized < cumulative:
                return variant
        return self.variants[-1]


# Configuration
ab_test = ABTester([
    Variant("control", prompt_version="v2", model="gpt-4o-mini", weight=0.8),
    Variant("treatment", prompt_version="v3", model="gpt-4o-mini", weight=0.2),
])


# In the request handler
async def handle_request(user_id: str, message: str):
    variant = ab_test.assign_variant(user_id)
    system_prompt = prompt_manager.render(f"system/assistant_{variant.prompt_version}")
    response = await call_llm(
        model=variant.model,
        system=system_prompt,
        message=message,
    )
    # Log for offline analysis
    log_experiment({
        "user_id": user_id,
        "variant": variant.name,
        "prompt_version": variant.prompt_version,
        "response_quality": None,  # filled in later from user feedback
    })
    return response
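log_experiment and the downstream analysis are not shown in the original. A sketch of how the logged records could be aggregated to compare variants follows, assuming the records end up in a JSONL file (the experiment_logs.jsonl name is hypothetical) with response_quality filled in once feedback arrives.

# scripts/analyze_experiment.py (hypothetical aggregation of the A/B logs)
import json
from collections import defaultdict


def summarize(log_path: str) -> dict[str, float]:
    """Average response_quality per variant, ignoring records without feedback."""
    scores = defaultdict(list)
    with open(log_path) as f:
        for line in f:
            record = json.loads(line)
            if record.get("response_quality") is not None:
                scores[record["variant"]].append(record["response_quality"])
    return {variant: sum(v) / len(v) for variant, v in scores.items() if v}


if __name__ == "__main__":
    summary = summarize("experiment_logs.jsonl")
    for variant, avg in sorted(summary.items()):
        print(f"{variant}: {avg:.3f}")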
Automated Rollback
# scripts/auto_rollback.py
import time


class AutoRollback:
    def __init__(self, error_threshold=0.05, window_seconds=300):
        self.error_threshold = error_threshold
        self.window = window_seconds
        self.errors = []
        self.total = []

    def record(self, success: bool):
        now = time.time()
        self.total.append(now)
        if not success:
            self.errors.append(now)
        # Drop samples that fall outside the sliding window
        cutoff = now - self.window
        self.errors = [t for t in self.errors if t > cutoff]
        self.total = [t for t in self.total if t > cutoff]

    def should_rollback(self) -> bool:
        if len(self.total) < 10:  # minimum sample size
            return False
        error_rate = len(self.errors) / len(self.total)
        return error_rate > self.error_threshold


# Usage during deploy
monitor = AutoRollback(error_threshold=0.05)


async def monitored_endpoint(request):
    try:
        result = await process_with_new_version(request)
        monitor.record(success=True)
        return result
    except Exception:
        monitor.record(success=False)
        if monitor.should_rollback():
            trigger_rollback()
            notify_team("Automatic rollback triggered")
        raise
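trigger_rollback and notify_team are referenced above but never defined. A minimal sketch follows under two explicit assumptions that are not from the original: the serving layer reads its active prompt/model version from a local JSON file, and alerts go to a Slack incoming webhook configured via an environment variable.

# src/rollback.py (hypothetical implementations of the two calls above)
import json
import os
import urllib.request


def trigger_rollback(previous_version: str = "stable") -> None:
    """Point the serving config back at the last known-good version."""
    # Assumption: the request router re-reads this file on each request.
    with open("config/active_version.json", "w") as f:
        json.dump({"version": previous_version}, f)


def notify_team(message: str) -> None:
    """Post an alert to a Slack incoming webhook (URL from the environment)."""
    webhook = os.environ.get("SLACK_WEBHOOK_URL")
    if not webhook:
        return
    payload = json.dumps({"text": message}).encode()
    req = urllib.request.Request(
        webhook, data=payload, headers={"Content-Type": "application/json"}
    )
    urllib.request.urlopen(req)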
Summary
CI/CD for LLMs extends traditional pipelines with: prompt validation that never calls the LLM, evaluation against golden datasets, prompt versioning in YAML, A/B testing of variants, and automated rollback driven by error-rate metrics. Every prompt or configuration change should go through the same rigor as a code change.