Procesamiento Multimodal y de Archivos
Las aplicaciones AI-First modernas no se limitan a texto. Los LLMs actuales pueden procesar imágenes, PDFs, audio y video. En esta lección aprenderás a implementar procesamiento multimodal en tu aplicación.
Visión con LLMs
OpenAI Vision
// Vision request: a single user message carrying both a text prompt and an image URL.
const imagePart = {
  type: 'image_url',
  image_url: {
    url: 'https://example.com/photo.jpg',
    // 'low' | 'high' | 'auto' — controls the resolution at which the model reads the image
    detail: 'high',
  },
};

const response = await openai.chat.completions.create({
  model: 'gpt-4o',
  max_tokens: 500,
  messages: [
    {
      role: 'user',
      content: [{ type: 'text', text: 'Describe qué ves en esta imagen.' }, imagePart],
    },
  ],
});
Imagen desde Base64
import { readFileSync } from 'fs';

// Read the screenshot from disk and encode it as base64 so it can be sent inline.
const screenshotBytes = readFileSync('./screenshot.png');
const base64Image = screenshotBytes.toString('base64');

const response = await openai.chat.completions.create({
  model: 'gpt-4o',
  messages: [
    {
      role: 'user',
      content: [
        { type: 'text', text: 'Analiza este screenshot de UI y sugiere mejoras.' },
        {
          type: 'image_url',
          // A data URL embeds the image bytes directly — no public hosting needed.
          image_url: { url: `data:image/png;base64,${base64Image}` },
        },
      ],
    },
  ],
});
Claude Vision
// Claude takes images as an explicit base64 source object (not a URL wrapper);
// the image part may precede the text instruction in the content array.
const imageBlock = {
  type: 'image',
  source: { type: 'base64', media_type: 'image/png', data: base64Image },
};
const textBlock = { type: 'text', text: 'Extrae todo el texto visible en esta imagen.' };

const response = await anthropic.messages.create({
  model: 'claude-sonnet-4-20250514',
  max_tokens: 1024,
  messages: [{ role: 'user', content: [imageBlock, textBlock] }],
});
Procesamiento de PDFs
Extraer texto de PDFs
import { PdfReader } from 'pdfreader';
/**
 * Extracts the full text of a PDF by streaming parsed items from pdfreader.
 * Resolves with all text fragments joined by single spaces once the reader
 * signals end-of-file (a null item); rejects on the first parse error.
 */
async function extractTextFromPDF(filePath: string): Promise<string> {
  return new Promise((resolve, reject) => {
    const fragments: string[] = [];
    new PdfReader().parseFileItems(filePath, (err, item) => {
      if (err) {
        reject(err);
      } else if (!item) {
        // A null item marks the end of the document.
        resolve(fragments.join(' '));
      } else if (item.text) {
        fragments.push(item.text);
      }
    });
  });
}
// Alternativa: pdf-parse (más simple)
import pdf from 'pdf-parse';
/** Extracts all text from an in-memory PDF using pdf-parse (the simpler alternative). */
async function extractPDF(buffer: Buffer): Promise<string> {
  const parsed = await pdf(buffer);
  // pdf-parse exposes the whole document's text on `.text`
  return parsed.text;
}
PDF → Chunks → RAG
/**
 * Ingests a PDF into the RAG store: extract text → chunk → embed → insert.
 *
 * @param file     Raw PDF bytes.
 * @param metadata Arbitrary metadata stored with every chunk; chunkIndex and
 *                 totalChunks are appended per row.
 * @returns Object with the number of chunks processed.
 */
async function ingestPDF(file: Buffer, metadata: Record<string, unknown>) {
  // 1. Extract plain text
  const text = await extractPDF(file);

  // 2. Split into ~1000-char chunks for embedding
  const chunks = recursiveChunk(text, 1000);
  if (chunks.length === 0) {
    // Image-only / empty PDFs produce no text; avoid an invalid empty embeddings request.
    return { chunksProcessed: 0 };
  }

  // 3. One batched embeddings request for all chunks
  const response = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: chunks,
  });

  // 4. Store each chunk with its embedding and positional metadata
  for (const [i, chunk] of chunks.entries()) {
    await db.query(
      `INSERT INTO documents (content, embedding, metadata)
       VALUES ($1, $2::vector, $3)`,
      [
        chunk,
        JSON.stringify(response.data[i].embedding),
        JSON.stringify({ ...metadata, chunkIndex: i, totalChunks: chunks.length }),
      ]
    );
  }

  return { chunksProcessed: chunks.length };
}
PDFs con visión (scanned PDFs)
import { fromPath } from 'pdf2pic';
/**
 * OCRs a scanned (image-only) PDF by rasterizing each page and asking
 * GPT-4o vision to transcribe it.
 *
 * NOTE(review): the name has a typo ("Scaned") — kept for caller
 * compatibility; consider adding a correctly-spelled alias.
 *
 * @param pdfPath Path to the scanned PDF on disk.
 * @returns Concatenated text of all pages, separated by blank lines.
 */
async function processScanedPDF(pdfPath: string): Promise<string> {
  // Render every page at 300 DPI — high enough for reliable OCR.
  const converter = fromPath(pdfPath, {
    density: 300,
    format: 'png',
    width: 2000,
    height: 2800,
  });
  const pages = await converter.bulk(-1); // -1 = convert all pages

  let fullText = '';
  // Pages are processed sequentially to preserve page order in the output
  // and avoid bursting the API rate limit.
  for (const page of pages) {
    const imageBase64 = readFileSync(page.path!).toString('base64');
    const response = await openai.chat.completions.create({
      model: 'gpt-4o',
      messages: [{
        role: 'user',
        content: [
          { type: 'text', text: 'Extrae todo el texto de esta página. Solo el texto, sin comentarios.' },
          { type: 'image_url', image_url: { url: `data:image/png;base64,${imageBase64}` } },
        ],
      }],
    });
    // `content` may be null (e.g. a refusal); coalesce so we never append the string "null".
    fullText += (response.choices[0].message.content ?? '') + '\n\n';
  }
  return fullText;
}
Audio con Whisper
Transcripción
import { createReadStream, writeFileSync } from 'fs';
/**
 * Transcribes an audio file to plain text with Whisper.
 * The language hint is optional but improves accuracy when known upfront.
 */
async function transcribeAudio(audioPath: string): Promise<string> {
  const result = await openai.audio.transcriptions.create({
    file: createReadStream(audioPath),
    model: 'whisper-1',
    response_format: 'text',
    language: 'es', // optional: audio language hint
  });
  return result;
}
// Con timestamps
/** Transcribes audio and returns per-segment timestamps (verbose_json mode). */
async function transcribeWithTimestamps(audioPath: string) {
  const { segments } = await openai.audio.transcriptions.create({
    model: 'whisper-1',
    file: createReadStream(audioPath),
    response_format: 'verbose_json',
    timestamp_granularities: ['segment'],
  });
  // `segments` may be absent on the response type; keep only the fields we need.
  return segments?.map(({ start, end, text }) => ({ start, end, text }));
}
Text-to-Speech (TTS)
/**
 * Synthesizes speech from text and writes the resulting audio file to disk.
 *
 * @param text       Text to convert to speech.
 * @param outputPath Destination path for the audio file (API default format: mp3).
 */
async function textToSpeech(text: string, outputPath: string): Promise<void> {
  const response = await openai.audio.speech.create({
    model: 'tts-1', // 'tts-1-hd' for higher quality
    voice: 'alloy', // alloy | echo | fable | onyx | nova | shimmer
    input: text,
    speed: 1.0,
  });
  // The SDK returns a fetch-style Response; buffer the body before writing.
  const buffer = Buffer.from(await response.arrayBuffer());
  writeFileSync(outputPath, buffer);
}
Upload de archivos: API endpoint
import { Hono } from 'hono';
import multer from 'multer';
// Multer upload guard: 25MB size cap plus a MIME-type allow-list.
// NOTE(review): multer is Express middleware — the Hono handler below reads
// formData directly and never applies `upload`, so these limits are not
// actually enforced on that route; verify intent.
const upload = multer({
limits: { fileSize: 25 * 1024 * 1024 }, // 25MB
fileFilter: (req, file, cb) => {
// cb(null, false) silently rejects files outside this allow-list.
const allowed = [
'application/pdf',
'image/png', 'image/jpeg', 'image/webp',
'audio/mpeg', 'audio/wav', 'audio/webm',
'text/plain', 'text/markdown',
];
cb(null, allowed.includes(file.mimetype));
},
});
/**
 * POST /api/upload — receives one multipart file and dispatches it to the
 * processor matching its MIME type (PDF, image, audio, or plain text).
 */
app.post('/api/upload', authMiddleware, async (c) => {
  const formData = await c.req.formData();
  const file = formData.get('file') as File;
  if (!file) return c.json({ error: 'No file uploaded' }, 400);

  // Enforce the 25MB cap here: this Hono handler parses the form itself,
  // so any middleware-level multer limit does not apply to this route.
  if (file.size > 25 * 1024 * 1024) {
    return c.json({ error: 'Archivo demasiado grande (máx 25MB)' }, 413);
  }

  const buffer = Buffer.from(await file.arrayBuffer());
  const mimeType = file.type;

  try {
    let result;
    if (mimeType === 'application/pdf') {
      result = await processPDF(buffer);
    } else if (mimeType.startsWith('image/')) {
      result = await processImage(buffer, mimeType);
    } else if (mimeType.startsWith('audio/')) {
      result = await processAudio(buffer);
    } else if (mimeType.startsWith('text/')) {
      result = await processText(buffer.toString('utf-8'));
    } else {
      return c.json({ error: 'Tipo de archivo no soportado' }, 400);
    }
    return c.json(result);
  } catch (err) {
    // A failing processor previously produced an unhandled rejection; map it to a 500.
    console.error('File processing failed:', err);
    return c.json({ error: 'Error procesando el archivo' }, 500);
  }
});
/** Sends an uploaded image to GPT-4o and returns a detailed description of it. */
async function processImage(buffer: Buffer, mimeType: string) {
  // Inline the image as a data URL so no intermediate storage is required.
  const dataUrl = `data:${mimeType};base64,${buffer.toString('base64')}`;
  const response = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [{
      role: 'user',
      content: [
        { type: 'text', text: 'Describe esta imagen en detalle.' },
        { type: 'image_url', image_url: { url: dataUrl } },
      ],
    }],
  });
  return {
    type: 'image',
    description: response.choices[0].message.content,
  };
}
Frontend: Componente de File Upload
/**
 * Drag-and-drop / click-to-browse upload widget.
 * POSTs the file to /api/upload and passes the parsed JSON result to onUpload.
 */
function FileUpload({ onUpload }: { onUpload: (result: any) => void }) {
  const [uploading, setUploading] = useState(false);
  const [dragOver, setDragOver] = useState(false);

  const handleFile = async (file: File) => {
    setUploading(true);
    const formData = new FormData();
    formData.append('file', file);
    try {
      const response = await fetch('/api/upload', {
        method: 'POST',
        headers: { Authorization: `Bearer ${token}` },
        body: formData,
      });
      // Surface HTTP errors instead of forwarding an error payload to onUpload
      // as if it were a successful processing result.
      if (!response.ok) {
        throw new Error(`Upload failed with status ${response.status}`);
      }
      const result = await response.json();
      onUpload(result);
    } catch (error) {
      console.error('Upload failed:', error);
    } finally {
      setUploading(false);
    }
  };

  return (
    <div
      onDragOver={(e) => { e.preventDefault(); setDragOver(true); }}
      onDragLeave={() => setDragOver(false)}
      onDrop={(e) => {
        e.preventDefault();
        setDragOver(false);
        const file = e.dataTransfer.files[0];
        if (file) handleFile(file);
      }}
      className={`border-2 border-dashed rounded-lg p-8 text-center transition
        ${dragOver ? 'border-blue-500 bg-blue-50' : 'border-gray-300'}
        ${uploading ? 'opacity-50' : 'cursor-pointer'}`}
    >
      {/* Hidden input; the label below makes the whole area clickable. */}
      <input
        type="file"
        onChange={(e) => e.target.files?.[0] && handleFile(e.target.files[0])}
        className="hidden"
        id="file-input"
        accept=".pdf,.png,.jpg,.jpeg,.webp,.mp3,.wav,.txt,.md"
      />
      <label htmlFor="file-input" className="cursor-pointer">
        {uploading ? (
          <p>Procesando archivo...</p>
        ) : (
          <>
            <p className="text-lg font-medium">Arrastra un archivo aquí</p>
            <p className="text-sm text-gray-500 mt-1">
              PDF, imágenes, audio o texto (máx 25MB)
            </p>
          </>
        )}
      </label>
    </div>
  );
}