Multimodal AI Agents: Beyond Text

What Are Multimodal Agents?

Multimodal AI agents can process and understand multiple types of input — text, images, audio, and video. This enables powerful new capabilities:

Visual understanding: Analyze images, screenshots, diagrams
Document processing: Extract info from PDFs with charts and tables
Video analysis: Summarize videos, extract key frames
Audio processing: Transcribe and analyze speech

Vision Language Models

GPT-4 Vision (GPT-4V)

from openai import OpenAI
import base64

client = OpenAI()

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# Encode the screenshot so it can be embedded in the request as a data URL
image_data = encode_image("screenshot.png")

# One user turn can carry several content parts: here a question plus an image
prompt_part = {"type": "text", "text": "What's in this image?"}
image_part = {
    "type": "image_url",
    "image_url": {"url": f"data:image/png;base64,{image_data}"},
}

response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[{"role": "user", "content": [prompt_part, image_part]}],
    max_tokens=500,
)

# Print the text of the first returned choice
print(response.choices[0].message.content)

Claude Vision

import anthropic
import base64

client = anthropic.Anthropic()

# Load the diagram from disk and base64-encode it for the request body
with open("diagram.png", "rb") as image_file:
    raw_bytes = image_file.read()
image_data = base64.standard_b64encode(raw_bytes).decode("utf-8")

# The user turn holds two content blocks: the image first, then the instruction
image_block = {
    "type": "image",
    "source": {
        "type": "base64",
        "media_type": "image/png",
        "data": image_data,
    },
}
text_block = {"type": "text", "text": "Explain this architecture diagram"}

response = client.messages.create(
    model="claude-3-sonnet-20240229",
    max_tokens=1024,
    messages=[{"role": "user", "content": [image_block, text_block]}],
)

# Print the text of the first content block in the reply
print(response.content[0].text)

Multimodal Use Cases

Document Analysis

Extract data from receipts, invoices, and forms with complex layouts.

UI Testing

Analyze screenshots to verify UI elements and detect bugs.

Chart Understanding

Interpret graphs and visualizations in reports.

Accessibility

Generate image descriptions for visually impaired users.

Building a Visual Agent

from openai import OpenAI
import base64
from typing import List

class VisualAgent:
    """Ask an OpenAI vision-capable chat model questions about local images.

    ``analyze_image`` keeps a running conversation in ``self.conversation``
    so later questions are sent together with earlier turns.
    ``compare_images`` is a one-shot request that neither reads nor updates
    that history.
    """

    def __init__(self, model: str = "gpt-4-vision-preview", max_tokens: int = 1000):
        """Create the API client and start with an empty conversation.

        Args:
            model: Chat model id used for every request.
            max_tokens: ``max_tokens`` value passed with every request.
        """
        # NOTE(review): the default is a preview model id - confirm it is
        # still served by the API before relying on it.
        self.client = OpenAI()
        self.model = model
        self.max_tokens = max_tokens
        self.conversation = []

    def encode_image(self, path: str) -> str:
        """Return the file at ``path`` as a base64-encoded string."""
        with open(path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    def _image_part(self, path: str) -> dict:
        """Build the ``image_url`` content part for the image at ``path``.

        The MIME type in the data URL is guessed from the file extension so
        that JPEG/GIF/WebP files are not mislabelled as PNG. Unknown or
        non-image extensions fall back to ``image/png``, which is what the
        data URL was previously hard-coded to.
        """
        import mimetypes  # local import: only this helper needs it

        mime_type, _ = mimetypes.guess_type(path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/png"

        encoded = self.encode_image(path)
        return {
            "type": "image_url",
            "image_url": {"url": f"data:{mime_type};base64,{encoded}"},
        }

    def analyze_image(self, image_path: str, question: str) -> str:
        """Ask ``question`` about one image and record the exchange.

        Args:
            image_path: Path of the image file to send.
            question: Text prompt sent alongside the image.

        Returns:
            The content of the first choice in the model's reply.
        """
        message = {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                self._image_part(image_path),
            ],
        }

        # Send history + new turn without mutating the history yet: if the
        # request raises, the conversation is left untouched and a retry
        # will not produce two consecutive user turns.
        response = self.client.chat.completions.create(
            model=self.model,
            messages=self.conversation + [message],
            max_tokens=self.max_tokens,
        )

        answer = response.choices[0].message.content
        self.conversation.append(message)
        self.conversation.append({"role": "assistant", "content": answer})

        return answer

    def compare_images(self, images: List[str], question: str) -> str:
        """Send several images with one question in a single request.

        Args:
            images: Paths of the image files, sent in the given order.
            question: Text prompt placed before the images.

        Returns:
            The content of the first choice in the model's reply.
        """
        content = [{"type": "text", "text": question}]
        content.extend(self._image_part(path) for path in images)

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": content}],
            max_tokens=self.max_tokens,
        )

        return response.choices[0].message.content

# Usage
agent = VisualAgent()

# Single image: the question and answer are kept in the agent's conversation
chart_question = "What trends do you see?"
result = agent.analyze_image("chart.png", chart_question)
print(result)

# Several images in one request
screenshots = ["before.png", "after.png"]
diff = agent.compare_images(
    screenshots,
    "What changed between these two screenshots?",
)
print(diff)

Multimodal RAG

Combine vision and text retrieval for document understanding:

from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
import fitz  # PyMuPDF
import base64

class MultimodalRAG:
    """Question answering over a PDF using text chunks plus page images.

    ``index_document`` embeds the PDF text into a Chroma vector store and
    renders every page to a PNG. ``query`` retrieves the closest text
    chunks and sends them to the chat model together with the rendered
    images of the pages those chunks came from.
    """

    def __init__(self):
        """Create the chat model and embedding clients; nothing is indexed yet."""
        self.llm = ChatOpenAI(model="gpt-4-vision-preview")
        self.embeddings = OpenAIEmbeddings()
        self.vectorstore = None
        # Initialised here so the attribute exists before index_document().
        self.images = []

    def extract_images_from_pdf(self, pdf_path: str):
        """Render every page of the PDF to a base64-encoded PNG.

        Returns:
            A list of ``{"page": <0-based page index>, "image": <base64 str>}``
            dicts, one per page, in page order.
        """
        doc = fitz.open(pdf_path)
        try:
            images = []
            for page_num, page in enumerate(doc):
                # Render the whole page (not just embedded pictures) using a
                # Matrix(2, 2) transform - presumably 2x zoom for legibility.
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                img_bytes = pix.tobytes("png")
                images.append({
                    "page": page_num,
                    "image": base64.b64encode(img_bytes).decode(),
                })
        finally:
            # Always release the file handle, even if rendering fails.
            doc.close()

        return images

    def index_document(self, pdf_path: str):
        """Index both text and page images from the PDF at ``pdf_path``."""
        # Extract text
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()

        # Split text
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )
        texts = splitter.split_documents(documents)

        vectorstore = Chroma.from_documents(
            documents=texts,
            embedding=self.embeddings,
        )
        images = self.extract_images_from_pdf(pdf_path)

        # Publish both results together so a failure part-way through never
        # leaves a new vector store paired with a previous document's images.
        self.vectorstore = vectorstore
        self.images = images

    def _select_page_images(self, docs, limit: int):
        """Pick up to ``limit`` page images, following the order of ``docs``.

        Pages are taken in the order their chunks appear in ``docs`` (the
        retrieval result), de-duplicated, so the page of the first chunk is
        never pushed out by a lower-numbered page when the limit applies.
        Chunks without a ``"page"`` metadata entry count as page 0.
        """
        by_page = {img["page"]: img for img in self.images}
        ranked_pages = dict.fromkeys(doc.metadata.get("page", 0) for doc in docs)
        matches = [by_page[page] for page in ranked_pages if page in by_page]
        return matches[:max(limit, 0)]

    def query(self, question: str, k: int = 3, max_images: int = 2) -> str:
        """Answer ``question`` from the indexed document.

        Args:
            question: The user's question.
            k: Number of text chunks to retrieve.
            max_images: Maximum number of page images attached to the prompt.

        Returns:
            The ``content`` of the chat model's response.

        Raises:
            RuntimeError: If ``index_document`` has not been called yet.
        """
        if self.vectorstore is None:
            raise RuntimeError(
                "No document has been indexed; call index_document() first."
            )

        # Get relevant text chunks
        docs = self.vectorstore.similarity_search(question, k=k)
        context = "\n".join(doc.page_content for doc in docs)

        # Build multimodal prompt: retrieved text first, then page images
        content = [
            {"type": "text", "text": f"Context:\n{context}\n\nQuestion: {question}"}
        ]
        content.extend(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{img['image']}"},
            }
            for img in self._select_page_images(docs, max_images)
        )

        response = self.llm.invoke([{"role": "user", "content": content}])
        return response.content

# Usage: index the report once, then ask a question against it
report_path = "annual_report.pdf"
q4_question = "What were the Q4 revenue trends shown in the charts?"

rag = MultimodalRAG()
rag.index_document(report_path)
answer = rag.query(q4_question)

Audio Processing

from openai import OpenAI

client = OpenAI()

# Transcribe audio
def transcribe_audio(audio_path: str) -> str:
    """Send the audio file at ``audio_path`` to ``whisper-1`` and return its text."""
    with open(audio_path, "rb") as audio_file:
        result = client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-1",
        )
    return result.text

# Generate speech
def text_to_speech(
    text: str,
    output_path: str,
    *,
    model: str = "tts-1",
    voice: str = "alloy",
):
    """Synthesize ``text`` to speech and write the audio to ``output_path``.

    Args:
        text: The text to speak.
        output_path: Destination file for the generated audio.
        model: Speech model id; defaults to the previously hard-coded value.
        voice: Voice name; defaults to the previously hard-coded value.
    """
    response = client.audio.speech.create(
        model=model,
        voice=voice,
        input=text,
    )
    response.stream_to_file(output_path)

# Voice agent
class VoiceAgent:
    """Conversational agent that takes audio in and can answer with audio.

    Speech is turned into text with the module-level ``transcribe_audio``
    and replies are voiced with the module-level ``text_to_speech``; the
    chat history lives in ``self.conversation``.
    """

    def __init__(self, model: str = "gpt-4"):
        """Create the API client and start with an empty conversation.

        Args:
            model: Chat model id used to answer the transcribed input.
        """
        self.client = OpenAI()
        self.model = model
        self.conversation = []

    def process_voice_input(self, audio_path: str) -> str:
        """Transcribe an audio file, get the model's reply, and record both.

        Args:
            audio_path: Path of the recorded user utterance.

        Returns:
            The content of the first choice in the model's reply.
        """
        # Transcribe
        text = transcribe_audio(audio_path)
        user_turn = {"role": "user", "content": text}

        # Send history + new turn without mutating the history yet: if the
        # request raises, the conversation is left untouched and a retry
        # will not produce two consecutive user turns.
        response = self.client.chat.completions.create(
            model=self.model,
            messages=self.conversation + [user_turn],
        )

        answer = response.choices[0].message.content
        self.conversation.append(user_turn)
        self.conversation.append({"role": "assistant", "content": answer})

        return answer

    def respond_with_voice(self, text: str, output_path: str):
        """Write a spoken version of ``text`` to ``output_path``."""
        text_to_speech(text, output_path)

Video Analysis

import cv2
import base64
from openai import OpenAI

class VideoAnalyzer:
    """Answer questions about a video by sampling frames for a vision model."""

    def __init__(self):
        """Create the OpenAI API client."""
        self.client = OpenAI()

    @staticmethod
    def _frame_indices(total_frames: int, num_frames: int):
        """Return evenly spaced frame indices, de-duplicated, in order.

        When the video has fewer frames than requested the raw spacing
        repeats indices; duplicates are dropped so the same frame is not
        decoded and sent more than once.
        """
        raw = [int(i * total_frames / num_frames) for i in range(num_frames)]
        return list(dict.fromkeys(raw))

    def extract_frames(self, video_path: str, num_frames: int = 5):
        """Extract evenly spaced frames from video.

        Args:
            video_path: Path of the video file.
            num_frames: How many sample positions to spread over the video.

        Returns:
            A list of base64-encoded JPEG strings, one per frame that could
            be read and encoded. The list is empty if the video cannot be
            opened or no frame can be decoded.
        """
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            if not cap.isOpened():
                return frames

            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            for idx in self._frame_indices(total_frames, num_frames):
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()
                if not ret:
                    continue
                # imencode reports success separately; skip frames it could
                # not encode instead of base64-ing an unusable buffer.
                encoded_ok, buffer = cv2.imencode('.jpg', frame)
                if encoded_ok:
                    frames.append(base64.b64encode(buffer).decode())
        finally:
            # Release the capture handle even if decoding raises.
            cap.release()

        return frames

    def analyze_video(self, video_path: str, question: str) -> str:
        """Ask ``question`` about the video using frames sampled from it.

        Args:
            video_path: Path of the video file.
            question: Question inserted into the prompt sent with the frames.

        Returns:
            The content of the first choice in the model's reply.

        Raises:
            ValueError: If no frame could be extracted from the video.
        """
        frames = self.extract_frames(video_path)
        if not frames:
            # Without frames the model would be answering about nothing.
            raise ValueError(f"No frames could be extracted from {video_path!r}")

        prompt = (
            f"\nThese are frames from a video. {question}\n"
            "\n"
            "Analyze the sequence and provide insights."
        )
        content = [{"type": "text", "text": prompt}]
        content.extend(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{frame}"},
            }
            for frame in frames
        )

        response = self.client.chat.completions.create(
            model="gpt-4-vision-preview",
            messages=[{"role": "user", "content": content}],
            max_tokens=1000,
        )

        return response.choices[0].message.content

# Usage: sample frames from the demo clip and ask what happens in it
video_question = "What actions are being performed in this video?"

analyzer = VideoAnalyzer()
summary = analyzer.analyze_video("demo.mp4", video_question)

Best Practices

Optimize image size: Resize images before sending (max ~2000px)
Use appropriate detail level: GPT-4V supports low/high detail modes
Batch when possible: Send multiple images in one request
Cache results: Store analysis results to avoid re-processing
Handle failures: Vision models can misinterpret images — validate outputs before relying on them
Consider costs: Image tokens are expensive — use them wisely

Build Multimodal AI Applications

Our Agentic AI program covers multimodal agents and vision language models. Learn to build AI that truly understands the world.

Explore Agentic AI Program

Multimodal AI Agents