What Are Multimodal Agents?
Multimodal AI agents can process and understand multiple types of input: text, images, audio, and video. This enables powerful new capabilities:
- Visual understanding: Analyze images, screenshots, diagrams
- Document processing: Extract info from PDFs with charts and tables
- Video analysis: Summarize videos, extract key frames
- Audio processing: Transcribe and analyze speech
Vision Language Models
GPT-4 Vision (GPT-4V)
from openai import OpenAI
import base64
client = OpenAI()
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    # b64encode yields bytes; decode so the result can be embedded in a string.
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
# Analyze an image
# NOTE: the previously used "gpt-4-vision-preview" model has been shut down by
# OpenAI; "gpt-4o" is the vision-capable replacement and takes the same
# message format.
image_data = encode_image("screenshot.png")
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "user",
            # A multimodal message is a list of typed parts: text plus images.
            "content": [
                {"type": "text", "text": "What's in this image?"},
                {
                    "type": "image_url",
                    # The image is inlined as a base64 data URL.
                    "image_url": {
                        "url": f"data:image/png;base64,{image_data}"
                    },
                },
            ],
        }
    ],
    max_tokens=500,
)
print(response.choices[0].message.content)
Claude Vision
import anthropic
import base64
client = anthropic.Anthropic()

# Read and encode image
with open("diagram.png", "rb") as f:
    image_data = base64.standard_b64encode(f.read()).decode("utf-8")

# NOTE: the previously used "claude-3-sonnet-20240229" model has been retired
# by Anthropic; a currently served Sonnet model is used instead.
response = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": [
                # Image part: raw base64 plus an explicit media type.
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": image_data,
                    },
                },
                {
                    "type": "text",
                    "text": "Explain this architecture diagram",
                },
            ],
        }
    ],
)
print(response.content[0].text)
Multimodal Use Cases
Document Analysis
Extract data from receipts, invoices, forms with complex layouts.
UI Testing
Analyze screenshots to verify UI elements and detect bugs.
Chart Understanding
Interpret graphs and visualizations in reports.
Accessibility
Generate image descriptions for visually impaired users.
Building a Visual Agent
from openai import OpenAI
import base64
from typing import List
class VisualAgent:
    """Ask an OpenAI vision model questions about local image files.

    ``analyze_image`` keeps a running conversation in ``self.conversation`` so
    later questions are sent together with earlier turns. ``compare_images``
    is a one-shot request that neither reads nor updates that history.
    """

    def __init__(self, model: str = "gpt-4o"):
        """Create the API client and an empty conversation.

        Args:
            model: Vision-capable chat model to call. The default replaces
                the previously hard-coded "gpt-4-vision-preview", which
                OpenAI has shut down.
        """
        self.client = OpenAI()
        self.model = model
        self.conversation = []

    def encode_image(self, path: str) -> str:
        """Return the bytes of the file at ``path`` as base64 text."""
        with open(path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    def _image_part(self, path: str) -> dict:
        """Build the ``image_url`` message part for the image at ``path``.

        The media type in the data URL is guessed from the file extension so
        that e.g. JPEG files are not labelled as PNG. Unknown or non-image
        extensions fall back to "image/png".
        """
        import mimetypes

        mime_type, _ = mimetypes.guess_type(path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/png"
        image_data = self.encode_image(path)
        return {
            "type": "image_url",
            "image_url": {"url": f"data:{mime_type};base64,{image_data}"},
        }

    def analyze_image(self, image_path: str, question: str) -> str:
        """Ask ``question`` about one image and record the exchange.

        Args:
            image_path: Path of the image file to send.
            question: Text prompt sent alongside the image.

        Returns:
            The model's answer text.
        """
        message = {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                self._image_part(image_path),
            ],
        }
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[*self.conversation, message],
            max_tokens=1000,
        )
        answer = response.choices[0].message.content
        # Commit the turn only after the call succeeded, so a failed request
        # does not leave an unanswered user message in the history.
        self.conversation.append(message)
        self.conversation.append({"role": "assistant", "content": answer})
        return answer

    def compare_images(self, images: List[str], question: str) -> str:
        """Ask ``question`` about several images in a single request.

        Args:
            images: Paths of the image files, sent in the given order.
            question: Text prompt sent ahead of the images.

        Returns:
            The model's answer text. The conversation history is untouched.
        """
        content = [{"type": "text", "text": question}]
        content.extend(self._image_part(path) for path in images)
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": content}],
            max_tokens=1000,
        )
        return response.choices[0].message.content
# Example: one agent, a single-image question, then a two-image comparison.
agent = VisualAgent()

# Single image
chart_question = "What trends do you see?"
result = agent.analyze_image("chart.png", chart_question)
print(result)

# Several images in one request
screenshots = ["before.png", "after.png"]
diff = agent.compare_images(
    screenshots,
    "What changed between these two screenshots?",
)
print(diff)
Multimodal RAG
Combine vision and text retrieval for document understanding:
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
import fitz # PyMuPDF
import base64
class MultimodalRAG:
    """Answer questions about a PDF from retrieved text plus page images.

    Call ``index_document`` first; ``query`` then retrieves the most similar
    text chunks, attaches renderings of the pages they came from, and sends
    both to the chat model.
    """

    def __init__(self, model: str = "gpt-4o"):
        """Create the chat model and embedding client; nothing is indexed yet.

        Args:
            model: Vision-capable chat model name. The default replaces the
                previously hard-coded "gpt-4-vision-preview", which OpenAI
                has shut down.
        """
        self.llm = ChatOpenAI(model=model)
        self.embeddings = OpenAIEmbeddings()
        self.vectorstore = None
        # Defined up front so the attribute exists before index_document().
        self.images = []

    def extract_images_from_pdf(self, pdf_path: str):
        """Render every page of the PDF to a base64-encoded PNG.

        Returns:
            A list of ``{"page": <0-based index>, "image": <base64 str>}``
            dicts, one per page, in page order.
        """
        doc = fitz.open(pdf_path)
        try:
            images = []
            for page_num in range(len(doc)):
                page = doc[page_num]
                # Matrix(2, 2) renders the page at 2x scale on both axes.
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                img_bytes = pix.tobytes("png")
                images.append({
                    "page": page_num,
                    "image": base64.b64encode(img_bytes).decode(),
                })
            return images
        finally:
            # Release the file handle even if rendering a page fails.
            doc.close()

    def index_document(self, pdf_path: str):
        """Index both text and images from PDF.

        Text is split into overlapping chunks and embedded into a Chroma
        vector store; page renderings are kept in ``self.images``.
        """
        # Extract text
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()

        # Split text
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )
        texts = splitter.split_documents(documents)

        # Create vector store
        self.vectorstore = Chroma.from_documents(
            documents=texts,
            embedding=self.embeddings,
        )

        # Store images separately
        self.images = self.extract_images_from_pdf(pdf_path)

    def query(self, question: str) -> str:
        """Answer ``question`` using retrieved text and matching page images.

        Raises:
            RuntimeError: If no document has been indexed yet.
        """
        if self.vectorstore is None:
            raise RuntimeError(
                "No document indexed; call index_document() before query()."
            )

        # Get relevant text chunks
        docs = self.vectorstore.similarity_search(question, k=3)
        context = "\n".join(doc.page_content for doc in docs)

        # Get relevant page images.
        # NOTE(review): assumes the loader's "page" metadata uses the same
        # 0-based numbering as extract_images_from_pdf — confirm.
        page_nums = {doc.metadata.get("page", 0) for doc in docs}
        relevant_images = [
            img for img in self.images
            if img["page"] in page_nums
        ][:2]  # Limit to 2 images

        # Build multimodal prompt
        content = [
            {"type": "text", "text": f"Context:\n{context}\n\nQuestion: {question}"}
        ]
        for img in relevant_images:
            content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{img['image']}"},
            })

        response = self.llm.invoke([{"role": "user", "content": content}])
        return response.content
# Example: index one PDF, then ask a question about its charts.
report_path = "annual_report.pdf"
chart_question = "What were the Q4 revenue trends shown in the charts?"

rag = MultimodalRAG()
rag.index_document(report_path)
answer = rag.query(chart_question)
Audio Processing
from openai import OpenAI
client = OpenAI()
# Transcribe audio
def transcribe_audio(audio_path: str) -> str:
    """Send the audio file at ``audio_path`` to "whisper-1" and return the transcript text."""
    with open(audio_path, "rb") as audio_file:
        result = client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-1",
        )
    return result.text
# Generate speech
def text_to_speech(text: str, output_path: str):
    """Synthesize ``text`` with the "tts-1" model ("alloy" voice) and write the audio to ``output_path``."""
    speech_request = {"model": "tts-1", "voice": "alloy", "input": text}
    audio = client.audio.speech.create(**speech_request)
    audio.stream_to_file(output_path)
# Voice agent
class VoiceAgent:
    """Chat agent whose input is a recorded audio file and whose replies can be spoken.

    Transcription and speech synthesis are delegated to the module-level
    ``transcribe_audio`` and ``text_to_speech`` functions.
    """

    def __init__(self, model: str = "gpt-4"):
        """Create the API client and an empty conversation.

        Args:
            model: Chat model used to answer the transcribed input.
        """
        self.client = OpenAI()
        self.model = model
        self.conversation = []

    def process_voice_input(self, audio_path: str) -> str:
        """Transcribe the audio at ``audio_path``, get a reply, and record both turns.

        Returns:
            The model's answer text.
        """
        # Transcribe
        text = transcribe_audio(audio_path)

        # Process with LLM
        user_turn = {"role": "user", "content": text}
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[*self.conversation, user_turn],
        )
        answer = response.choices[0].message.content

        # Commit the exchange only after the call succeeded, so a failed
        # request does not leave an unanswered user message in the history.
        self.conversation.append(user_turn)
        self.conversation.append({"role": "assistant", "content": answer})
        return answer

    def respond_with_voice(self, text: str, output_path: str):
        """Synthesize ``text`` to an audio file at ``output_path``."""
        text_to_speech(text, output_path)
Video Analysis
import cv2
import base64
from openai import OpenAI
class VideoAnalyzer:
    """Answer questions about a video by sending sampled frames to a vision model."""

    def __init__(self, model: str = "gpt-4o"):
        """Create the API client.

        Args:
            model: Vision-capable chat model to call. The default replaces
                the previously hard-coded "gpt-4-vision-preview", which
                OpenAI has shut down.
        """
        self.client = OpenAI()
        self.model = model

    def extract_frames(self, video_path: str, num_frames: int = 5):
        """Extract evenly spaced frames from video.

        Args:
            video_path: Path of the video file.
            num_frames: Maximum number of frames to sample.

        Returns:
            A list of base64-encoded JPEG strings, in playback order. Frames
            that cannot be read or encoded are skipped, so the list may be
            shorter than ``num_frames`` (or empty).
        """
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            # A clip shorter than num_frames maps several samples to the same
            # index; dict.fromkeys drops the repeats while keeping order.
            frame_indices = list(dict.fromkeys(
                int(i * total_frames / num_frames) for i in range(num_frames)
            ))

            frames = []
            for idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()
                if not ret:
                    continue
                encoded_ok, buffer = cv2.imencode('.jpg', frame)
                if encoded_ok:
                    frames.append(base64.b64encode(buffer).decode())
            return frames
        finally:
            # Release the capture handle even if reading or encoding raises.
            cap.release()

    def analyze_video(self, video_path: str, question: str) -> str:
        """Ask ``question`` about the video, using sampled frames as evidence.

        Returns:
            The model's answer text.

        Raises:
            ValueError: If no frame could be extracted from the video.
        """
        frames = self.extract_frames(video_path)
        if not frames:
            # Without images the model could only guess at the video content.
            raise ValueError(f"No frames could be read from {video_path!r}")

        prompt = (
            f"\nThese are frames from a video. {question}"
            "\nAnalyze the sequence and provide insights."
        )
        content = [{"type": "text", "text": prompt}]
        content.extend(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{frame}"},
            }
            for frame in frames
        )

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": content}],
            max_tokens=1000,
        )
        return response.choices[0].message.content
# Example: ask what happens in a short clip.
video_file = "demo.mp4"
video_question = "What actions are being performed in this video?"

analyzer = VideoAnalyzer()
summary = analyzer.analyze_video(video_file, video_question)
Best Practices
- Optimize image size: Resize images before sending (max ~2000px)
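- Use an appropriate detail level: OpenAI vision models accept a `detail` setting (`low`, `high`, or `auto`) on each image; `low` is cheaper and faster, while `high` preserves fine detail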
- Batch when possible: Send multiple images in one request
- Cache results: Store analysis results to avoid re-processing
- Handle failures: Vision models can misinterpret images, so validate their outputs before acting on them
- Consider costs: Image tokens are expensive - use wisely
Build Multimodal AI Applications
Our Agentic AI program covers multimodal agents and vision language models. Learn to build AI that truly understands the world.
Explore Agentic AI Program