Voice AI Pipeline
Voice agents combine three core capabilities:
# Voice Agent Pipeline
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Speech │ │ LLM │ │ Speech │
│ Recognition │ -> │ Processing │ -> │ Synthesis │
│ (Whisper) │ │ (GPT-4) │ │ (TTS) │
└─────────────┘ └─────────────┘ └─────────────┘
│ │ │
Audio In Text Logic Audio Out
Speech Recognition with Whisper
from openai import OpenAI
# Module-level OpenAI client shared by transcribe() and
# transcribe_with_details() below.
# NOTE(review): presumably picks up credentials from the environment
# (e.g. OPENAI_API_KEY) — confirm against the deployment setup.
client = OpenAI()
# Transcribe audio file
def transcribe(audio_path: str) -> str:
    """Transcribe the audio file at ``audio_path``.

    The file is opened in binary mode and sent to the ``whisper-1`` model with
    ``response_format="text"``; the value returned by the API call is handed
    back to the caller unchanged.
    """
    request_options = {"model": "whisper-1", "response_format": "text"}
    with open(audio_path, "rb") as audio_file:
        return client.audio.transcriptions.create(
            file=audio_file,
            **request_options,
        )
# With language detection
def transcribe_with_details(audio_path: str) -> dict:
    """Transcribe ``audio_path`` and return the text plus metadata.

    Requests the ``verbose_json`` response format from ``whisper-1`` and
    copies three attributes of the result into a plain dict with the keys
    ``"text"``, ``"language"`` and ``"duration"``.
    """
    with open(audio_path, "rb") as audio_file:
        verbose = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="verbose_json",
        )
    # Same keys, same order, as the attributes read from the response.
    return {
        field: getattr(verbose, field)
        for field in ("text", "language", "duration")
    }
# Local Whisper (the `whisper` package) instead of the hosted API.
# NOTE: this transcribes a complete file in a single call; it is not
# real-time or streaming.
import whisper
# Load the "base" checkpoint.
model = whisper.load_model("base")
# Transcribe the whole file; the recognised text is read from result["text"].
result = model.transcribe("audio.mp3")
print(result["text"])
Text-to-Speech
from openai import OpenAI
from pathlib import Path
# OpenAI client used by text_to_speech() and stream_speech() below.
client = OpenAI()
def text_to_speech(
    text: str,
    output_path: str,
    voice: str = "alloy",
    model: str = "tts-1",
) -> None:
    """Convert text to speech and write the audio to ``output_path``.

    Args:
        text: The text to synthesize.
        output_path: File the generated audio is written to.
        voice: One of alloy, echo, fable, onyx, nova, shimmer.
        model: TTS model name; ``"tts-1"`` (default) or ``"tts-1-hd"`` for
            higher quality.
    """
    # Calling stream_to_file() on the plain create() result is deprecated in
    # the OpenAI SDK because the whole body is buffered first. The
    # with_streaming_response context manager writes chunks as they arrive
    # and releases the HTTP connection on exit.
    with client.audio.speech.with_streaming_response.create(
        model=model,
        voice=voice,
        input=text,
    ) as response:
        response.stream_to_file(output_path)
# Example: synthesize a greeting with the "nova" voice into greeting.mp3.
text_to_speech(
    text="Hello! I'm your AI assistant. How can I help you today?",
    output_path="greeting.mp3",
    voice="nova",
)
# Streaming TTS for lower latency
def stream_speech(text: str, voice: str = "alloy", chunk_size: int = 4096):
    """Yield synthesized speech for ``text`` chunk by chunk.

    Args:
        text: The text to synthesize.
        voice: Voice name passed to the TTS endpoint (default ``"alloy"``).
        chunk_size: Maximum number of bytes per yielded chunk.

    Yields:
        ``bytes`` chunks of the audio, in order.
    """
    # A plain speech.create() call downloads the full response before
    # returning, so iterating it afterwards gives no latency benefit.
    # with_streaming_response keeps the HTTP body open so the first chunk can
    # be yielded as soon as it arrives; the connection is released when the
    # generator finishes or is closed.
    with client.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice=voice,
        input=text,
    ) as response:
        yield from response.iter_bytes(chunk_size=chunk_size)
Complete Voice Agent
from openai import OpenAI
import pyaudio
import wave
import tempfile
import os
class VoiceAgent:
    """Turn-based voice assistant: record -> transcribe -> chat -> speak.

    Each turn records a fixed-length clip from the microphone, transcribes it
    with ``whisper-1``, sends the running conversation to a chat model and
    plays the synthesized reply through an external audio player.
    """

    # Recording parameters: mono, 16 kHz, read in 1024-frame chunks.
    CHUNK = 1024
    CHANNELS = 1
    RATE = 16000

    SYSTEM_PROMPT = (
        "You are a helpful voice assistant. "
        "Keep responses concise and conversational."
    )

    def __init__(self, *, chat_model="gpt-4", voice="nova",
                 player_command=("afplay",)):
        """Create the API client and an empty conversation history.

        Args:
            chat_model: Chat-completions model used for replies.
            voice: TTS voice used by ``speak``.
            player_command: Command (sequence of strings) that plays an MP3
                file given as its last argument and returns when playback
                ends. Defaults to macOS ``afplay``; on other platforms pass
                an MP3-capable player, e.g. ``["mpg123", "-q"]`` or
                ``["ffplay", "-nodisp", "-autoexit"]``.
        """
        self.client = OpenAI()
        self.conversation = []
        self.chat_model = chat_model
        self.voice = voice
        self.player_command = list(player_command)

    def record_audio(self, duration: int = 5) -> str:
        """Record ``duration`` seconds from the microphone into a WAV file.

        Returns:
            Path of a temporary ``.wav`` file. The file is not deleted
            automatically; the caller must remove it.
        """
        sample_format = pyaudio.paInt16
        audio = pyaudio.PyAudio()
        try:
            # Query the sample width while the PyAudio instance is still live.
            sample_width = audio.get_sample_size(sample_format)
            stream = audio.open(format=sample_format, channels=self.CHANNELS,
                                rate=self.RATE, input=True,
                                frames_per_buffer=self.CHUNK)
            try:
                print("Recording...")
                chunk_count = int(self.RATE / self.CHUNK * duration)
                frames = [stream.read(self.CHUNK) for _ in range(chunk_count)]
            finally:
                # Release the device even if a read fails part-way through.
                stream.stop_stream()
                stream.close()
        finally:
            audio.terminate()
        return self._write_wav(frames, sample_width)

    def _write_wav(self, frames: list, sample_width: int) -> str:
        """Write raw PCM ``frames`` to a new temporary WAV file; return its path."""
        # Close the handle right away: only the name is needed, and keeping it
        # open leaks a descriptor (and blocks reopening by name on Windows).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            wav_path = tmp.name
        try:
            with wave.open(wav_path, "wb") as wav_file:
                wav_file.setnchannels(self.CHANNELS)
                wav_file.setsampwidth(sample_width)
                wav_file.setframerate(self.RATE)
                wav_file.writeframes(b"".join(frames))
        except Exception:
            # Do not leave a half-written file behind.
            os.unlink(wav_path)
            raise
        return wav_path

    def transcribe(self, audio_path: str) -> str:
        """Convert the speech in ``audio_path`` to text using ``whisper-1``."""
        with open(audio_path, "rb") as audio_file:
            result = self.client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
            )
        return result.text

    def generate_response(self, user_input: str) -> str:
        """Send ``user_input`` plus the prior history to the chat model.

        On success both the user turn and the assistant reply are appended to
        ``self.conversation``. If the request raises, the user turn is removed
        again and the exception propagates.
        """
        self.conversation.append({"role": "user", "content": user_input})
        try:
            response = self.client.chat.completions.create(
                model=self.chat_model,
                messages=[
                    {"role": "system", "content": self.SYSTEM_PROMPT},
                    *self.conversation,
                ],
            )
        except Exception:
            # Keep the history consistent: no user turn without an answer.
            self.conversation.pop()
            raise
        assistant_message = response.choices[0].message.content
        self.conversation.append(
            {"role": "assistant", "content": assistant_message}
        )
        return assistant_message

    def speak(self, text: str):
        """Synthesize ``text`` and play it, blocking until the player exits.

        The audio is written to a temporary ``.mp3`` file which is always
        removed afterwards, including when synthesis or playback fails.
        """
        import subprocess

        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            speech_path = tmp.name
        try:
            # stream_to_file() on the plain create() result is deprecated in
            # the OpenAI SDK; the streaming-response form is the replacement.
            with self.client.audio.speech.with_streaming_response.create(
                model="tts-1",
                voice=self.voice,
                input=text,
            ) as response:
                response.stream_to_file(speech_path)
            # The player's exit status is deliberately ignored.
            subprocess.run([*self.player_command, speech_path], check=False)
        finally:
            os.unlink(speech_path)

    def run(self):
        """Main conversation loop; ends when the user says 'goodbye'."""
        print("Voice Agent ready. Speak to interact. Say 'goodbye' to exit.")
        while True:
            # Record
            audio_path = self.record_audio(duration=5)
            # Transcribe; remove the recording even if transcription fails.
            try:
                user_text = self.transcribe(audio_path)
            finally:
                os.unlink(audio_path)
            print(f"You: {user_text}")
            if not user_text.strip():
                # Nothing was recognised (e.g. silence): listen again rather
                # than sending an empty message to the chat model.
                continue
            if "goodbye" in user_text.lower():
                self.speak("Goodbye! Have a great day.")
                break
            # Generate response
            response = self.generate_response(user_text)
            print(f"Agent: {response}")
            # Speak
            self.speak(response)
# Run: create the agent and enter its conversation loop. run() only returns
# after the user says "goodbye".
agent = VoiceAgent()
agent.run()
Real-Time Voice with WebSockets
from fastapi import FastAPI, WebSocket
from openai import OpenAI
import base64
# FastAPI application that hosts the /voice WebSocket endpoint.
app = FastAPI()
# OpenAI client used to serve /voice requests.
client = OpenAI()
def _answer_audio(audio_data: bytes):
    """Run the blocking transcribe -> chat -> TTS pipeline for one utterance.

    Returns:
        ``(transcript_text, reply_text, speech_bytes)``.
    """
    # Upload the bytes directly as (filename, content) rather than going
    # through a fixed "temp.webm" on disk, which concurrent connections
    # would overwrite. The filename keeps the .webm extension.
    transcript = client.audio.transcriptions.create(
        model="whisper-1",
        file=("audio.webm", audio_data),
    )
    # Generate response
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": transcript.text}],
    )
    reply = response.choices[0].message.content
    # Convert to speech
    speech = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=reply,
    )
    return transcript.text, reply, speech.content


@app.websocket("/voice")
async def voice_chat(websocket: WebSocket):
    """WebSocket endpoint: receive an audio clip, answer with text and speech.

    For every binary message received, the handler sends back one JSON
    message ``{"transcript": ..., "reply": ...}`` followed by one binary
    message containing the synthesized reply audio. The loop ends quietly
    when the client disconnects.
    """
    from fastapi import WebSocketDisconnect
    from fastapi.concurrency import run_in_threadpool

    await websocket.accept()
    try:
        while True:
            # Receive audio data
            audio_data = await websocket.receive_bytes()
            # The OpenAI client is synchronous; run it in a worker thread so
            # other connections are not stalled on the event loop.
            transcript_text, reply, speech_bytes = await run_in_threadpool(
                _answer_audio, audio_data
            )
            # Send back
            await websocket.send_json({
                "transcript": transcript_text,
                "reply": reply,
            })
            await websocket.send_bytes(speech_bytes)
    except WebSocketDisconnect:
        # Client closed the socket: normal end of the session.
        return
Use Cases
Customer Service
Automated phone support with natural conversations.
Accessibility
Voice interfaces for users who can't type.
Hands-Free
Voice control while driving, cooking, or working.
Language Learning
Pronunciation practice and conversation partners.
Best Practices
- Handle silence: Detect when the user stops speaking
- Feedback: Audio cues for listening/processing states
- Interruption: Allow users to interrupt the agent
- Noise handling: Filter background noise
- Fallback: Offer text input as alternative
Build Voice AI Applications
Our Agentic AI program covers voice interfaces and conversational AI.
Explore Agentic AI Program