ElevenLabs Integration: Text-to-Speech for Video Pipelines
Why ElevenLabs for Video Pipelines
Automated video production needs narration, and ElevenLabs produces the most natural-sounding text-to-speech I have tested. I integrated it into a pipeline that generates explainer videos automatically, and the voice quality is good enough that viewers rarely notice it is AI-generated.
Here is how I set up the integration and the production patterns I learned along the way.
Basic API Integration
import os
import re
import subprocess
import tempfile
from pathlib import Path

from elevenlabs import ElevenLabs
client = ElevenLabs(api_key="your-api-key")
def generate_speech(
    text: str,
    voice_id: str = "21m00Tcm4TlvDq8ikWAM",  # Rachel
    output_path: str = "output.mp3",
) -> str:
    """Synthesize *text* to speech and write the MP3 to *output_path*.

    Uses the module-level ElevenLabs client with the multilingual v2 model.
    Returns the path that was written so callers can chain on it.
    """
    settings = {
        "stability": 0.5,
        "similarity_boost": 0.75,
        "style": 0.0,
        "use_speaker_boost": True,
    }
    stream = client.text_to_speech.convert(
        text=text,
        voice_id=voice_id,
        model_id="eleven_multilingual_v2",
        voice_settings=settings,
    )
    # The API returns the audio as an iterable of byte chunks.
    with open(output_path, "wb") as out:
        for piece in stream:
            out.write(piece)
    return output_path
Voice Selection Strategy
Voice selection dramatically affects viewer engagement. I test multiple voices for each content type and measure audience retention.
# Tuned per content type; retention was measured against each profile.
VOICE_PROFILES = {
    "tech_tutorial": {
        "voice_id": "pNInz6obpgDQGcFmaJgB",  # Adam
        "stability": 0.6,
        "similarity_boost": 0.8,
        "style": 0.1,
        "description": "Clear, authoritative, moderate pace",
    },
    "storytelling": {
        "voice_id": "EXAVITQu4vr4xnSDxMaL",  # Bella
        "stability": 0.4,
        "similarity_boost": 0.7,
        "style": 0.3,
        "description": "Warm, expressive, varied pacing",
    },
    "news_style": {
        "voice_id": "ErXwobaYiN019PkySvjV",  # Antoni
        "stability": 0.7,
        "similarity_boost": 0.85,
        "style": 0.0,
        "description": "Professional, steady, neutral tone",
    },
}


def get_voice_config(content_type: str) -> dict:
    """Return the voice profile for *content_type*.

    Unknown content types fall back to the "tech_tutorial" profile.
    """
    try:
        return VOICE_PROFILES[content_type]
    except KeyError:
        return VOICE_PROFILES["tech_tutorial"]
Script Preparation for Natural Speech
Raw text does not produce good speech. You need to prepare the script with pauses, emphasis, and pronunciation hints.
class ScriptPreparer:
    """Rewrite raw script text so the TTS engine reads it naturally."""

    # Spoken forms for technical terms the model tends to mispronounce.
    ABBREVIATIONS = {
        "API": "A P I",
        "SQL": "sequel",
        "CLI": "C L I",
        "UI": "U I",
        "JSON": "Jason",
        "YAML": "Yamel",
        "AWS": "A W S",
        "LLM": "L L M",
    }

    _ONES = [
        "zero", "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
    ]
    _TENS = [
        "", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
        "eighty", "ninety",
    ]

    def prepare(self, raw_text: str) -> str:
        """Return *raw_text* with pauses, abbreviations, and numbers expanded."""
        text = raw_text
        # Ellipses after sentence ends cue the model to take a breath.
        text = text.replace(". ", ". ... ")
        text = self.expand_abbreviations(text)
        text = self.expand_numbers(text)
        return text

    def expand_abbreviations(self, text: str) -> str:
        """Replace known acronyms with phonetic spellings.

        Matches on word boundaries so e.g. "SQL" is not rewritten inside
        "SQLite" (the original used plain str.replace, which matched
        substrings and corrupted surrounding words).
        """
        for abbr, spoken in self.ABBREVIATIONS.items():
            text = re.sub(rf"\b{re.escape(abbr)}\b", spoken, text)
        return text

    def expand_numbers(self, text: str) -> str:
        """Spell out standalone integers from 0 to 9999 in English.

        BUG FIX: prepare() called this method but it was never defined,
        so every call crashed with AttributeError; this supplies the
        missing implementation. Longer digit runs, decimals, and digits
        attached to other word characters are left untouched.
        """
        return re.sub(
            r"\b\d{1,4}\b",
            lambda m: self._int_to_words(int(m.group())),
            text,
        )

    def _int_to_words(self, n: int) -> str:
        """Render 0 <= n <= 9999 as English words."""
        if n < 20:
            return self._ONES[n]
        if n < 100:
            tens, rest = divmod(n, 10)
            word = self._TENS[tens]
            return f"{word} {self._ONES[rest]}" if rest else word
        if n < 1000:
            hundreds, rest = divmod(n, 100)
            word = f"{self._ONES[hundreds]} hundred"
            return f"{word} {self._int_to_words(rest)}" if rest else word
        thousands, rest = divmod(n, 1000)
        word = f"{self._int_to_words(thousands)} thousand"
        return f"{word} {self._int_to_words(rest)}" if rest else word
Chunked Generation for Long Scripts
ElevenLabs has a character limit per request. For long scripts, I split at sentence boundaries and concatenate the audio.
import io
from pydub import AudioSegment
class LongFormGenerator:
def __init__(self, max_chars: int = 4000):
self.max_chars = max_chars
def generate_long(self, script: str, voice_config: dict) -> AudioSegment:
chunks = self._split_script(script)
audio_segments = []
for i, chunk in enumerate(chunks):
audio_data = client.text_to_speech.convert(
text=chunk,
voice_id=voice_config["voice_id"],
model_id="eleven_multilingual_v2",
voice_settings=voice_config
)
audio_bytes = b"".join(audio_data)
segment = AudioSegment.from_mp3(io.BytesIO(audio_bytes))
audio_segments.append(segment)
# Small pause between chunks
audio_segments.append(AudioSegment.silent(duration=300))
combined = sum(audio_segments)
return combined
def _split_script(self, script: str) -> list[str]:
sentences = script.split(". ")
chunks = []
current_chunk = []
current_length = 0
for sentence in sentences:
if current_length + len(sentence) > self.max_chars and current_chunk:
chunks.append(". ".join(current_chunk) + ".")
current_chunk = []
current_length = 0
current_chunk.append(sentence)
current_length += len(sentence)
if current_chunk:
chunks.append(". ".join(current_chunk))
return chunks
Video Pipeline Integration
class VideoNarrationPipeline:
    """End-to-end pipeline: script text -> video file with AI narration."""

    def __init__(self):
        self.script_preparer = ScriptPreparer()
        self.generator = LongFormGenerator()

    def create_narrated_video(
        self,
        video_path: str,
        script: str,
        content_type: str,
        output_path: str,
    ):
        """Narrate *script* in the voice for *content_type* and mux it onto *video_path*.

        Raises subprocess.CalledProcessError if the ffmpeg mux fails.
        """
        prepared = self.script_preparer.prepare(script)
        voice_config = get_voice_config(content_type)
        audio = self.generator.generate_long(prepared, voice_config)
        # BUG FIX: the original used the fixed name "temp_narration.mp3"
        # (races under concurrent runs) and only deleted it on success,
        # leaking the file whenever the mux raised. Use a unique temp
        # file and clean up in a finally block.
        fd, temp_audio = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)  # pydub reopens the path itself; we only need the name
        try:
            audio.export(temp_audio, format="mp3")
            self._mux_audio_video(video_path, temp_audio, output_path)
        finally:
            Path(temp_audio).unlink(missing_ok=True)

    def _mux_audio_video(self, video: str, audio: str, output: str):
        """Replace the video's audio track with *audio* (video stream copied as-is).

        NOTE: `subprocess` was used here but never imported anywhere in
        the file — the module import is required for this to run at all.
        """
        subprocess.run([
            "ffmpeg", "-i", video, "-i", audio,
            "-map", "0:v", "-map", "1:a",
            "-c:v", "copy", "-c:a", "aac",
            # -shortest trims output to the shorter of the two streams.
            "-shortest", output,
        ], check=True)
Cost Management
ElevenLabs pricing is based on characters. At scale, costs add up quickly. My strategies for managing spend:
- Cache generated audio by content hash to avoid regenerating identical scripts
- Use the lower-cost Turbo model for drafts and the full model for final renders
- Generate at 128kbps for web content rather than 320kbps
- Batch scripts and generate during off-peak hours if the API offers rate advantages
Quality Assurance
I run automated checks on generated audio before including it in final videos:
- Duration validation: narration length should approximately match expected reading time
- Silence detection: flag audio with unexpected gaps longer than 2 seconds
- Volume normalization: ensure consistent levels across all narration segments
ElevenLabs has made text-to-speech good enough for production video content. The key is proper script preparation, consistent voice selection, and systematic quality checks. Get those right, and automated narration becomes a reliable component of your video pipeline.