Text-to-speech using Microsoft Edge TTS (edge-tts Python package). Use when: generating voice narration, TTS per scene, creating audio from text, converting script to speech, generating mp3 from subtitle text, voice synthesis for video narration, edge-tts voice list, changing TTS voice/rate/pitch/volume.
Generate high-quality text-to-speech audio using Microsoft Edge's online TTS service via the edge-tts Python package. Free, no API key required.
- Prefer the Python API (`edge_tts.Communicate`) over the CLI for integration.
- Subtitles: the CLI writes SRT via `--write-subtitles`; in Python, `communicate.save()`'s second argument writes word-timing metadata (use `SubMaker` if you need true SRT output).
- Voice configuration: read the `EDGE_TTS_NAME` env var, falling back to `en-US-AriaNeural`.
- Use `pathlib.Path` for all file paths.
- `edge_tts` is async; call `asyncio.run()` at the entry point.
- Install: `pip install edge-tts`
Add to requirements.txt:
edge-tts>=7.0.0
# .env
EDGE_TTS_NAME=id-ID-ArdiNeural # Voice name (see voice list below)
import asyncio
import edge_tts
from pathlib import Path
async def generate_tts(
    text: str,
    output_audio: Path,
    output_srt: Path,
    voice: str = "id-ID-ArdiNeural",
    rate: str = "+0%",
    volume: str = "+0%",
    pitch: str = "+0Hz",
) -> None:
    """Generate MP3 audio and a word-level SRT subtitle file from text.

    Streams audio from Microsoft Edge's online TTS service (network call,
    no API key) and builds the SRT from the WordBoundary events.

    Args:
        text: The narration text to synthesize.
        output_audio: Destination MP3 path (parent dirs created as needed).
        output_srt: Destination SRT path (parent dirs created as needed).
        voice: Edge TTS voice ShortName, e.g. "id-ID-ArdiNeural".
        rate: Speaking-rate offset, "+N%" / "-N%".
        volume: Volume offset, "+N%" / "-N%".
        pitch: Pitch offset, "+NHz" / "-NHz".
    """
    # Create both parent directories — audio and SRT may live in
    # different folders. (The original only created the audio's parent.)
    output_audio.parent.mkdir(parents=True, exist_ok=True)
    output_srt.parent.mkdir(parents=True, exist_ok=True)

    communicate = edge_tts.Communicate(
        text=text,
        voice=voice,
        rate=rate,
        volume=volume,
        pitch=pitch,
    )

    # Communicate.save()'s second argument writes raw word-boundary JSON
    # metadata, not SRT. Stream instead and feed WordBoundary events to
    # SubMaker (edge-tts >= 7) to produce a real SRT file.
    submaker = edge_tts.SubMaker()
    with output_audio.open("wb") as audio_file:
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_file.write(chunk["data"])
            elif chunk["type"] == "WordBoundary":
                submaker.feed(chunk)
    output_srt.write_text(submaker.get_srt(), encoding="utf-8")
# Usage — synchronous entry point for the async API.
# NOTE: performs a network call to Microsoft's TTS endpoint.
asyncio.run(generate_tts(
    text="Tahukah kamu bahwa Titanic punya kolam renang?",
    output_audio=Path("output/scene_1/audio_1.mp3"),
    output_srt=Path("output/scene_1/subtitle_1.srt"),
))
import asyncio
import edge_tts
from pathlib import Path
async def tts_audio_only(text: str, output: Path, voice: str = "id-ID-ArdiNeural") -> None:
    """Synthesize *text* to an MP3 at *output* using *voice* (no subtitles)."""
    await edge_tts.Communicate(text=text, voice=voice).save(str(output))
# Quick one-off synthesis (network call); writes hello.mp3 to the CWD.
asyncio.run(tts_audio_only("Hello world", Path("hello.mp3")))
# Basic generation with subtitles (writes an SRT timing file alongside the MP3)
edge-tts --text "Hello, world!" --write-media hello.mp3 --write-subtitles hello.srt
# With voice selection (ShortName from the voice list)
edge-tts --voice id-ID-ArdiNeural --text "Halo dunia!" --write-media hello_id.mp3
# Adjust rate, volume, pitch (use the --flag=value form so leading "-" values parse)
edge-tts --rate=-20% --volume=+10% --pitch=-10Hz --text "Slower and deeper" --write-media adjusted.mp3
# List all available voices
edge-tts --list-voices
import asyncio
import edge_tts
async def list_voices(language_prefix: str = "id-ID") -> list[dict]:
    """Fetch all Edge TTS voices and keep those whose ShortName starts with *language_prefix*."""
    all_voices = await edge_tts.list_voices()
    matching = []
    for voice in all_voices:
        if voice["ShortName"].startswith(language_prefix):
            matching.append(voice)
    return matching
# Example: print ShortName, Gender, and Locale for each Indonesian voice.
# NOTE: fetches the voice list over the network.
voices = asyncio.run(list_voices("id-ID"))
for v in voices:
    print(f"{v['ShortName']:30s} {v['Gender']:8s} {v.get('Locale', '')}")
import asyncio
import edge_tts
from pathlib import Path
async def batch_tts(
    scenes: list[dict],  # [{"text": "...", "audio": Path, "srt": Path}, ...]
    voice: str = "id-ID-ArdiNeural",
    rate: str = "+0%",
) -> None:
    """Generate MP3 + SRT for each scene, sequentially.

    Args:
        scenes: List of dicts with "text" (str), "audio" (Path), "srt" (Path).
        voice: Edge TTS voice ShortName applied to every scene.
        rate: Speaking-rate offset applied to every scene.
    """
    for scene in scenes:
        audio_path = Path(scene["audio"])
        srt_path = Path(scene["srt"])
        print(f"Generating TTS: {scene['audio']}")

        # Create output directories up front — the example paths
        # (output/scene_N/...) don't exist on a fresh run.
        audio_path.parent.mkdir(parents=True, exist_ok=True)
        srt_path.parent.mkdir(parents=True, exist_ok=True)

        comm = edge_tts.Communicate(text=scene["text"], voice=voice, rate=rate)

        # save()'s second argument emits word-boundary JSON metadata, not
        # SRT; stream and use SubMaker (edge-tts >= 7) for real subtitles.
        submaker = edge_tts.SubMaker()
        with audio_path.open("wb") as audio_file:
            async for chunk in comm.stream():
                if chunk["type"] == "audio":
                    audio_file.write(chunk["data"])
                elif chunk["type"] == "WordBoundary":
                    submaker.feed(chunk)
        srt_path.write_text(submaker.get_srt(), encoding="utf-8")
# Example — two scenes, each with its own audio/subtitle output path.
# NOTE: one network call per scene; runs sequentially.
scenes = [
    {
        "text": "Scene satu narasi...",
        "audio": Path("output/scene_1/audio_1.mp3"),
        "srt": Path("output/scene_1/subtitle_1.srt"),
    },
    {
        "text": "Scene dua narasi...",
        "audio": Path("output/scene_2/audio_2.mp3"),
        "srt": Path("output/scene_2/subtitle_2.srt"),
    },
]
asyncio.run(batch_tts(scenes))
| Voice Name | Gender | Notes |
|---|---|---|
| id-ID-ArdiNeural | Male | Clear, natural male voice |
| id-ID-GadisNeural | Female | Clear, natural female voice |
| Voice Name | Gender | Notes |
|---|---|---|
| en-US-AriaNeural | Female | Natural, expressive |
| en-US-GuyNeural | Male | Clear, professional |
| en-US-JennyNeural | Female | Friendly, warm |
| en-GB-SoniaNeural | Female | British English |
| en-GB-RyanNeural | Male | British English |
| Parameter | Format | Examples |
|---|---|---|
| rate | ±N% | +20%, -30%, +0% |
| volume | ±N% | +10%, -50%, +0% |
| pitch | ±NHz | +50Hz, -20Hz, +0Hz |
Edge TTS generates word-level SRT timing. Example output:
1