voice-cloning-backend / backend /app /multilingual_tts.py
AJ50's picture
Revert to gTTS for Hindi - reliable, works with Hindi text
e6a9194
"""Multilingual TTS Service - Supports English (WaveRNN) and Hindi (XTTS)."""
import os
import sys
# These variables have to be present in the process environment BEFORE any
# TTS import happens, so that the CPML / terms-of-service prompt is bypassed.
# TTS_HOME is the directory later used as the model directory.
os.environ.update({
    'TTS_HOME': '/tmp/tts_models',
    'TTS_CPML': '1',
    'TTS_SKIP_TOS': '1',
    'TTS_DISABLE_WEB_VERSION_PROMPT': '1',
    'COQUI_TOS_AGREED': '1',
})
# Create a silent TTS manager that handles model initialization without prompts
def _create_silent_tts_manager():
    """Build a Coqui ``ModelManager`` that should not ask interactive questions.

    Returns:
        ``(manager, model_dir)`` on success, where ``model_dir`` is the
        (created if necessary) directory named by ``TTS_HOME``.
        ``(None, None)`` if anything goes wrong, including the TTS package
        not being installed; a warning is printed in that case.
    """
    try:
        # Imported lazily so a missing TTS package is reported as a warning
        # below instead of breaking the import of this module.
        from TTS.utils.manage import ModelManager
        from pathlib import Path

        # Make sure the directory named by TTS_HOME exists.
        cache_dir = Path(os.environ.get('TTS_HOME', '/tmp/tts_models'))
        cache_dir.mkdir(parents=True, exist_ok=True)

        # NOTE(review): confirm that ModelManager really accepts a
        # ``model_name`` keyword; if it does not, the resulting TypeError is
        # swallowed below and this function always returns (None, None).
        tts_manager = ModelManager(
            model_name="tts_models/multilingual/multi-dataset/xtts_v2"
        )
        # Mark the terms of service as agreed on the manager to prevent prompts.
        tts_manager.tos_agreed = True
    except Exception as exc:
        print(f"[WARNING] Could not create silent TTS manager: {exc}")
        return None, None
    else:
        return tts_manager, cache_dir
import gc
import torch
import numpy as np
from pathlib import Path
from typing import Optional, Union
from enum import Enum
class Language(str, Enum):
    """Languages the TTS service knows how to synthesize.

    The ``str`` mix-in makes every member compare equal to its plain string
    value, e.g. ``Language.HINDI == "hindi"``.
    """

    # Voice-cloning pipeline (encoder + synthesizer + vocoder).
    ENGLISH = "english"
    # Google Text-to-Speech (gTTS).
    HINDI = "hindi"
class MultilingualTTSService:
    """
    Unified TTS service supporting multiple languages.

    - English: voice cloning with encoder + Tacotron2 synthesizer + WaveRNN
      vocoder. The three checkpoints are loaded lazily from
      ``models_dir / "default"`` on the first English request.
    - Hindi: Google Text-to-Speech (gTTS). gTTS cannot clone a voice, so the
      reference voice sample is ignored for Hindi.
    """

    # Sample rates (Hz) used when writing synthesized audio to disk.
    ENGLISH_SAMPLE_RATE = 16000
    HINDI_SAMPLE_RATE = 24000
    # Peak amplitude English output is scaled to (0.707 is about -3 dBFS).
    TARGET_PEAK = 0.707

    def __init__(self, models_dir: Path, hindi_model_dir: Optional[Path] = None):
        """
        Initialize multilingual TTS service. No model is loaded here.

        Args:
            models_dir: Directory with English models
                (default/encoder.pt, default/synthesizer.pt, default/vocoder.pt).
            hindi_model_dir: Optional directory for a local Hindi model. It is
                only stored and logged; Hindi synthesis goes through gTTS and
                works whether or not this is set.
        """
        self.models_dir = Path(models_dir)
        self.hindi_model_dir = Path(hindi_model_dir) if hindi_model_dir else None

        # Load-state trackers. Only the synthesizer slot holds a real object;
        # the encoder, vocoder and Hindi slots just hold True once ready.
        self._encoder_model = None
        self._synthesizer_model = None
        self._vocoder_model = None
        # Historical name: this flags that gTTS is importable.
        self._xtts_model = None
        self.sr = self.ENGLISH_SAMPLE_RATE

        print("[MultilingualTTSService] Initialized")
        print(f"[MultilingualTTSService] English models dir: {self.models_dir}")
        if self.hindi_model_dir:
            print(f"[MultilingualTTSService] Hindi XTTS dir: {self.hindi_model_dir}")
        else:
            # Hindi is NOT disabled without a model dir: gTTS needs no local model.
            print("[MultilingualTTSService] Hindi model dir: not set (Hindi uses gTTS)")

    def _require_english_model(self, component: str) -> Path:
        """Return ``models_dir/default/<component>.pt``.

        Raises:
            RuntimeError: if that checkpoint file does not exist.
        """
        path = self.models_dir / "default" / f"{component}.pt"
        if not path.exists():
            raise RuntimeError(f"English {component} model missing: {path}")
        return path

    def _load_english_models(self):
        """Load English voice cloning models (lazy load).

        Each component is loaded at most once; calling this again is cheap.

        Raises:
            RuntimeError: if a required checkpoint file is missing.
        """
        if self._encoder_model is None:
            print("[MultilingualTTSService] Loading English encoder...")
            from encoder import inference as encoder_infer
            encoder_infer.load_model(self._require_english_model("encoder"))
            self._encoder_model = True
            print("[MultilingualTTSService] ✓ English encoder loaded")

        if self._synthesizer_model is None:
            print("[MultilingualTTSService] Loading English synthesizer...")
            from synthesizer import inference as synthesizer_infer
            self._synthesizer_model = synthesizer_infer.Synthesizer(
                self._require_english_model("synthesizer")
            )
            print("[MultilingualTTSService] ✓ English synthesizer loaded")

        if self._vocoder_model is None:
            print("[MultilingualTTSService] Loading English vocoder...")
            from app.vocoder import inference as vocoder_infer
            vocoder_infer.load_model(self._require_english_model("vocoder"))
            self._vocoder_model = True
            print("[MultilingualTTSService] ✓ English vocoder loaded")

    def _load_hindi_models(self):
        """Prepare Hindi support - using Google Text-to-Speech (gTTS).

        gTTS has no local model to load; this only verifies that the library
        can be imported and records that fact.

        Raises:
            ImportError: if the gtts package is not installed.
            RuntimeError: if importing gtts fails for any other reason.
        """
        if self._xtts_model is None:
            print("[MultilingualTTSService] Loading Hindi support (gTTS)...")
            try:
                from gtts import gTTS  # noqa: F401 -- availability probe only
                print("[MultilingualTTSService] ✓ Hindi gTTS support loaded")
                print("[MultilingualTTSService] Engine: Google Text-to-Speech (gTTS)")
                print("[MultilingualTTSService] Language: Hindi (hin)")
                print("[MultilingualTTSService] TOS: No (Google Cloud)")
                # Mark as loaded (gTTS doesn't require actual model loading)
                self._xtts_model = True
            except ImportError as e:
                raise ImportError(
                    "gTTS library required for Hindi support. "
                    "Install with: pip install gtts"
                ) from e
            except Exception as e:
                print(f"[MultilingualTTSService] Error loading Hindi support: {e}")
                raise RuntimeError(f"Failed to load Hindi support: {e}") from e

    def synthesize(self, text: str, voice_sample_path: Union[str, Path],
                   language: str = "english") -> np.ndarray:
        """
        Synthesize speech in specified language.

        Args:
            text: Text to synthesize
            voice_sample_path: Path to reference voice sample (used for
                English only; ignored for Hindi)
            language: "english" or "hindi" (case-insensitive, surrounding
                whitespace ignored)

        Returns:
            Audio waveform as a float32 numpy array in [-1, 1]

        Raises:
            ValueError: if the language is not supported.
        """
        language = language.strip().lower()
        if language == Language.ENGLISH:
            return self._synthesize_english(text, voice_sample_path)
        elif language == Language.HINDI:
            return self._synthesize_hindi(text, voice_sample_path)
        else:
            raise ValueError(f"Unsupported language: {language}")

    @staticmethod
    def _peak_normalize(waveform: np.ndarray, target_level: float = 0.707) -> np.ndarray:
        """Scale ``waveform`` so its peak is ``target_level``, then clip to [-1, 1].

        Empty and all-zero waveforms are returned unscaled instead of raising
        or dividing by zero.
        """
        if waveform.size:
            max_val = np.max(np.abs(waveform))
            if max_val > 0:
                waveform = waveform * (target_level / max_val)
        return np.clip(waveform, -1.0, 1.0)

    @staticmethod
    def _pcm_to_float32(samples, channels: int, sample_width: int) -> np.ndarray:
        """Convert interleaved signed integer PCM to mono float32 in [-1, 1].

        Args:
            samples: Interleaved PCM sample values (any sequence/array).
            channels: Number of interleaved channels; more than one channel is
                downmixed to mono by averaging.
            sample_width: Bytes per sample (1, 2, 4, ...). The divisor is the
                full-scale magnitude for that width, ``2 ** (8 * width - 1)``.
        """
        pcm = np.array(samples, dtype=np.float32)
        if channels > 1:
            pcm = pcm.reshape((-1, channels)).mean(axis=1)
        full_scale = float(1 << (8 * sample_width - 1))
        return np.clip(pcm / full_scale, -1.0, 1.0)

    def _synthesize_english(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
        """Synthesize English speech using WaveRNN + Tacotron2.

        The reference sample is embedded with the encoder, the synthesizer
        turns text + embedding into a mel spectrogram, and the vocoder turns
        the mel into a waveform. If the vocoder raises, Griffin-Lim is used
        instead. The result is peak-normalized to ``TARGET_PEAK``.
        """
        from encoder import inference as encoder_infer
        from app.vocoder import inference as vocoder_infer

        self._load_english_models()
        print(f"[MultilingualTTSService] Synthesizing English: {text[:50]}...")

        # Embed voice
        wav = encoder_infer.preprocess_wav(voice_sample_path)
        embed = encoder_infer.embed_utterance(wav)

        # Generate mel
        mels = self._synthesizer_model.synthesize_spectrograms([text], [embed])
        mel = mels[0]

        # Vocalize
        try:
            synthesized = vocoder_infer.infer_waveform(
                mel, normalize=True, batched=False, target=8000, overlap=800
            ).astype(np.float32)
        except Exception as e:
            # Deliberate best-effort: a lower-quality waveform beats no output.
            print(f"[MultilingualTTSService] Vocoder failed: {e}, using Griffin-Lim fallback")
            synthesized = self._synthesizer_model.griffin_lim(mel).astype(np.float32)

        # Normalize
        return self._peak_normalize(synthesized, self.TARGET_PEAK)

    def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
        """Synthesize Hindi speech using Google Text-to-Speech (gTTS).

        ``voice_sample_path`` is accepted for interface symmetry but unused:
        gTTS produces a fixed voice. The decoded audio is brought to
        ``HINDI_SAMPLE_RATE`` so the rate used when saving is always correct.

        Raises:
            RuntimeError: wrapping any failure during synthesis or decoding.
        """
        self._load_hindi_models()
        print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
        try:
            import io
            from gtts import gTTS
            from pydub import AudioSegment

            # Generate speech using Google TTS into an in-memory MP3
            tts = gTTS(text=text, lang='hi', slow=False)
            with io.BytesIO() as buffer:
                tts.write_to_fp(buffer)
                buffer.seek(0)
                audio_segment = AudioSegment.from_mp3(buffer)

            # Do not assume the MP3's rate; resample if it is not what we save at.
            if audio_segment.frame_rate != self.HINDI_SAMPLE_RATE:
                audio_segment = audio_segment.set_frame_rate(self.HINDI_SAMPLE_RATE)

            # Convert to mono float32 in [-1, 1]
            return self._pcm_to_float32(
                audio_segment.get_array_of_samples(),
                audio_segment.channels,
                audio_segment.sample_width,
            )
        except Exception as e:
            print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
            raise RuntimeError(f"Hindi synthesis failed: {e}") from e

    def synthesize_and_save(self, text: str, voice_sample_path: Union[str, Path],
                            output_path: Union[str, Path], language: str = "english") -> Path:
        """
        Synthesize and save to file.

        Args:
            text: Text to synthesize
            voice_sample_path: Path to reference voice
            output_path: Where to save audio (parent directories are created)
            language: "english" or "hindi"

        Returns:
            Path to output file

        Raises:
            Whatever ``synthesize`` or the file write raises; the error is
            logged and re-raised unchanged.
        """
        import soundfile as sf

        output_path = Path(output_path)
        try:
            audio = self.synthesize(text, voice_sample_path, language)
            # Determine sample rate based on language
            if language.strip().lower() == Language.HINDI:
                sr = self.HINDI_SAMPLE_RATE
            else:
                sr = self.ENGLISH_SAMPLE_RATE
            output_path.parent.mkdir(parents=True, exist_ok=True)
            sf.write(str(output_path), audio, sr)
            print(f"[MultilingualTTSService] Audio saved: {output_path}")
            return output_path
        except Exception as e:
            print(f"[MultilingualTTSService] Error during synthesis: {e}")
            raise

    def cleanup(self):
        """Release model memory.

        Drops this object's references and load flags, runs the garbage
        collector and empties the CUDA cache when CUDA is available.
        NOTE(review): the encoder and vocoder are loaded through their
        inference modules and only tracked here as flags, so this presumably
        does not unload their weights - confirm against those modules.
        """
        print("[MultilingualTTSService] Cleaning up models...")
        try:
            self._encoder_model = None
            self._synthesizer_model = None
            self._vocoder_model = None
            self._xtts_model = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            # Cleanup is best-effort; never let it take the caller down.
            print(f"[MultilingualTTSService] Cleanup warning: {e}")