"""Multilingual TTS Service - Supports English (WaveRNN) and Hindi (gTTS).

English synthesis uses the project's encoder / synthesizer / vocoder modules.
Hindi synthesis uses Google Text-to-Speech (gTTS) and decodes the MP3 it
returns with pydub.
"""

import gc
import os
import sys
from enum import Enum
from pathlib import Path
from typing import Optional, Union

import numpy as np
import torch

# Set environment variables BEFORE any TTS imports to bypass CPML prompt.
# Every Coqui TTS import in this module is deferred to call time, so these
# assignments always run first.
os.environ['TTS_HOME'] = '/tmp/tts_models'
os.environ['TTS_CPML'] = '1'
os.environ['TTS_SKIP_TOS'] = '1'
os.environ['TTS_DISABLE_WEB_VERSION_PROMPT'] = '1'
os.environ['COQUI_TOS_AGREED'] = '1'


# Create a silent TTS manager that handles model initialization without prompts
def _create_silent_tts_manager():
    """Create a TTS manager configured to skip all interactive prompts.

    Returns:
        ``(manager, model_dir)`` on success, or ``(None, None)`` when the
        manager could not be created for any reason (a warning is printed).
    """
    try:
        from TTS.utils.manage import ModelManager

        # Set model manager to use our TTS_HOME directory
        model_dir = Path(os.environ.get('TTS_HOME', '/tmp/tts_models'))
        model_dir.mkdir(parents=True, exist_ok=True)

        # NOTE(review): confirm that the installed ModelManager accepts a
        # `model_name` keyword. If it does not, the resulting TypeError is
        # swallowed by the handler below and (None, None) is returned.
        manager = ModelManager(model_name="tts_models/multilingual/multi-dataset/xtts_v2")

        # Mark TOS as agreed in the manager to prevent prompts
        # (presumably read by the manager before downloading; verify).
        manager.tos_agreed = True

        return manager, model_dir
    except Exception as e:
        # Best effort by design: callers get (None, None) instead of a crash.
        print(f"[WARNING] Could not create silent TTS manager: {e}")
        return None, None


class Language(str, Enum):
    """Supported languages."""

    ENGLISH = "english"
    HINDI = "hindi"


class MultilingualTTSService:
    """
    Unified TTS service supporting multiple languages.

    - English: Uses existing WaveRNN vocoder + Tacotron2 synthesizer + encoder
    - Hindi: Uses Google Text-to-Speech (gTTS); the reference voice sample is
      not used for Hindi.
    """

    # Sample rates (Hz) used when writing synthesized audio to disk.
    ENGLISH_SAMPLE_RATE = 16000
    HINDI_SAMPLE_RATE = 24000

    def __init__(self, models_dir: Path, hindi_model_dir: Optional[Path] = None):
        """
        Initialize multilingual TTS service.

        No models are loaded here; loading is deferred to the first
        synthesis call for each language.

        Args:
            models_dir: Directory with English models; the loaders look for
                default/encoder.pt, default/synthesizer.pt and
                default/vocoder.pt beneath it.
            hindi_model_dir: Optional Hindi model directory. It is stored and
                logged, but the gTTS-based Hindi path in this class does not
                read it.
        """
        self.models_dir = Path(models_dir)
        self.hindi_model_dir = Path(hindi_model_dir) if hindi_model_dir else None

        # Track loaded models
        self._encoder_model = None
        self._synthesizer_model = None
        self._vocoder_model = None
        self._xtts_model = None  # flag: Hindi (gTTS) support verified

        self.sr = self.ENGLISH_SAMPLE_RATE

        print("[MultilingualTTSService] Initialized")
        print(f"[MultilingualTTSService] English models dir: {self.models_dir}")
        if self.hindi_model_dir:
            print(f"[MultilingualTTSService] Hindi XTTS dir: {self.hindi_model_dir}")
        else:
            print("[MultilingualTTSService] Hindi support: DISABLED (no model path)")

    @staticmethod
    def _resolve_language(language: str) -> Language:
        """Map a language name to a ``Language`` member.

        Matching ignores case and surrounding whitespace.

        Raises:
            ValueError: If the name is not a supported language.
        """
        normalized = language.strip().lower()
        try:
            return Language(normalized)
        except ValueError:
            raise ValueError(f"Unsupported language: {normalized}") from None

    @staticmethod
    def _pcm_to_float_mono(samples, channels: int, sample_width: int) -> np.ndarray:
        """Convert interleaved signed PCM samples to mono float32 in [-1, 1].

        Args:
            samples: 1-D sequence of interleaved PCM sample values.
            channels: Number of interleaved channels in ``samples``.
            sample_width: Bytes per sample (1, 2, 3 or 4).

        Returns:
            1-D float32 array with one value per frame.
        """
        pcm = np.asarray(samples, dtype=np.float32)

        # Downmix any multi-channel layout by averaging the channels of each frame.
        if channels > 1:
            pcm = pcm.reshape((-1, channels)).mean(axis=1)

        # Full-scale magnitude of a signed integer of this width, e.g. 32768
        # for 16-bit. Derived from the width so 24/32-bit audio scales too.
        full_scale = float(1 << (8 * sample_width - 1))
        return np.clip(pcm / full_scale, -1.0, 1.0).astype(np.float32)

    def _load_english_models(self):
        """Load English voice cloning models (lazy load).

        Each model is loaded at most once; later calls are no-ops for models
        already marked as loaded.

        Raises:
            RuntimeError: If an expected model file does not exist.
        """
        if self._encoder_model is None:
            print("[MultilingualTTSService] Loading English encoder...")
            from encoder import inference as encoder_infer

            enc_path = self.models_dir / "default" / "encoder.pt"
            if not enc_path.exists():
                raise RuntimeError(f"English encoder model missing: {enc_path}")

            encoder_infer.load_model(enc_path)
            # The encoder module keeps the model itself; only a flag is stored here.
            self._encoder_model = True
            print("[MultilingualTTSService] ✓ English encoder loaded")

        if self._synthesizer_model is None:
            print("[MultilingualTTSService] Loading English synthesizer...")
            from synthesizer import inference as synthesizer_infer

            syn_path = self.models_dir / "default" / "synthesizer.pt"
            if not syn_path.exists():
                raise RuntimeError(f"English synthesizer model missing: {syn_path}")

            self._synthesizer_model = synthesizer_infer.Synthesizer(syn_path)
            print("[MultilingualTTSService] ✓ English synthesizer loaded")

        if self._vocoder_model is None:
            print("[MultilingualTTSService] Loading English vocoder...")
            from app.vocoder import inference as vocoder_infer

            voc_path = self.models_dir / "default" / "vocoder.pt"
            if not voc_path.exists():
                raise RuntimeError(f"English vocoder model missing: {voc_path}")

            vocoder_infer.load_model(voc_path)
            # As with the encoder, only a "loaded" flag is kept on the instance.
            self._vocoder_model = True
            print("[MultilingualTTSService] ✓ English vocoder loaded")

    def _load_hindi_models(self):
        """Load Hindi models - using Google Text-to-Speech (gTTS).

        Only verifies that the gTTS package can be imported and records that
        in ``self._xtts_model``.

        Raises:
            ImportError: If gTTS is not installed.
            RuntimeError: If importing gTTS fails for any other reason.
        """
        if self._xtts_model is not None:
            return

        print("[MultilingualTTSService] Loading Hindi support (gTTS)...")
        try:
            from gtts import gTTS  # noqa: F401  (availability check only)

            print("[MultilingualTTSService] ✓ Hindi gTTS support loaded")
            print("[MultilingualTTSService] Engine: Google Text-to-Speech (gTTS)")
            print("[MultilingualTTSService] Language: Hindi (hin)")
            print("[MultilingualTTSService] TOS: No (Google Cloud)")

            # Mark as loaded (gTTS doesn't require actual model loading)
            self._xtts_model = True
        except ImportError as e:
            raise ImportError(
                "gTTS library required for Hindi support. "
                "Install with: pip install gtts"
            ) from e
        except Exception as e:
            print(f"[MultilingualTTSService] Error loading Hindi support: {e}")
            raise RuntimeError(f"Failed to load Hindi support: {e}") from e

    def synthesize(self, text: str, voice_sample_path: Union[str, Path],
                   language: str = "english") -> np.ndarray:
        """
        Synthesize speech in specified language.

        Args:
            text: Text to synthesize
            voice_sample_path: Path to reference voice sample (used by the
                English path only)
            language: "english" or "hindi" (case-insensitive)

        Returns:
            Audio waveform as numpy array

        Raises:
            ValueError: If ``language`` is not supported.
        """
        resolved = self._resolve_language(language)
        if resolved is Language.ENGLISH:
            return self._synthesize_english(text, voice_sample_path)
        return self._synthesize_hindi(text, voice_sample_path)

    def _synthesize_english(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
        """Synthesize English speech using WaveRNN + Tacotron2.

        Embeds the reference voice, generates a mel spectrogram for ``text``,
        then converts it to a waveform. If the vocoder raises, the
        synthesizer's Griffin-Lim routine is used instead.

        Returns:
            float32 waveform peak-normalized to 0.707 and clipped to [-1, 1].
        """
        from encoder import inference as encoder_infer
        from app.vocoder import inference as vocoder_infer

        self._load_english_models()

        print(f"[MultilingualTTSService] Synthesizing English: {text[:50]}...")

        # Embed voice
        wav = encoder_infer.preprocess_wav(voice_sample_path)
        embed = encoder_infer.embed_utterance(wav)

        # Generate mel
        mels = self._synthesizer_model.synthesize_spectrograms([text], [embed])
        mel = mels[0]

        # Vocalize
        try:
            synthesized = vocoder_infer.infer_waveform(
                mel, normalize=True, batched=False, target=8000, overlap=800
            ).astype(np.float32)
        except Exception as e:
            # Deliberate best effort: degrade to Griffin-Lim rather than fail.
            print(f"[MultilingualTTSService] Vocoder failed: {e}, using Griffin-Lim fallback")
            synthesized = self._synthesizer_model.griffin_lim(mel).astype(np.float32)

        # Normalize the peak to target_level; silent output is left untouched
        # to avoid dividing by zero.
        max_val = np.max(np.abs(synthesized))
        if max_val > 0:
            target_level = 0.707
            synthesized = synthesized * (target_level / max_val)

        return np.clip(synthesized, -1.0, 1.0)

    def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
        """Synthesize Hindi speech using Google Text-to-Speech (gTTS).

        ``voice_sample_path`` is accepted for signature parity with the
        English path but is not used.

        Returns:
            Mono float32 waveform in [-1, 1] at ``HINDI_SAMPLE_RATE`` Hz.

        Raises:
            RuntimeError: If generation or decoding fails.
        """
        self._load_hindi_models()

        print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")

        try:
            import io

            from gtts import gTTS
            from pydub import AudioSegment

            # Generate speech using Google TTS
            tts = gTTS(text=text, lang='hi', slow=False)

            # Save to BytesIO buffer
            buffer = io.BytesIO()
            tts.write_to_fp(buffer)
            buffer.seek(0)

            # Load audio from buffer
            audio_segment = AudioSegment.from_mp3(buffer)

            # synthesize_and_save writes Hindi audio at HINDI_SAMPLE_RATE, so
            # resample here if the decoded MP3 uses a different rate.
            if audio_segment.frame_rate != self.HINDI_SAMPLE_RATE:
                audio_segment = audio_segment.set_frame_rate(self.HINDI_SAMPLE_RATE)

            # Convert to numpy array (mono, float32)
            samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
            return self._pcm_to_float_mono(
                samples, audio_segment.channels, audio_segment.sample_width
            )
        except Exception as e:
            print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
            raise RuntimeError(f"Hindi synthesis failed: {e}") from e

    def synthesize_and_save(self, text: str, voice_sample_path: Union[str, Path],
                            output_path: Union[str, Path], language: str = "english") -> Path:
        """
        Synthesize and save to file.

        Args:
            text: Text to synthesize
            voice_sample_path: Path to reference voice
            output_path: Where to save audio; missing parent directories are
                created
            language: "english" or "hindi"

        Returns:
            Path to output file

        Raises:
            Exception: Any error from synthesis or writing is logged and
                re-raised unchanged.
        """
        import soundfile as sf

        output_path = Path(output_path)

        try:
            resolved = self._resolve_language(language)
            audio = self.synthesize(text, voice_sample_path, language)

            # Determine sample rate based on language
            if resolved is Language.HINDI:
                sr = self.HINDI_SAMPLE_RATE
            else:
                sr = self.ENGLISH_SAMPLE_RATE

            output_path.parent.mkdir(parents=True, exist_ok=True)
            sf.write(output_path, audio, sr)
            print(f"[MultilingualTTSService] Audio saved: {output_path}")
            return output_path
        except Exception as e:
            print(f"[MultilingualTTSService] Error during synthesis: {e}")
            raise

    def cleanup(self):
        """Release model memory.

        Clears the model references and flags held on this instance, runs the
        garbage collector, and empties the CUDA cache when CUDA is available.
        Failures are reported as a warning rather than raised.
        """
        print("[MultilingualTTSService] Cleaning up models...")
        try:
            self._encoder_model = None
            self._synthesizer_model = None
            self._vocoder_model = None
            self._xtts_model = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            print(f"[MultilingualTTSService] Cleanup warning: {e}")