# NOTE: removed non-source page artifacts (Hugging Face Spaces header, file
# size, git blame hashes, and a line-number gutter) that were captured when
# this file was scraped from a web viewer. They were not valid Python.
"""Multilingual TTS Service - Supports English (WaveRNN) and Hindi (XTTS)."""
import os
import sys
# Set environment variables BEFORE any TTS imports to bypass CPML prompt
# NOTE(review): these flags target Coqui TTS's interactive license/version
# prompts; which ones are honored is version-specific — confirm against the
# installed TTS release. COQUI_TOS_AGREED=1 is the documented one.
os.environ['TTS_HOME'] = '/tmp/tts_models'  # where downloaded models are cached
os.environ['TTS_CPML'] = '1'
os.environ['TTS_SKIP_TOS'] = '1'
os.environ['TTS_DISABLE_WEB_VERSION_PROMPT'] = '1'
os.environ['COQUI_TOS_AGREED'] = '1'  # pre-accept the Coqui CPML license
# Create a silent TTS manager that handles model initialization without prompts
def _create_silent_tts_manager():
"""Create a TTS manager configured to skip all interactive prompts."""
try:
from TTS.utils.manage import ModelManager
from pathlib import Path
# Set model manager to use our TTS_HOME directory
model_dir = Path(os.environ.get('TTS_HOME', '/tmp/tts_models'))
model_dir.mkdir(parents=True, exist_ok=True)
manager = ModelManager(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
# Mark TOS as agreed in the manager to prevent prompts
manager.tos_agreed = True
return manager, model_dir
except Exception as e:
print(f"[WARNING] Could not create silent TTS manager: {e}")
return None, None
import gc
import torch
import numpy as np
from pathlib import Path
from typing import Optional, Union
from enum import Enum
class Language(str, Enum):
    """Languages this service can synthesize (str-valued for easy comparison)."""
    ENGLISH = "english"
    HINDI = "hindi"
class MultilingualTTSService:
    """
    Unified TTS service supporting multiple languages.
    - English: Uses existing WaveRNN vocoder + Tacotron2 synthesizer + encoder
      (voice cloning from a reference sample, output at 16 kHz)
    - Hindi: Uses Google Text-to-Speech (gTTS); no voice cloning, output rate
      taken from the decoded gTTS audio (typically 24 kHz)
    """
    def __init__(self, models_dir: Path, hindi_model_dir: Optional[Path] = None):
        """
        Initialize multilingual TTS service.
        Args:
            models_dir: Directory with English models (encoder.pt, synthesizer.pt, vocoder.pt)
            hindi_model_dir: Directory with Hindi model assets. If None, Hindi support disabled.
        """
        self.models_dir = Path(models_dir)
        self.hindi_model_dir = Path(hindi_model_dir) if hindi_model_dir else None
        # Model handles, lazily loaded on first synthesis. The encoder and
        # vocoder modules hold their state globally, so True is stored as a
        # "loaded" sentinel for them.
        self._encoder_model = None
        self._synthesizer_model = None
        self._vocoder_model = None
        self._xtts_model = None
        # English output sample rate. Hindi output rate is captured from the
        # decoded gTTS audio in _synthesize_hindi; 24 kHz is gTTS's usual rate.
        self.sr = 16000
        self._hindi_sr = 24000
        print("[MultilingualTTSService] Initialized")
        print(f"[MultilingualTTSService] English models dir: {self.models_dir}")
        if self.hindi_model_dir:
            print(f"[MultilingualTTSService] Hindi XTTS dir: {self.hindi_model_dir}")
        else:
            print("[MultilingualTTSService] Hindi support: DISABLED (no model path)")
    def _load_english_models(self):
        """Load English voice cloning models (lazy load, idempotent)."""
        if self._encoder_model is None:
            print("[MultilingualTTSService] Loading English encoder...")
            from encoder import inference as encoder_infer
            enc_path = self.models_dir / "default" / "encoder.pt"
            if not enc_path.exists():
                raise RuntimeError(f"English encoder model missing: {enc_path}")
            encoder_infer.load_model(enc_path)
            self._encoder_model = True  # module-global state; sentinel only
            print("[MultilingualTTSService] β English encoder loaded")
        if self._synthesizer_model is None:
            print("[MultilingualTTSService] Loading English synthesizer...")
            from synthesizer import inference as synthesizer_infer
            syn_path = self.models_dir / "default" / "synthesizer.pt"
            if not syn_path.exists():
                raise RuntimeError(f"English synthesizer model missing: {syn_path}")
            self._synthesizer_model = synthesizer_infer.Synthesizer(syn_path)
            print("[MultilingualTTSService] β English synthesizer loaded")
        if self._vocoder_model is None:
            print("[MultilingualTTSService] Loading English vocoder...")
            from app.vocoder import inference as vocoder_infer
            voc_path = self.models_dir / "default" / "vocoder.pt"
            if not voc_path.exists():
                raise RuntimeError(f"English vocoder model missing: {voc_path}")
            vocoder_infer.load_model(voc_path)
            self._vocoder_model = True  # module-global state; sentinel only
            print("[MultilingualTTSService] β English vocoder loaded")
    def _load_hindi_models(self):
        """Load Hindi models - using Google Text-to-Speech (gTTS).

        gTTS is a thin HTTP client, so there is no real model to load; this
        only verifies the library is importable and marks Hindi as available.
        Raises:
            ImportError: if gTTS is not installed.
            RuntimeError: for any other failure while enabling Hindi support.
        """
        if self._xtts_model is None:
            print("[MultilingualTTSService] Loading Hindi support (gTTS)...")
            try:
                from gtts import gTTS
                print("[MultilingualTTSService] β Hindi gTTS support loaded")
                print("[MultilingualTTSService] Engine: Google Text-to-Speech (gTTS)")
                print("[MultilingualTTSService] Language: Hindi (hin)")
                print("[MultilingualTTSService] TOS: No (Google Cloud)")
                # Mark as loaded (gTTS doesn't require actual model loading)
                self._xtts_model = True
            except ImportError:
                raise ImportError(
                    "gTTS library required for Hindi support. "
                    "Install with: pip install gtts"
                )
            except Exception as e:
                print(f"[MultilingualTTSService] Error loading Hindi support: {e}")
                raise RuntimeError(f"Failed to load Hindi support: {e}")
    def synthesize(self, text: str, voice_sample_path: Union[str, Path],
                   language: str = "english") -> np.ndarray:
        """
        Synthesize speech in specified language.
        Args:
            text: Text to synthesize
            voice_sample_path: Path to reference voice sample (used for English
                voice cloning only; ignored by the Hindi gTTS backend)
            language: "english" or "hindi" (case-insensitive)
        Returns:
            Audio waveform as float32 numpy array in [-1, 1]
        Raises:
            ValueError: if the language is not supported.
        """
        language = language.lower()
        # Language is a str-Enum, so comparison with the plain string works.
        if language == Language.ENGLISH:
            return self._synthesize_english(text, voice_sample_path)
        elif language == Language.HINDI:
            return self._synthesize_hindi(text, voice_sample_path)
        else:
            raise ValueError(f"Unsupported language: {language}")
    def _synthesize_english(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
        """Synthesize English speech using WaveRNN + Tacotron2 with voice cloning."""
        from encoder import inference as encoder_infer
        from app.vocoder import inference as vocoder_infer
        self._load_english_models()
        print(f"[MultilingualTTSService] Synthesizing English: {text[:50]}...")
        # Embed the reference voice sample into a speaker embedding
        wav = encoder_infer.preprocess_wav(voice_sample_path)
        embed = encoder_infer.embed_utterance(wav)
        # Generate the mel spectrogram conditioned on the speaker embedding
        mels = self._synthesizer_model.synthesize_spectrograms([text], [embed])
        mel = mels[0]
        # Vocalize; fall back to Griffin-Lim if the neural vocoder fails
        try:
            synthesized = vocoder_infer.infer_waveform(
                mel, normalize=True, batched=False, target=8000, overlap=800
            ).astype(np.float32)
        except Exception as e:
            print(f"[MultilingualTTSService] Vocoder failed: {e}, using Griffin-Lim fallback")
            synthesized = self._synthesizer_model.griffin_lim(mel).astype(np.float32)
        # Normalize peak to ~-3 dBFS (0.707) to leave headroom, then clip
        max_val = np.max(np.abs(synthesized))
        if max_val > 0:
            target_level = 0.707
            synthesized = synthesized * (target_level / max_val)
        return np.clip(synthesized, -1.0, 1.0)
    def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
        """Synthesize Hindi speech using Google Text-to-Speech (gTTS).

        Note: gTTS cannot clone a voice, so ``voice_sample_path`` is accepted
        for interface symmetry with the English path but is not used.
        """
        self._load_hindi_models()
        print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
        try:
            from gtts import gTTS
            import io
            from pydub import AudioSegment
            # Generate speech using Google TTS (returns MP3 bytes)
            tts = gTTS(text=text, lang='hi', slow=False)
            # Save to BytesIO buffer
            buffer = io.BytesIO()
            tts.write_to_fp(buffer)
            buffer.seek(0)
            # Load audio from buffer
            audio_segment = AudioSegment.from_mp3(buffer)
            # Remember the real output rate so synthesize_and_save writes the
            # file at the correct sample rate (gTTS usually emits 24 kHz).
            self._hindi_sr = audio_segment.frame_rate
            # Convert to numpy array (float32)
            samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
            # Handle stereo to mono conversion by averaging channels
            if audio_segment.channels == 2:
                samples = samples.reshape((-1, 2)).mean(axis=1)
            # Normalize to [-1, 1]: divide by full scale for the actual sample
            # width (2 bytes -> 32768, 1 -> 128, 4 -> 2**31). The previous
            # hard-coded 32767/128 pair mishandled 32-bit audio.
            max_val = np.max(np.abs(samples))
            if max_val > 0:
                full_scale = float(1 << (8 * audio_segment.sample_width - 1))
                samples = samples / full_scale
            return np.clip(samples, -1.0, 1.0)
        except Exception as e:
            print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
            raise RuntimeError(f"Hindi synthesis failed: {e}")
    def synthesize_and_save(self, text: str, voice_sample_path: Union[str, Path],
                            output_path: Union[str, Path], language: str = "english") -> Path:
        """
        Synthesize and save to file.
        Args:
            text: Text to synthesize
            voice_sample_path: Path to reference voice
            output_path: Where to save audio
            language: "english" or "hindi"
        Returns:
            Path to output file
        """
        import soundfile as sf
        output_path = Path(output_path)
        try:
            audio = self.synthesize(text, voice_sample_path, language)
            # Sample rate per backend: Hindi uses the rate captured from the
            # decoded gTTS audio (default 24 kHz); English uses self.sr.
            sr = self._hindi_sr if language.lower() == Language.HINDI else self.sr
            sf.write(output_path, audio, sr)
            print(f"[MultilingualTTSService] Audio saved: {output_path}")
            return output_path
        except Exception as e:
            print(f"[MultilingualTTSService] Error during synthesis: {e}")
            raise
    def cleanup(self):
        """Release model memory (drops references, GCs, and empties CUDA cache)."""
        print("[MultilingualTTSService] Cleaning up models...")
        try:
            self._encoder_model = None
            self._synthesizer_model = None
            self._vocoder_model = None
            self._xtts_model = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            print(f"[MultilingualTTSService] Cleanup warning: {e}")