AJ50 committed on
Commit
2e1a616
·
1 Parent(s): e1c7f06

Implement Silero TTS for Hindi - natural neural voice (v3_en_indic + hindi_female)

Browse files
backend/app/multilingual_tts.py CHANGED
@@ -110,26 +110,41 @@ class MultilingualTTSService:
110
  print("[MultilingualTTSService] ✓ English vocoder loaded")
111
 
112
  def _load_hindi_models(self):
113
- """Load Hindi models - using Google Text-to-Speech (gTTS)."""
114
  if self._xtts_model is None:
115
- print("[MultilingualTTSService] Loading Hindi support (gTTS)...")
116
  try:
117
- from gtts import gTTS
118
- print("[MultilingualTTSService] ✓ Hindi gTTS support loaded")
119
- print("[MultilingualTTSService] Engine: Google Text-to-Speech (gTTS)")
120
- print("[MultilingualTTSService] Language: Hindi (hin)")
121
- print("[MultilingualTTSService] TOS: No (Google Cloud)")
122
- # Mark as loaded (gTTS doesn't require actual model loading)
123
- self._xtts_model = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
- except ImportError:
126
  raise ImportError(
127
- "gTTS library required for Hindi support. "
128
- "Install with: pip install gtts"
129
  )
130
  except Exception as e:
131
- print(f"[MultilingualTTSService] Error loading Hindi support: {e}")
132
- raise RuntimeError(f"Failed to load Hindi support: {e}")
133
 
134
  def synthesize(self, text: str, voice_sample_path: Union[str, Path],
135
  language: str = "english") -> np.ndarray:
@@ -188,41 +203,30 @@ class MultilingualTTSService:
188
  return np.clip(synthesized, -1.0, 1.0)
189
 
190
  def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
191
- """Synthesize Hindi speech using Google Text-to-Speech (gTTS)."""
192
  self._load_hindi_models()
193
 
194
  print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
195
 
196
  try:
197
- from gtts import gTTS
198
- import io
199
- from pydub import AudioSegment
200
-
201
- # Generate speech using Google TTS
202
- tts = gTTS(text=text, lang='hi', slow=False)
203
-
204
- # Save to BytesIO buffer
205
- buffer = io.BytesIO()
206
- tts.write_to_fp(buffer)
207
- buffer.seek(0)
208
-
209
- # Load audio from buffer
210
- audio_segment = AudioSegment.from_mp3(buffer)
211
 
212
- # Convert to numpy array (mono, float32)
213
- samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
 
214
 
215
- # Handle stereo to mono conversion
216
- if audio_segment.channels == 2:
217
- # Convert stereo to mono by averaging channels
218
- samples = samples.reshape((-1, 2)).mean(axis=1)
219
 
220
- # Normalize to [-1, 1] range
221
- max_val = np.max(np.abs(samples))
222
- if max_val > 0:
223
- samples = samples / (32767.0 if audio_segment.sample_width == 2 else 128.0)
224
 
225
- return np.clip(samples, -1.0, 1.0)
226
 
227
  except Exception as e:
228
  print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
 
110
  print("[MultilingualTTSService] ✓ English vocoder loaded")
111
 
112
  def _load_hindi_models(self):
113
+ """Load Hindi Silero TTS model - natural neural voice."""
114
  if self._xtts_model is None:
115
+ print("[MultilingualTTSService] Loading Hindi Silero TTS model...")
116
  try:
117
+ import torch
118
+
119
+ # Load Silero TTS v3_en_indic model for Indic languages (includes Hindi)
120
+ # Returns (model, example_text) tuple
121
+ result = torch.hub.load(
122
+ repo_or_dir='snakers4/silero-models',
123
+ model='silero_tts',
124
+ language='en',
125
+ speaker='v3_en_indic',
126
+ trust_repo=True
127
+ )
128
+
129
+ if isinstance(result, tuple):
130
+ self._xtts_model, _ = result
131
+ else:
132
+ self._xtts_model = result
133
+
134
+ print("[MultilingualTTSService] ✓ Hindi Silero TTS loaded successfully")
135
+ print("[MultilingualTTSService] Engine: Silero TTS (Neural v3_en_indic)")
136
+ print("[MultilingualTTSService] Language: Hindi (hindi_female speaker)")
137
+ print("[MultilingualTTSService] Voice: Natural female voice")
138
+ print("[MultilingualTTSService] TOS: No (Open source)")
139
 
140
+ except ImportError as e:
141
  raise ImportError(
142
+ "Torch required for Silero TTS. "
143
+ "Install with: pip install torch"
144
  )
145
  except Exception as e:
146
+ print(f"[MultilingualTTSService] Error loading Silero TTS: {e}")
147
+ raise RuntimeError(f"Failed to load Hindi Silero model: {e}")
148
 
149
  def synthesize(self, text: str, voice_sample_path: Union[str, Path],
150
  language: str = "english") -> np.ndarray:
 
203
  return np.clip(synthesized, -1.0, 1.0)
204
 
205
  def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
206
+ """Synthesize Hindi speech using Silero TTS neural model."""
207
  self._load_hindi_models()
208
 
209
  print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
210
 
211
  try:
212
+ # Silero TTS returns Tensor directly
213
+ audio = self._xtts_model.apply_tts(
214
+ text=text,
215
+ speaker='hindi_female'
216
+ )
 
 
 
 
 
 
 
 
 
217
 
218
+ # Convert Tensor to numpy
219
+ if isinstance(audio, torch.Tensor):
220
+ audio = audio.numpy()
221
 
222
+ audio = np.asarray(audio, dtype=np.float32)
 
 
 
223
 
224
+ # Normalize to [-1, 1] range (audio is in [-1, 1] from Silero already)
225
+ max_val = np.max(np.abs(audio))
226
+ if max_val > 1.0:
227
+ audio = audio / max_val
228
 
229
+ return np.clip(audio, -1.0, 1.0)
230
 
231
  except Exception as e:
232
  print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
backend/requirements.txt CHANGED
@@ -15,4 +15,4 @@ inflect==7.0.0
15
  unidecode>=1.3.2
16
  webrtcvad==2.0.10
17
  demucs==4.0.1
18
- gtts==2.4.0
 
15
  unidecode>=1.3.2
16
  webrtcvad==2.0.10
17
  demucs==4.0.1
18
+ omegaconf==2.3.0