AJ50 committed on
Commit
e6a9194
·
1 Parent(s): 2e1a616

Revert to gTTS for Hindi - reliable, works with Hindi text

Browse files
backend/app/multilingual_tts.py CHANGED
@@ -110,41 +110,26 @@ class MultilingualTTSService:
110
  print("[MultilingualTTSService] ✓ English vocoder loaded")
111
 
112
  def _load_hindi_models(self):
113
- """Load Hindi Silero TTS model - natural neural voice."""
114
  if self._xtts_model is None:
115
- print("[MultilingualTTSService] Loading Hindi Silero TTS model...")
116
  try:
117
- import torch
118
-
119
- # Load Silero TTS v3_en_indic model for Indic languages (includes Hindi)
120
- # Returns (model, example_text) tuple
121
- result = torch.hub.load(
122
- repo_or_dir='snakers4/silero-models',
123
- model='silero_tts',
124
- language='en',
125
- speaker='v3_en_indic',
126
- trust_repo=True
127
- )
128
-
129
- if isinstance(result, tuple):
130
- self._xtts_model, _ = result
131
- else:
132
- self._xtts_model = result
133
-
134
- print("[MultilingualTTSService] ✓ Hindi Silero TTS loaded successfully")
135
- print("[MultilingualTTSService] Engine: Silero TTS (Neural v3_en_indic)")
136
- print("[MultilingualTTSService] Language: Hindi (hindi_female speaker)")
137
- print("[MultilingualTTSService] Voice: Natural female voice")
138
- print("[MultilingualTTSService] TOS: No (Open source)")
139
 
140
- except ImportError as e:
141
  raise ImportError(
142
- "Torch required for Silero TTS. "
143
- "Install with: pip install torch"
144
  )
145
  except Exception as e:
146
- print(f"[MultilingualTTSService] Error loading Silero TTS: {e}")
147
- raise RuntimeError(f"Failed to load Hindi Silero model: {e}")
148
 
149
  def synthesize(self, text: str, voice_sample_path: Union[str, Path],
150
  language: str = "english") -> np.ndarray:
@@ -203,30 +188,41 @@ class MultilingualTTSService:
203
  return np.clip(synthesized, -1.0, 1.0)
204
 
205
  def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
206
- """Synthesize Hindi speech using Silero TTS neural model."""
207
  self._load_hindi_models()
208
 
209
  print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
210
 
211
  try:
212
- # Silero TTS returns Tensor directly
213
- audio = self._xtts_model.apply_tts(
214
- text=text,
215
- speaker='hindi_female'
216
- )
 
 
 
 
 
 
 
 
 
217
 
218
- # Convert Tensor to numpy
219
- if isinstance(audio, torch.Tensor):
220
- audio = audio.numpy()
221
 
222
- audio = np.asarray(audio, dtype=np.float32)
 
 
 
223
 
224
- # Normalize to [-1, 1] range (audio is in [-1, 1] from Silero already)
225
- max_val = np.max(np.abs(audio))
226
- if max_val > 1.0:
227
- audio = audio / max_val
228
 
229
- return np.clip(audio, -1.0, 1.0)
230
 
231
  except Exception as e:
232
  print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
 
110
  print("[MultilingualTTSService] ✓ English vocoder loaded")
111
 
112
  def _load_hindi_models(self):
113
+ """Load Hindi models - using Google Text-to-Speech (gTTS)."""
114
  if self._xtts_model is None:
115
+ print("[MultilingualTTSService] Loading Hindi support (gTTS)...")
116
  try:
117
+ from gtts import gTTS
118
+ print("[MultilingualTTSService] ✓ Hindi gTTS support loaded")
119
+ print("[MultilingualTTSService] Engine: Google Text-to-Speech (gTTS)")
120
+ print("[MultilingualTTSService] Language: Hindi (hin)")
121
+ print("[MultilingualTTSService] TOS: No (Google Cloud)")
122
+ # Mark as loaded (gTTS doesn't require actual model loading)
123
+ self._xtts_model = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
+ except ImportError:
126
  raise ImportError(
127
+ "gTTS library required for Hindi support. "
128
+ "Install with: pip install gtts"
129
  )
130
  except Exception as e:
131
+ print(f"[MultilingualTTSService] Error loading Hindi support: {e}")
132
+ raise RuntimeError(f"Failed to load Hindi support: {e}")
133
 
134
  def synthesize(self, text: str, voice_sample_path: Union[str, Path],
135
  language: str = "english") -> np.ndarray:
 
188
  return np.clip(synthesized, -1.0, 1.0)
189
 
190
  def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
191
+ """Synthesize Hindi speech using Google Text-to-Speech (gTTS)."""
192
  self._load_hindi_models()
193
 
194
  print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")
195
 
196
  try:
197
+ from gtts import gTTS
198
+ import io
199
+ from pydub import AudioSegment
200
+
201
+ # Generate speech using Google TTS
202
+ tts = gTTS(text=text, lang='hi', slow=False)
203
+
204
+ # Save to BytesIO buffer
205
+ buffer = io.BytesIO()
206
+ tts.write_to_fp(buffer)
207
+ buffer.seek(0)
208
+
209
+ # Load audio from buffer
210
+ audio_segment = AudioSegment.from_mp3(buffer)
211
 
212
+ # Convert to numpy array (mono, float32)
213
+ samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
 
214
 
215
+ # Handle stereo to mono conversion
216
+ if audio_segment.channels == 2:
217
+ # Convert stereo to mono by averaging channels
218
+ samples = samples.reshape((-1, 2)).mean(axis=1)
219
 
220
+ # Normalize to [-1, 1] range
221
+ max_val = np.max(np.abs(samples))
222
+ if max_val > 0:
223
+ samples = samples / (32767.0 if audio_segment.sample_width == 2 else 128.0)
224
 
225
+ return np.clip(samples, -1.0, 1.0)
226
 
227
  except Exception as e:
228
  print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
backend/requirements.txt CHANGED
@@ -16,3 +16,4 @@ unidecode>=1.3.2
16
  webrtcvad==2.0.10
17
  demucs==4.0.1
18
  omegaconf==2.3.0
 
 
16
  webrtcvad==2.0.10
17
  demucs==4.0.1
18
  omegaconf==2.3.0
19
+ gtts==2.4.0