Spaces:

AJ50
/

voice-cloning-backend

Sleeping

App Files Files Community

voice-cloning-backend / backend /app /multilingual_tts.py

AJ50

Revert to gTTS for Hindi - reliable, works with Hindi text

e6a9194 18 days ago

raw

history blame contribute delete

11 kB

	"""Multilingual TTS Service - Supports English (WaveRNN voice cloning) and Hindi (gTTS)."""

	import os
	import sys

	# These variables must be in the environment BEFORE any TTS import happens,
	# so that the CPML / terms-of-service prompt is bypassed and downloaded
	# models land under TTS_HOME.
	os.environ.update({
	    'TTS_HOME': '/tmp/tts_models',
	    'TTS_CPML': '1',
	    'TTS_SKIP_TOS': '1',
	    'TTS_DISABLE_WEB_VERSION_PROMPT': '1',
	    'COQUI_TOS_AGREED': '1',
	})

	# Create a silent TTS manager that handles model initialization without prompts
	def _create_silent_tts_manager():
	"""Create a TTS manager configured to skip all interactive prompts."""
	try:
	from TTS.utils.manage import ModelManager
	from pathlib import Path

	# Set model manager to use our TTS_HOME directory
	model_dir = Path(os.environ.get('TTS_HOME', '/tmp/tts_models'))
	model_dir.mkdir(parents=True, exist_ok=True)

	manager = ModelManager(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
	# Mark TOS as agreed in the manager to prevent prompts
	manager.tos_agreed = True

	return manager, model_dir
	except Exception as e:
	print(f"[WARNING] Could not create silent TTS manager: {e}")
	return None, None

	import gc
	import torch
	import numpy as np
	from pathlib import Path
	from typing import Optional, Union
	from enum import Enum


	class Language(str, Enum):
	    """
	    Languages the TTS service can synthesize.

	    Members are also plain strings, so they compare equal to the lowercase
	    language names passed around by callers.
	    """

	    ENGLISH = "english"
	    HINDI = "hindi"


	class MultilingualTTSService:
	    """
	    Unified TTS service supporting multiple languages.

	    - English: encoder + synthesizer + vocoder voice-cloning pipeline, loaded
	      lazily from ``models_dir / "default"``.
	    - Hindi: Google Text-to-Speech (gTTS). No local model is loaded and the
	      reference voice sample is not used, so Hindi output is not voice-cloned.
	    """

	    # Sample rates (Hz) that synthesize_and_save writes for each language.
	    ENGLISH_SAMPLE_RATE = 16000
	    HINDI_SAMPLE_RATE = 24000

	    def __init__(self, models_dir: Path, hindi_model_dir: Optional[Path] = None):
	        """
	        Initialize multilingual TTS service.

	        Nothing is loaded here; models are loaded on first use.

	        Args:
	            models_dir: Directory with English models; the loaders look for
	                default/encoder.pt, default/synthesizer.pt and
	                default/vocoder.pt beneath it.
	            hindi_model_dir: Optional directory for a Hindi model. It is stored
	                and logged only; no method of this class reads it, because
	                Hindi synthesis goes through gTTS.
	        """
	        self.models_dir = Path(models_dir)
	        self.hindi_model_dir = Path(hindi_model_dir) if hindi_model_dir else None

	        # Track loaded models (None = not loaded yet).
	        self._encoder_model = None
	        self._synthesizer_model = None
	        self._vocoder_model = None
	        # Historical name: this is only a "gTTS is importable" flag.
	        self._xtts_model = None

	        self.sr = self.ENGLISH_SAMPLE_RATE

	        print("[MultilingualTTSService] Initialized")
	        print(f"[MultilingualTTSService] English models dir: {self.models_dir}")
	        if self.hindi_model_dir:
	            print(f"[MultilingualTTSService] Hindi XTTS dir: {self.hindi_model_dir}")
	        else:
	            # Log text kept as-is; Hindi synthesis via gTTS still works without
	            # a model path.
	            print("[MultilingualTTSService] Hindi support: DISABLED (no model path)")

	    @staticmethod
	    def _normalize_language(language) -> str:
	        """
	        Return the canonical lowercase language name.

	        Accepts either a plain string (surrounding whitespace and case are
	        ignored) or a Language member (its ``value`` is used).
	        """
	        return str(getattr(language, "value", language)).strip().lower()

	    @staticmethod
	    def _pcm_to_mono_float(samples, channels: int, sample_width: int) -> np.ndarray:
	        """
	        Convert interleaved signed integer PCM samples to mono float32 in [-1, 1].

	        Args:
	            samples: Sequence of integer samples, interleaved by channel.
	            channels: Number of interleaved channels.
	            sample_width: Bytes per sample (2 for 16-bit audio).

	        Returns:
	            1-D float32 array; empty input yields an empty array.
	        """
	        pcm = np.asarray(samples, dtype=np.float32)
	        if pcm.size == 0:
	            return pcm

	        # Downmix by averaging the channels of each frame.
	        if channels > 1:
	            pcm = pcm.reshape((-1, channels)).mean(axis=1)

	        # Largest positive value for this width (32767 for 16-bit). The most
	        # negative sample lands just below -1 and is handled by the clip.
	        full_scale = float((1 << (8 * sample_width - 1)) - 1)
	        return np.clip(pcm / full_scale, -1.0, 1.0)

	    def _load_english_models(self):
	        """
	        Load English voice cloning models (lazy load).

	        Each of encoder, synthesizer and vocoder is loaded only if it has not
	        been loaded before.

	        Raises:
	            RuntimeError: If an expected model file is missing.
	        """
	        if self._encoder_model is None:
	            print("[MultilingualTTSService] Loading English encoder...")
	            from encoder import inference as encoder_infer
	            enc_path = self.models_dir / "default" / "encoder.pt"
	            if not enc_path.exists():
	                raise RuntimeError(f"English encoder model missing: {enc_path}")
	            encoder_infer.load_model(enc_path)
	            # load_model's return value is not kept; only a loaded flag is stored.
	            self._encoder_model = True
	            print("[MultilingualTTSService] ✓ English encoder loaded")

	        if self._synthesizer_model is None:
	            print("[MultilingualTTSService] Loading English synthesizer...")
	            from synthesizer import inference as synthesizer_infer
	            syn_path = self.models_dir / "default" / "synthesizer.pt"
	            if not syn_path.exists():
	                raise RuntimeError(f"English synthesizer model missing: {syn_path}")
	            self._synthesizer_model = synthesizer_infer.Synthesizer(syn_path)
	            print("[MultilingualTTSService] ✓ English synthesizer loaded")

	        if self._vocoder_model is None:
	            print("[MultilingualTTSService] Loading English vocoder...")
	            from app.vocoder import inference as vocoder_infer
	            voc_path = self.models_dir / "default" / "vocoder.pt"
	            if not voc_path.exists():
	                raise RuntimeError(f"English vocoder model missing: {voc_path}")
	            vocoder_infer.load_model(voc_path)
	            # As with the encoder, only a loaded flag is stored.
	            self._vocoder_model = True
	            print("[MultilingualTTSService] ✓ English vocoder loaded")

	    def _load_hindi_models(self):
	        """
	        Load Hindi models - using Google Text-to-Speech (gTTS).

	        There is no model to load; this only verifies that gTTS can be
	        imported and records that fact.

	        Raises:
	            ImportError: If the gtts package is not installed.
	            RuntimeError: If importing gtts fails for any other reason.
	        """
	        if self._xtts_model is None:
	            print("[MultilingualTTSService] Loading Hindi support (gTTS)...")
	            try:
	                from gtts import gTTS  # noqa: F401 - availability check only
	                print("[MultilingualTTSService] ✓ Hindi gTTS support loaded")
	                print("[MultilingualTTSService] Engine: Google Text-to-Speech (gTTS)")
	                print("[MultilingualTTSService] Language: Hindi (hin)")
	                print("[MultilingualTTSService] TOS: No (Google Cloud)")
	                # Mark as loaded (gTTS doesn't require actual model loading)
	                self._xtts_model = True

	            except ImportError as e:
	                raise ImportError(
	                    "gTTS library required for Hindi support. "
	                    "Install with: pip install gtts"
	                ) from e
	            except Exception as e:
	                print(f"[MultilingualTTSService] Error loading Hindi support: {e}")
	                raise RuntimeError(f"Failed to load Hindi support: {e}") from e

	    def synthesize(self, text: str, voice_sample_path: Union[str, Path],
	                   language: str = "english") -> np.ndarray:
	        """
	        Synthesize speech in specified language.

	        Args:
	            text: Text to synthesize
	            voice_sample_path: Path to reference voice sample (used for
	                English only)
	            language: "english" or "hindi" (case-insensitive), or a Language
	                member

	        Returns:
	            Audio waveform as numpy array

	        Raises:
	            ValueError: If the language is not supported.
	        """
	        lang = self._normalize_language(language)

	        if lang == Language.ENGLISH.value:
	            return self._synthesize_english(text, voice_sample_path)
	        if lang == Language.HINDI.value:
	            return self._synthesize_hindi(text, voice_sample_path)
	        raise ValueError(f"Unsupported language: {lang}")

	    def _synthesize_english(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
	        """
	        Synthesize English speech cloned from the reference voice.

	        Pipeline: embed the reference voice with the encoder, generate a mel
	        spectrogram with the synthesizer, then turn it into a waveform with
	        the vocoder (falling back to the synthesizer's Griffin-Lim if the
	        vocoder raises).

	        Returns:
	            float32 waveform, peak-normalized to 0.707 and clipped to [-1, 1].
	        """
	        from encoder import inference as encoder_infer
	        from app.vocoder import inference as vocoder_infer

	        self._load_english_models()

	        print(f"[MultilingualTTSService] Synthesizing English: {text[:50]}...")

	        # Embed voice
	        wav = encoder_infer.preprocess_wav(voice_sample_path)
	        embed = encoder_infer.embed_utterance(wav)

	        # Generate mel
	        mels = self._synthesizer_model.synthesize_spectrograms([text], [embed])
	        mel = mels[0]

	        # Vocalize
	        try:
	            synthesized = vocoder_infer.infer_waveform(
	                mel, normalize=True, batched=False, target=8000, overlap=800
	            ).astype(np.float32)
	        except Exception as e:
	            print(f"[MultilingualTTSService] Vocoder failed: {e}, using Griffin-Lim fallback")
	            synthesized = self._synthesizer_model.griffin_lim(mel).astype(np.float32)

	        # Normalize the peak to 0.707. The size check avoids np.max raising on
	        # an empty waveform.
	        if synthesized.size:
	            max_val = np.max(np.abs(synthesized))
	            if max_val > 0:
	                target_level = 0.707
	                synthesized = synthesized * (target_level / max_val)

	        return np.clip(synthesized, -1.0, 1.0)

	    def _synthesize_hindi(self, text: str, voice_sample_path: Union[str, Path]) -> np.ndarray:
	        """
	        Synthesize Hindi speech using Google Text-to-Speech (gTTS).

	        Args:
	            text: Hindi text to speak.
	            voice_sample_path: Accepted for interface symmetry with the
	                English path; it is not used here.

	        Returns:
	            Mono float32 waveform in [-1, 1] at HINDI_SAMPLE_RATE.

	        Raises:
	            ImportError: If gtts is not installed.
	            RuntimeError: If generation or decoding fails for any reason.
	        """
	        self._load_hindi_models()

	        print(f"[MultilingualTTSService] Synthesizing Hindi: {text[:50]}...")

	        try:
	            from gtts import gTTS
	            import io
	            from pydub import AudioSegment

	            # Generate speech using Google TTS
	            tts = gTTS(text=text, lang='hi', slow=False)

	            # Keep the MP3 in memory rather than on disk.
	            buffer = io.BytesIO()
	            tts.write_to_fp(buffer)
	            buffer.seek(0)

	            # Decode the MP3
	            audio_segment = AudioSegment.from_mp3(buffer)

	            # synthesize_and_save writes Hindi audio at HINDI_SAMPLE_RATE, so
	            # make sure the samples really are at that rate.
	            if audio_segment.frame_rate != self.HINDI_SAMPLE_RATE:
	                audio_segment = audio_segment.set_frame_rate(self.HINDI_SAMPLE_RATE)

	            return self._pcm_to_mono_float(
	                audio_segment.get_array_of_samples(),
	                audio_segment.channels,
	                audio_segment.sample_width,
	            )

	        except Exception as e:
	            print(f"[MultilingualTTSService] Error during Hindi synthesis: {e}")
	            raise RuntimeError(f"Hindi synthesis failed: {e}") from e

	    def synthesize_and_save(self, text: str, voice_sample_path: Union[str, Path],
	                            output_path: Union[str, Path], language: str = "english") -> Path:
	        """
	        Synthesize and save to file.

	        Args:
	            text: Text to synthesize
	            voice_sample_path: Path to reference voice
	            output_path: Where to save audio; missing parent directories are
	                created
	            language: "english" or "hindi"

	        Returns:
	            Path to output file

	        Raises:
	            Whatever synthesize() or the file write raises; the error is
	            logged and re-raised unchanged.
	        """
	        import soundfile as sf

	        output_path = Path(output_path)

	        try:
	            audio = self.synthesize(text, voice_sample_path, language)

	            # Determine sample rate based on language, using the same
	            # normalization as synthesize() so the two always agree.
	            lang = self._normalize_language(language)
	            if lang == Language.HINDI.value:
	                sr = self.HINDI_SAMPLE_RATE
	            else:
	                sr = self.ENGLISH_SAMPLE_RATE

	            output_path.parent.mkdir(parents=True, exist_ok=True)
	            sf.write(str(output_path), audio, sr)
	            print(f"[MultilingualTTSService] Audio saved: {output_path}")
	            return output_path

	        except Exception as e:
	            print(f"[MultilingualTTSService] Error during synthesis: {e}")
	            raise

	    def cleanup(self):
	        """
	        Release model memory.

	        Drops this object's references/flags so models are reloaded on next
	        use, runs the garbage collector and empties the CUDA cache when CUDA
	        is available. Errors are logged, never raised.

	        NOTE(review): the encoder and vocoder are loaded through module-level
	        load_model() calls, so those modules may still hold the weights after
	        this runs - confirm against their implementations.
	        """
	        print("[MultilingualTTSService] Cleaning up models...")
	        try:
	            self._encoder_model = None
	            self._synthesizer_model = None
	            self._vocoder_model = None
	            self._xtts_model = None
	            gc.collect()
	            if torch.cuda.is_available():
	                torch.cuda.empty_cache()
	        except Exception as e:
	            print(f"[MultilingualTTSService] Cleanup warning: {e}")