AJ50 committed
Commit 5008b66 · 1 Parent(s): 95def52

Initial voice cloning backend with all dependencies

This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.
Files changed (50)
  1. .dockerignore +18 -0
  2. .gitignore +62 -0
  3. Dockerfile +25 -0
  4. README.md +570 -10
  5. backend/.env.example +15 -0
  6. backend/__init__.py +1 -0
  7. backend/app/__init__.py +34 -0
  8. backend/app/routes.py +409 -0
  9. backend/app/vocoder/audio.py +108 -0
  10. backend/app/vocoder/display.py +127 -0
  11. backend/app/vocoder/distribution.py +132 -0
  12. backend/app/vocoder/hparams.py +44 -0
  13. backend/app/vocoder/inference.py +83 -0
  14. backend/app/vocoder/models/fatchord_version.py +434 -0
  15. backend/app/voice_cloning.py +108 -0
  16. backend/download_models.py +54 -0
  17. backend/encoder/__init__.py +0 -0
  18. backend/encoder/audio.py +117 -0
  19. backend/encoder/inference.py +178 -0
  20. backend/encoder/model.py +135 -0
  21. backend/encoder/params_data.py +29 -0
  22. backend/encoder/params_model.py +11 -0
  23. backend/enrolled_voices/voice_26bfa1ef.mp3 +0 -0
  24. backend/enrolled_voices/voice_72beeda9.mp3 +0 -0
  25. backend/enrolled_voices/voices.json +100 -0
  26. backend/requirements.txt +14 -0
  27. backend/runtime.txt +1 -0
  28. backend/synthesizer/__init__.py +1 -0
  29. backend/synthesizer/audio.py +211 -0
  30. backend/synthesizer/hparams.py +92 -0
  31. backend/synthesizer/inference.py +165 -0
  32. backend/synthesizer/models/tacotron.py +542 -0
  33. backend/synthesizer/utils/__init__.py +45 -0
  34. backend/synthesizer/utils/cleaners.py +88 -0
  35. backend/synthesizer/utils/numbers.py +69 -0
  36. backend/synthesizer/utils/symbols.py +17 -0
  37. backend/synthesizer/utils/text.py +75 -0
  38. backend/wsgi.py +15 -0
  39. frontend/.env.development +4 -0
  40. frontend/.env.production +2 -0
  41. frontend/.gitignore +99 -0
  42. frontend/README.md +111 -0
  43. frontend/components.json +20 -0
  44. frontend/eslint.config.js +29 -0
  45. frontend/index.html +24 -0
  46. frontend/package-lock.json +0 -0
  47. frontend/package.json +88 -0
  48. frontend/postcss.config.js +6 -0
  49. frontend/public/placeholder.svg +1 -0
  50. frontend/public/robots.txt +14 -0
.dockerignore ADDED
@@ -0,0 +1,18 @@
+ __pycache__
+ *.pyc
+ .git
+ .env
+ .env.local
+ node_modules
+ dist
+ build
+ .DS_Store
+ *.log
+ .vscode
+ .idea
+ *.egg-info
+ .pytest_cache
+ frontend/node_modules
+ .next
+ .nuxt
+ .cache
.gitignore ADDED
@@ -0,0 +1,62 @@
+ # Model files - downloaded at build time, not stored in git
+ backend/models/default/*.pt
+ models/default/*.pt
+ *.pt
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ venv/
+ ENV/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Environment variables
+ .env
+ .env.local
+ .env.*.local
+ backend/.env
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Node/Frontend
+ node_modules/
+ dist/
+ .next/
+ out/
+
+ # Build artifacts
+ outputs/
+ temp_uploads/
+ enrolled_voices/*.wav
+ enrolled_voices/*.mp3
+
+ # Cache
+ .cache/
+ .pytest_cache/
Dockerfile ADDED
@@ -0,0 +1,25 @@
+ FROM python:3.10-slim
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     libsndfile1 libsndfile1-dev \
+     ffmpeg \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ # Copy entire project
+ COPY . .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r backend/requirements.txt
+
+ # Download models during build
+ RUN cd backend && python download_models.py
+
+ # Expose port (HF Spaces uses 7860)
+ EXPOSE 7860
+
+ # Start the application
+ CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "1", "--timeout", "300", "backend.wsgi:app"]
README.md CHANGED
@@ -1,12 +1,572 @@
  ---
- title: Voice Cloning Backend
- emoji: 💻
- colorFrom: red
- colorTo: green
- sdk: docker
- pinned: false
- license: mit
- short_description: 'AI-powered Voice Cloning '
- ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Real-Time Voice Cloning (RTVC)
+
+ A complete full-stack voice cloning application with React frontend and PyTorch backend that can synthesize speech in anyone's voice from just a few seconds of audio reference.
+
+ [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
+ [![PyTorch](https://img.shields.io/badge/PyTorch-2.0+-red.svg)](https://pytorch.org/)
+ [![React](https://img.shields.io/badge/React-18.0+-61dafb.svg)](https://reactjs.org/)
+ [![TypeScript](https://img.shields.io/badge/TypeScript-5.0+-blue.svg)](https://www.typescriptlang.org/)
+ [![License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
+
+ ## Features
+
+ - **Full Stack Application**: Modern React UI + Flask API + PyTorch backend
+ - **Voice Enrollment**: Record or upload voice samples directly in the browser
+ - **Speech Synthesis**: Generate cloned speech with intuitive interface
+ - **Voice Cloning**: Clone any voice with just 3-10 seconds of audio
+ - **Real-Time Generation**: Generate speech at 2-3x real-time speed on CPU
+ - **High Quality**: Natural-sounding synthetic speech using state-of-the-art models
+ - **Easy to Use**: Beautiful UI with 3D visualizations and audio waveforms
+ - **Multiple Formats**: Supports WAV, MP3, M4A, FLAC input audio
+ - **Multi-Language**: Supports English and Hindi text-to-speech
+
+ ## Table of Contents
+
+ - [Demo](#demo)
+ - [Quick Start (Full Stack)](#quick-start-full-stack)
+ - [Deployment](#deployment)
+ - [How It Works](#how-it-works)
+ - [Installation](#installation)
+ - [Project Structure](#project-structure)
+ - [Usage Examples](#usage-examples)
+ - [API Documentation](#api-documentation)
+ - [Troubleshooting](#troubleshooting)
+ - [Technical Details](#technical-details)
+ - [Credits](#credits)
+
+ ## Demo
+
+ **Frontend UI**: Modern React interface with 3D visualizations
+ **Voice Enrollment**: Record/upload voice samples → Backend saves to database
+ **Speech Synthesis**: Select voice + Enter text → Backend generates cloned speech
+ **Playback**: Listen to generated audio directly in browser or download
+
+ ## Quick Start (Full Stack)
+
+ ### Option 1: Using the Startup Script (Easiest)
+
+ ```powershell
+ # Windows PowerShell
+ cd rtvc
+ .\start_app.ps1
+ ```
+
+ This will:
+ 1. Start the Backend API server (port 5000)
+ 2. Start the Frontend dev server (port 8080)
+ 3. Open your browser to http://localhost:8080
+
+ ### Option 2: Manual Start
+
+ **Terminal 1 - Backend API:**
+ ```bash
+ cd rtvc
+ python api_server.py
+ ```
+
+ **Terminal 2 - Frontend:**
+ ```bash
+ cd "rtvc/Frontend Voice Cloning"
+ npm run dev
+ ```
+
+ Then open http://localhost:8080 in your browser.
+
+ ## Deployment
+
+ ### Production Deployment Stack
+
+ **Frontend**: Netlify (Free tier)
+ **Backend**: Render (Free tier)
+ **Models**: HuggingFace Hub (Free)
+
+ See [DEPLOYMENT.md](DEPLOYMENT.md) for complete deployment guide.
+
+ #### Quick Deployment
+
+ 1. **Deploy Backend to Render**
+    - Push to GitHub
+    - Connect Render to GitHub repo
+    - Use `render.yaml` configuration
+    - Models auto-download on first deploy (~10 minutes)
+
+ 2. **Deploy Frontend to Netlify**
+    - Connect Netlify to GitHub repo
+    - Set base directory: `frontend`
+    - Environment: `VITE_API_URL=your-render-backend-url`
+
+ 3. **Test**
+    - Visit your Netlify URL
+    - API calls automatically route to Render backend
+
+ **Pricing**: Free tier for both (with optional paid upgrades)
+
+ ### Using the Application
+
+ 1. **Enroll a Voice**:
+    - Go to "Voice Enrollment" section
+    - Enter a voice name
+    - Record audio (3-10 seconds) or upload a file
+    - Click "Enroll Voice"
+
+ 2. **Generate Speech**:
+    - Go to "Speech Synthesis" section
+    - Select your enrolled voice
+    - Enter text to synthesize
+    - Click "Generate Speech"
+    - Play or download the result
+
+ For detailed integration information, see [INTEGRATION_GUIDE.md](INTEGRATION_GUIDE.md).
+
+ ## How It Works
+
+ The system uses a 3-stage pipeline based on the SV2TTS (Speaker Verification to Text-to-Speech) architecture:
+
+ ```
+ Reference Audio → [Encoder] → Speaker Embedding (256-d vector)
+
+ Text Input → [Synthesizer (Tacotron)] → Mel-Spectrogram
+
+ [Vocoder (WaveRNN)] → Audio Output
+ ```
+
+ ### Pipeline Stages:
+
+ 1. **Speaker Encoder** - Extracts a unique voice "fingerprint" from reference audio
+ 2. **Synthesizer** - Generates mel-spectrograms from text conditioned on speaker embedding
+ 3. **Vocoder** - Converts mel-spectrograms to high-quality audio waveforms
+
+ ## Installation
+
+ ### Prerequisites
+
+ - Python 3.11 or higher
+ - Windows/Linux/macOS
+ - ~2 GB disk space for models
+ - 4 GB RAM minimum (8 GB recommended)
+
+ ### Step 1: Clone the Repository
+
+ ```bash
+ git clone https://github.com/yourusername/rtvc.git
+ cd rtvc
+ ```
+
+ ### Step 2: Install Dependencies
+
+ ```bash
+ pip install torch numpy librosa scipy soundfile webrtcvad tqdm unidecode inflect matplotlib numba
+ ```
+
+ Or install PyTorch with CUDA for GPU acceleration:
+
+ ```bash
+ pip install torch --index-url https://download.pytorch.org/whl/cu118
+ pip install numpy librosa scipy soundfile webrtcvad tqdm unidecode inflect matplotlib numba
+ ```
+
+ ### Step 3: Download Pretrained Models
+
+ Download the pretrained models from [Google Drive](https://drive.google.com/drive/folders/1fU6umc5uQAVR2udZdHX-lDgXYzTyqG_j):
+
+ | Model | Size | Description |
+ |-------|------|-------------|
+ | encoder.pt | 17 MB | Speaker encoder model |
+ | synthesizer.pt | 370 MB | Tacotron synthesizer model |
+ | vocoder.pt | 53 MB | WaveRNN vocoder model |
+
+ Place all three files in the `models/default/` directory.
+
+ ### Step 4: Verify Installation
+
+ ```bash
+ python clone_my_voice.py
+ ```
+
+ If you see errors about missing models, check that all three `.pt` files are in `models/default/`.
+
+ ## Quick Start
+
+ ### Method 1: Simple Script (Recommended)
+
+ 1. Open `clone_my_voice.py`
+ 2. Edit these lines:
+
+ ```python
+ # Your voice sample file
+ VOICE_FILE = r"sample\your_voice.mp3"
+
+ # The text you want to be spoken
+ TEXT_TO_CLONE = """
+ Your text here. Can be multiple sentences or even paragraphs!
+ """
+
+ # Output location
+ OUTPUT_FILE = r"outputs\cloned_voice.wav"
+ ```
+
+ 3. Run it:
+
+ ```bash
+ python clone_my_voice.py
+ ```
+
+ ### Method 2: Command Line
+
+ ```bash
+ python run_cli.py --voice "path/to/voice.wav" --text "Text to synthesize" --out "output.wav"
+ ```
+
+ ### Method 3: Advanced Runner Script
+
+ ```bash
+ python run_voice_cloning.py
+ ```
+
+ Edit the paths and text inside the script before running.
+
+ ## Project Structure
+
+ ```
+ rtvc/
+ ├── clone_my_voice.py        # Simple script - EDIT THIS to clone your voice!
+ ├── run_cli.py               # Command-line interface
+
+ ├── encoder/                 # Speaker Encoder Module
+ │   ├── __init__.py
+ │   ├── audio.py             # Audio preprocessing for encoder
+ │   ├── inference.py         # Encoder inference functions
+ │   ├── model.py             # SpeakerEncoder neural network
+ │   ├── params_data.py       # Data hyperparameters
+ │   └── params_model.py      # Model hyperparameters
+
+ ├── synthesizer/             # Tacotron Synthesizer Module
+ │   ├── __init__.py
+ │   ├── audio.py             # Audio processing for synthesizer
+ │   ├── hparams.py           # All synthesizer hyperparameters
+ │   ├── inference.py         # Synthesizer inference class
+ │   │
+ │   ├── models/
+ │   │   └── tacotron.py      # Tacotron 2 architecture
+ │   │
+ │   └── utils/
+ │       ├── cleaners.py      # Text cleaning functions
+ │       ├── numbers.py       # Number-to-text conversion
+ │       ├── symbols.py       # Character/phoneme symbols
+ │       └── text.py          # Text-to-sequence conversion
+
+ ├── vocoder/                 # WaveRNN Vocoder Module
+ │   ├── audio.py             # Audio utilities for vocoder
+ │   ├── display.py           # Progress display utilities
+ │   ├── distribution.py      # Probability distributions
+ │   ├── hparams.py           # Vocoder hyperparameters
+ │   ├── inference.py         # Vocoder inference functions
+ │   │
+ │   └── models/
+ │       └── fatchord_version.py  # WaveRNN architecture
+
+ ├── utils/
+ │   └── default_models.py    # Model download utilities
+
+ ├── models/
+ │   └── default/             # Pretrained models go here
+ │       ├── encoder.pt       # (17 MB)
+ │       ├── synthesizer.pt   # (370 MB) - Must download!
+ │       └── vocoder.pt       # (53 MB)
+
+ ├── sample/                  # Put your voice samples here
+ │   └── your_voice.mp3
+
+ └── outputs/                 # Generated audio outputs
+     └── cloned_voice.wav
+ ```
+
+ ### Key Files Explained
+
+ | File | Purpose |
+ |------|---------|
+ | `clone_my_voice.py` | **START HERE** - Simplest way to clone your voice |
+ | `run_cli.py` | Command-line tool for voice cloning |
+ | `encoder/inference.py` | Loads encoder and extracts speaker embeddings |
+ | `synthesizer/inference.py` | Loads synthesizer and generates mel-spectrograms |
+ | `vocoder/inference.py` | Loads vocoder and generates waveforms |
+ | `**/hparams.py` | Configuration files for each module |
+
+ ## Usage Examples
+
+ ### Example 1: Basic Voice Cloning
+
+ ```bash
+ python clone_my_voice.py
+ ```
+
+ Edit `clone_my_voice.py` first:
+ ```python
+ VOICE_FILE = r"sample\my_voice.mp3"
+ TEXT_TO_CLONE = "Hello, this is my cloned voice!"
+ ```
+
+ ### Example 2: Multiple Outputs
+
+ ```bash
+ # Generate first output
+ python run_cli.py --voice "voice.wav" --text "First message" --out "output1.wav"
+
+ # Generate second output with same voice
+ python run_cli.py --voice "voice.wav" --text "Second message" --out "output2.wav"
+ ```
+
+ ### Example 3: Long Text
+
+ ```bash
+ python run_cli.py --voice "voice.wav" --text "This is a very long text that spans multiple sentences. The voice cloning system will synthesize all of it in the reference voice. You can make it as long as you need."
+ ```
+
+ ### Example 4: Different Voice Samples
+
+ ```bash
+ # Clone voice A
+ python run_cli.py --voice "person_a.wav" --text "Message from person A"
+
+ # Clone voice B
+ python run_cli.py --voice "person_b.wav" --text "Message from person B"
+ ```
+
+ ## Troubleshooting
+
+ ### Common Issues
+
+ #### "Model file not found"
+
+ **Solution**: Download the models from Google Drive and place them in `models/default/`:
+ - https://drive.google.com/drive/folders/1fU6umc5uQAVR2udZdHX-lDgXYzTyqG_j
+
+ Verify file sizes:
+ ```bash
+ # Windows
+ dir models\default\*.pt
+
+ # Linux/Mac
+ ls -lh models/default/*.pt
+ ```
+
+ Expected sizes:
+ - encoder.pt: 17,090,379 bytes (17 MB)
+ - synthesizer.pt: 370,554,559 bytes (370 MB) - Most common issue!
+ - vocoder.pt: 53,845,290 bytes (53 MB)
+
+ #### "Reference voice file not found"
+
+ **Solution**: Use absolute paths or check current directory:
+ ```python
+ # Use absolute path
+ VOICE_FILE = r"C:\Users\YourName\Desktop\voice.mp3"
+
+ # Or relative from project root
+ VOICE_FILE = r"sample\voice.mp3"
+ ```
+
+ #### Output sounds robotic or unclear
+
+ **Solutions**:
+ - Use a higher quality voice sample (16kHz+ sample rate)
+ - Ensure voice sample is 3-10 seconds long
+ - Remove background noise from voice sample
+ - Speak clearly and naturally in the reference audio
+
+ #### "AttributeError: module 'numpy' has no attribute 'cumproduct'"
+
+ **Solution**: This is already fixed in the code. If you see this:
+ ```bash
+ pip install --upgrade numpy
+ ```
+
+ #### Slow generation on CPU
+
+ **Solutions**:
+ - Normal speed: 2-3x real-time on modern CPUs
+ - For faster generation, install PyTorch with CUDA:
+ ```bash
+ pip install torch --index-url https://download.pytorch.org/whl/cu118
+ ```
+
+ Then the system will automatically use GPU if available.
+
+ ### Getting Help
+
+ If you encounter other issues:
+ 1. Check the `HOW_TO_RUN.md` file for detailed instructions
+ 2. Verify all models are downloaded correctly
+ 3. Ensure Python 3.11+ is installed
+ 4. Check that all dependencies are installed
+
+ ## Technical Details
+
+ ### Audio Specifications
+
+ | Parameter | Value |
+ |-----------|-------|
+ | Sample Rate | 16,000 Hz |
+ | Channels | Mono |
+ | Bit Depth | 16-bit |
+ | FFT Size | 800 samples (50ms) |
+ | Hop Size | 200 samples (12.5ms) |
+ | Mel Channels | 80 (synthesizer/vocoder), 40 (encoder) |
+
+ ### Model Architectures
+
+ #### Speaker Encoder
+ - **Type**: LSTM + Linear Projection
+ - **Input**: 40-channel mel-spectrogram
+ - **Output**: 256-dimensional speaker embedding
+ - **Parameters**: ~5M
+
+ #### Synthesizer (Tacotron 2)
+ - **Encoder**: CBHG (Convolution Bank + Highway + GRU)
+ - **Decoder**: Attention-based LSTM
+ - **PostNet**: 5-layer Residual CNN
+ - **Parameters**: ~31M
+
+ #### Vocoder (WaveRNN)
+ - **Type**: Recurrent Neural Vocoder
+ - **Mode**: Raw 9-bit with mu-law
+ - **Upsample Factors**: (5, 5, 8)
+ - **Parameters**: ~4.5M
+
+ ### Text Processing
+
+ The system includes sophisticated text normalization:
+ - **Numbers**: "123" → "one hundred twenty three"
+ - **Currency**: "$5.50" → "five dollars, fifty cents"
+ - **Ordinals**: "1st" → "first"
+ - **Abbreviations**: "Dr." → "doctor"
+ - **Unicode**: Automatic transliteration to ASCII
+
+ ### Performance
+
+ | Hardware | Generation Speed |
+ |----------|------------------|
+ | CPU (Intel i7) | 2-3x real-time |
+ | GPU (GTX 1060) | 10-15x real-time |
+ | GPU (RTX 3080) | 30-50x real-time |
+
+ Example: Generating 10 seconds of audio takes ~3-5 seconds on CPU.
+
+ ## How to Use for Different Applications
+
+ ### Podcast/Narration
+ ```python
+ TEXT_TO_CLONE = """
+ Welcome to today's episode. In this podcast, we'll be discussing
+ the fascinating world of artificial intelligence and voice synthesis.
+ Let's dive right in!
+ """
+ ```
+
+ ### Audiobook
+ ```python
+ TEXT_TO_CLONE = """
+ Chapter One: The Beginning.
+ It was a dark and stormy night when everything changed.
+ The old house stood alone on the hill, its windows dark and unwelcoming.
+ """
+ ```
+
+ ### Voiceover
+ ```python
+ TEXT_TO_CLONE = """
+ Introducing the all-new product that will change your life.
+ With advanced features and intuitive design, it's the perfect solution.
+ """
+ ```
+
+ ### Multiple Languages
+ The system supports English out of the box. For other languages:
+ 1. Use English transliteration for best results
+ 2. Or modify `synthesizer/utils/cleaners.py` for your language
+
+ ## Comparison with Other Methods
+
+ | Method | Quality | Speed | Setup |
+ |--------|---------|-------|-------|
+ | Traditional TTS | Low | Fast | Easy |
+ | Commercial APIs | High | Fast | API Key Required |
+ | **This Project** | High | Medium | One-time Setup |
+ | Training from Scratch | High | Slow | Very Complex |
+
+ ## Best Practices
+
+ ### For Best Voice Quality:
+
+ 1. **Reference Audio**:
+    - 3-10 seconds long
+    - Clear speech, no background noise
+    - Natural speaking tone (not reading/singing)
+    - 16kHz+ sample rate if possible
+
+ 2. **Text Input**:
+    - Use proper punctuation for natural pauses
+    - Break very long texts into paragraphs
+    - Avoid excessive special characters
+
+ 3. **Output**:
+    - Generate shorter clips for better quality
+    - Concatenate multiple clips if needed
+    - Post-process with audio editing software for polish
+
+ ## Known Limitations
+
+ - Works best with English text
+ - Requires good quality reference audio
+ - May not perfectly capture very unique voice characteristics
+ - Background noise in reference affects output quality
+ - Very short reference audio (<3 seconds) may produce inconsistent results
+
+ ## Future Improvements
+
+ - [ ] Add GUI interface
+ - [ ] Support for multiple languages
+ - [ ] Real-time streaming mode
+ - [ ] Voice mixing/morphing capabilities
+ - [ ] Fine-tuning on custom datasets
+ - [ ] Mobile app version
+
+ ## Credits
+
+ This implementation is based on:
+ - **SV2TTS**: Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
+ - **Tacotron 2**: Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions
+ - **WaveRNN**: Efficient Neural Audio Synthesis
+
+ Original research papers:
+ - [SV2TTS Paper](https://arxiv.org/abs/1806.04558)
+ - [Tacotron 2 Paper](https://arxiv.org/abs/1712.05884)
+ - [WaveRNN Paper](https://arxiv.org/abs/1802.08435)
+
+ ## License
+
+ This project is licensed under the MIT License - see the LICENSE file for details.
+
+ ## Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
+
+ 1. Fork the repository
+ 2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
+ 3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
+ 4. Push to the branch (`git push origin feature/AmazingFeature`)
+ 5. Open a Pull Request
+
+ ## Show Your Support
+
+ If this project helped you, please give it a star!
+
+ ## Contact
+
+ For questions or support, please open an issue on GitHub.
+
  ---
 
+ **Made with love by the Voice Cloning Community**
+
+ *Last Updated: October 30, 2025*
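The three-stage pipeline described in the README maps onto the encoder, synthesizer, and vocoder packages added in this commit. A minimal sketch of how those stages are typically chained, assuming the standard RTVC inference interfaces (the actual wiring lives in `backend/app/voice_cloning.py`, which is not shown in this truncated view, and the import paths below follow the upstream top-level layout rather than this repo's `backend/` packages):

```python
from pathlib import Path

import soundfile as sf

from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder

models = Path("models/default")
encoder.load_model(models / "encoder.pt")
synthesizer = Synthesizer(models / "synthesizer.pt")
vocoder.load_model(models / "vocoder.pt")

# 1) Extract a 256-d speaker embedding from a few seconds of reference audio
ref_wav = encoder.preprocess_wav(Path("sample/your_voice.mp3"))
embed = encoder.embed_utterance(ref_wav)

# 2) Generate a mel-spectrogram from text, conditioned on the embedding
specs = synthesizer.synthesize_spectrograms(["Hello, this is my cloned voice!"], [embed])

# 3) Vocode the mel-spectrogram into a waveform and save it
wav = vocoder.infer_waveform(specs[0])
sf.write("outputs/cloned_voice.wav", wav, synthesizer.sample_rate)
```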
backend/.env.example ADDED
@@ -0,0 +1,15 @@
+ # Flask backend environment variables
+ FLASK_APP=backend.app
+ FLASK_ENV=production
+ DEBUG=false
+
+ # HuggingFace configuration
+ HF_HOME=.cache/huggingface
+
+ # CORS configuration for production
+ CORS_ORIGINS=https://your-netlify-site.netlify.app
+
+ # Model configuration
+ MODEL_REPO_ENCODER=AJ50/voice-clone-encoder
+ MODEL_REPO_SYNTHESIZER=AJ50/voice-clone-synthesizer
+ MODEL_REPO_VOCODER=AJ50/voice-clone-vocoder
backend/__init__.py ADDED
@@ -0,0 +1 @@
+ """Backend package root to support relative imports."""
backend/app/__init__.py ADDED
@@ -0,0 +1,34 @@
+ """Application factory for the voice cloning backend."""
+
+ import os
+ from flask import Flask
+ from flask_cors import CORS
+
+
+ def create_app():
+     """Create and configure the Flask application."""
+
+     app = Flask(__name__)
+
+     # CORS configuration - allow specific frontend URL or all origins
+     allowed_origins = os.getenv('FRONTEND_URL', '*').split(',')
+     cors_config = {
+         "origins": allowed_origins if allowed_origins != ['*'] else '*',
+         "methods": ["GET", "POST", "DELETE", "OPTIONS"],
+         "allow_headers": ["Content-Type", "Authorization"]
+     }
+     CORS(app, resources={r"/api/*": cors_config})
+
+     from .routes import bp
+
+     app.register_blueprint(bp)
+
+     # Root endpoint
+     @app.route('/')
+     def index():
+         return {'message': 'Voice Cloning API', 'status': 'running', 'api_prefix': '/api'}
+
+     return app
+
+
+ app = create_app()
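The factory reads allowed CORS origins from the `FRONTEND_URL` environment variable and is also instantiated at import time, so a WSGI server can target either `backend.app:app` or a thin wrapper such as `backend/wsgi.py` (listed in this commit but not shown here). A hypothetical wrapper might look like this:

```python
# Hypothetical entry point; the actual backend/wsgi.py in this commit is not shown above.
import os

from backend.app import create_app

# FRONTEND_URL may be a single origin or a comma-separated list (see create_app above).
os.environ.setdefault("FRONTEND_URL", "http://localhost:8080")

app = create_app()

if __name__ == "__main__":
    # Matches the port exposed by the Dockerfile (HF Spaces uses 7860).
    app.run(host="0.0.0.0", port=7860)
```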
backend/app/routes.py ADDED
@@ -0,0 +1,409 @@
+ """
+ Flask API Backend for Voice Cloning
+ Integrates the Python voice cloning backend with the React frontend
+ """
+
+ from flask import Blueprint, request, jsonify, send_file
+ from pathlib import Path
+ import uuid
+ import json
+ from datetime import datetime
+ import sys
+
+ from .voice_cloning import synthesize
+
+ bp = Blueprint('voice_cloning', __name__, url_prefix='/api')
+
+ BASE_DIR = Path(__file__).resolve().parents[1]
+
+ # Configuration
+ UPLOAD_FOLDER = BASE_DIR / 'enrolled_voices'
+ OUTPUT_FOLDER = BASE_DIR / 'outputs'
+ MODELS_DIR = BASE_DIR / 'models'
+ VOICES_DB = UPLOAD_FOLDER / 'voices.json'
+
+ # Create directories with parents
+ try:
+     UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
+     OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
+     VOICES_DB.parent.mkdir(parents=True, exist_ok=True)
+ except Exception as e:
+     print(f"Failed to create directories: {e}")
+     sys.exit(1)
+
+ # Allowed audio extensions
+ ALLOWED_EXTENSIONS = {'mp3', 'wav', 'm4a', 'flac', 'ogg', 'webm'}
+
+ def allowed_file(filename):
+     return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
+
+ def load_voices_db():
+     """Load the voices database"""
+     if VOICES_DB.exists():
+         with open(VOICES_DB, 'r') as f:
+             return json.load(f)
+     return []
+
+ def save_voices_db(voices):
+     """Save the voices database"""
+     with open(VOICES_DB, 'w') as f:
+         json.dump(voices, f, indent=2)
+
+ @bp.route('/health', methods=['GET'])
+ def health_check():
+     """Health check endpoint"""
+     return jsonify({
+         'status': 'healthy',
+         'message': 'Voice Cloning API is running'
+     })
+
+ @bp.route('/enroll', methods=['POST'])
+ def enroll_voice():
+     """
+     Enroll a new voice by accepting audio file and voice name
+     Frontend sends: FormData with 'audio' (File) and 'voice_name' (string)
+     """
+     try:
+         # Check if audio file is present
+         if 'audio' not in request.files:
+             return jsonify({'error': 'No audio file provided'}), 400
+
+         audio_file = request.files['audio']
+         voice_name = request.form.get('voice_name', 'Unnamed Voice').strip()
+
+         if audio_file.filename == '':
+             return jsonify({'error': 'No file selected'}), 400
+
+         if not allowed_file(audio_file.filename):
+             return jsonify({'error': 'Invalid file type. Supported: mp3, wav, m4a, flac, ogg, webm'}), 400
+
+         # Ensure upload folder exists
+         UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
+
+         # Generate unique ID and secure filename
+         voice_id = f"voice_{uuid.uuid4().hex[:8]}"
+         file_extension = audio_file.filename.rsplit('.', 1)[1].lower()
+         filename = f"{voice_id}.{file_extension}"
+         filepath = UPLOAD_FOLDER / filename
+
+         # Save the audio file with error handling
+         try:
+             audio_file.save(str(filepath))
+             print(f"✓ Audio file saved: {filepath}")
+         except Exception as file_err:
+             print(f"✗ Failed to save audio file: {file_err}")
+             return jsonify({'error': f'Failed to save audio: {str(file_err)}'}), 500
+
+         # Create voice entry
+         voice_entry = {
+             'id': voice_id,
+             'name': voice_name,
+             'filename': filename,
+             'createdAt': datetime.now().isoformat()
+         }
+
+         # Update voices database with error handling
+         try:
+             VOICES_DB.parent.mkdir(parents=True, exist_ok=True)
+             voices = load_voices_db()
+             voices.append(voice_entry)
+             save_voices_db(voices)
+             print(f"✓ Voice '{voice_name}' (ID: {voice_id}) enrolled successfully")
+         except Exception as db_err:
+             print(f"✗ Failed to update voices DB: {db_err}")
+             return jsonify({'error': f'Failed to save voice metadata: {str(db_err)}'}), 500
+
+         return jsonify({
+             'success': True,
+             'message': f'Voice "{voice_name}" enrolled successfully',
+             'voice_id': voice_id,
+             'voice_name': voice_name,
+             'created_at': voice_entry['createdAt']
+         }), 201
+
+     except Exception as e:
+         print(f"✗ Error enrolling voice: {e}")
+         import traceback
+         traceback.print_exc()
+         return jsonify({'error': f'Failed to enroll voice: {str(e)}'}), 500
+
+ @bp.route('/voices', methods=['GET'])
+ def get_voices():
+     """
+     Get list of all enrolled voices
+     Frontend uses this to populate the voice selection dropdown
+     """
+     try:
+         voices = load_voices_db()
+         # Return only necessary info for frontend
+         voices_list = [
+             {
+                 'id': v['id'],
+                 'name': v['name'],
+                 'createdAt': v['createdAt']
+             }
+             for v in voices
+         ]
+         return jsonify({'voices': voices_list}), 200
+     except Exception as e:
+         print(f"Error getting voices: {e}")
+         return jsonify({'error': f'Failed to get voices: {str(e)}'}), 500
+
+ @bp.route('/synthesize', methods=['POST'])
+ def synthesize_speech():
+     """
+     Synthesize speech from text using enrolled voice
+     Frontend sends: { "text": "...", "voiceId": "voice_xxx" }
+     """
+     try:
+         data = request.get_json()
+
+         if not data:
+             return jsonify({'error': 'No data provided'}), 400
+
+         text = data.get('text', '').strip()
+         voice_id = data.get('voice_id', '')  # Changed from 'voiceId' to 'voice_id'
+
+         if not text:
+             return jsonify({'error': 'No text provided'}), 400
+
+         if not voice_id:
+             return jsonify({'error': 'No voice selected'}), 400
+
+         # Find the voice in database
+         voices = load_voices_db()
+         voice = next((v for v in voices if v['id'] == voice_id), None)
+
+         if not voice:
+             return jsonify({'error': 'Voice not found'}), 404
+
+         # Reconstruct path from UPLOAD_FOLDER (server-agnostic)
+         voice_filepath = UPLOAD_FOLDER / voice['filename']
+
+         if not voice_filepath.exists():
+             return jsonify({'error': f'Voice file not found: {voice_filepath}'}), 404
+
+         # Generate unique output filename
+         output_filename = f"synthesis_{uuid.uuid4().hex[:8]}.wav"
+         output_path = OUTPUT_FOLDER / output_filename
+
+         # Call the voice cloning synthesis function
+         print(f"Synthesizing: '{text}' with voice '{voice['name']}'")
+         print(f"Voice file: {voice_filepath}")
+         print(f"Output path: {output_path}")
+         print(f"Models dir: {MODELS_DIR}")
+         print("Starting synthesis... This may take 30-60 seconds...")
+
+         try:
+             # Flush output to see logs immediately
+             sys.stdout.flush()
+
+             synthesize(
+                 voice_path=voice_filepath,
+                 text=text,
+                 models_dir=MODELS_DIR,
+                 out_path=output_path
+             )
+
+             print(f"Synthesis completed! Output saved to: {output_path}")
+             sys.stdout.flush()
+         except Exception as synth_error:
+             print(f"Synthesis error: {synth_error}")
+             import traceback
+             traceback.print_exc()
+             sys.stdout.flush()
+             return jsonify({'error': f'Synthesis failed: {str(synth_error)}'}), 500
+
+         if not output_path.exists():
+             error_msg = 'Synthesis failed - output not generated'
+             return jsonify({'error': error_msg}), 500
+
+         # Return the audio file URL
+         return jsonify({
+             'success': True,
+             'message': 'Speech synthesized successfully',
+             'audio_url': f'/api/audio/{output_filename}'
+         }), 200
+
+     except Exception as e:
+         print(f"Error synthesizing speech: {e}")
+         import traceback
+         traceback.print_exc()
+         return jsonify({'error': f'Failed to synthesize speech: {str(e)}'}), 500
+
+ @bp.route('/audio/<filename>', methods=['GET'])
+ def get_audio(filename):
+     """
+     Serve synthesized audio files
+     Frontend uses this URL to play/download the generated audio
+     """
+     try:
+         filepath = OUTPUT_FOLDER / filename
+         if not filepath.exists():
+             return jsonify({'error': 'Audio file not found'}), 404
+
+         return send_file(
+             str(filepath),
+             mimetype='audio/wav',
+             as_attachment=False,
+             download_name=filename
+         )
+     except Exception as e:
+         print(f"Error serving audio: {e}")
+         return jsonify({'error': f'Failed to serve audio: {str(e)}'}), 500
+
+ @bp.route('/voices/<voice_id>', methods=['DELETE'])
+ def delete_voice(voice_id):
+     """
+     Delete an enrolled voice
+     Optional: Frontend can call this to remove voices
+     """
+     try:
+         voices = load_voices_db()
+         voice = next((v for v in voices if v['id'] == voice_id), None)
+
+         if not voice:
+             return jsonify({'error': 'Voice not found'}), 404
+
+         # Delete the audio file
+         voice_filepath = UPLOAD_FOLDER / voice['filename']
+         if voice_filepath.exists():
+             voice_filepath.unlink()
+
+         # Remove from database
+         voices = [v for v in voices if v['id'] != voice_id]
+         save_voices_db(voices)
+
+         return jsonify({
+             'success': True,
+             'message': f'Voice "{voice["name"]}" deleted successfully'
+         }), 200
+
+     except Exception as e:
+         print(f"Error deleting voice: {e}")
+         return jsonify({'error': f'Failed to delete voice: {str(e)}'}), 500
+
+ @bp.route('/spectrogram/<audio_filename>', methods=['GET'])
+ def get_spectrogram(audio_filename):
+     """
+     Generate and return mel-spectrogram data for visualization
+     Frontend can use this to display real-time mel-spectrogram
+     """
+     try:
+         print(f"[Spectrogram] Requested file: {audio_filename}")
+         filepath = OUTPUT_FOLDER / audio_filename
+         print(f"[Spectrogram] Full path: {filepath}")
+         print(f"[Spectrogram] File exists: {filepath.exists()}")
+
+         if not filepath.exists():
+             print(f"[Spectrogram] ERROR: File not found: {filepath}")
+             return jsonify({'error': f'Audio file {audio_filename} not found'}), 404
+
+         # Import librosa for mel-spectrogram generation
+         import librosa
+         import numpy as np
+
+         print(f"[Spectrogram] Loading audio file...")
+         # Load audio file
+         y, sr = librosa.load(str(filepath), sr=None)
+         print(f"[Spectrogram] Audio loaded: shape={y.shape}, sr={sr}")
+
+         # Generate mel-spectrogram
+         # 80 mel bands (common for Tacotron2), hop_length varies with sample rate
+         mel_spec = librosa.feature.melspectrogram(
+             y=y,
+             sr=sr,
+             n_mels=80,
+             hop_length=512
+         )
+         print(f"[Spectrogram] Mel-spec generated: shape={mel_spec.shape}")
+
+         # Convert to dB scale (log scale for better visualization)
+         mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
+
+         # Normalize to 0-255 range for visualization
+         mel_spec_normalized = np.clip(
+             ((mel_spec_db + 80) / 80 * 255),
+             0,
+             255
+         ).astype(np.uint8)
+
+         # Convert to list for JSON serialization
+         # Transpose to time x frequency format for frontend
+         spectrogram_data = mel_spec_normalized.T.tolist()
+
+         print(f"[Spectrogram] Successfully generated spectrogram: {len(spectrogram_data)} time steps")
+
+         return jsonify({
+             'spectrogram': spectrogram_data,
+             'n_mels': 80,
+             'shape': {
+                 'time_steps': len(spectrogram_data),
+                 'frequency_bins': 80
+             }
+         }), 200
+
+     except Exception as e:
+         print(f"[Spectrogram] ERROR: {str(e)}")
+         import traceback
+         traceback.print_exc()
+         return jsonify({'error': f'Failed to generate spectrogram: {str(e)}'}), 500
+
+ @bp.route('/waveform/<audio_filename>', methods=['GET'])
+ def get_waveform(audio_filename):
+     """
+     Serve audio waveform as numeric array for real-time FFT visualization
+     Frontend fetches this and computes FFT using Web Audio API
+     """
+     try:
+         filepath = OUTPUT_FOLDER / audio_filename
+         if not filepath.exists():
+             return jsonify({'error': 'Audio file not found'}), 404
+
+         import soundfile as sf
+         import numpy as np
+
+         # Load audio file
+         # soundfile returns (data, sample_rate)
+         y, sr = sf.read(str(filepath))
+
+         # If stereo, convert to mono by taking first channel or averaging
+         if len(y.shape) > 1:
+             y = np.mean(y, axis=1)
+
+         # Ensure float32 for compatibility
+         y = np.asarray(y, dtype=np.float32)
+
+         # Downsample if very long to reduce JSON payload
+         # Typical waveform for 60s at 22050Hz = 1.3M samples
+         # For FFT we can use 8000 Hz safely (captures up to 4 kHz)
+         target_sr = 8000
+         if sr > target_sr:
+             # Calculate downsample factor
+             resample_ratio = target_sr / sr
+             new_length = int(len(y) * resample_ratio)
+             # Simple linear interpolation for downsampling
+             indices = np.linspace(0, len(y) - 1, new_length)
+             y = np.interp(indices, np.arange(len(y)), y)
+             sr = target_sr
+
+         # Convert to list for JSON serialization
+         waveform_data = y.tolist()
+
+         return jsonify({
+             'waveform': waveform_data,
+             'sample_rate': sr,
+             'duration': len(y) / sr,
+             'samples': len(y)
+         }), 200
+
+     except ImportError as ie:
+         err_msg = f'Soundfile library not available: {str(ie)}'
+         return jsonify({'error': err_msg}), 500
+     except Exception as e:
+         print(f"Error serving waveform: {e}")
+         import traceback
+         traceback.print_exc()
+         err_msg = f'Failed to generate waveform: {str(e)}'
+         return jsonify({'error': err_msg}), 500
+
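The routes above define the full HTTP contract: multipart enrollment, JSON synthesis requests, and file serving for the result. A small client sketch using the `requests` library (not a dependency of this commit), assuming the API is reachable on port 7860:

```python
import requests

BASE = "http://localhost:7860/api"

# Enroll a voice: multipart field 'audio' plus form field 'voice_name'
with open("sample/your_voice.mp3", "rb") as f:
    resp = requests.post(f"{BASE}/enroll", files={"audio": f}, data={"voice_name": "My Voice"})
voice_id = resp.json()["voice_id"]

# Synthesize speech with the enrolled voice (expect 30-60 seconds on CPU)
resp = requests.post(f"{BASE}/synthesize", json={"text": "Hello there!", "voice_id": voice_id})
audio_url = resp.json()["audio_url"]  # e.g. /api/audio/synthesis_xxxxxxxx.wav

# Download the generated WAV served by /api/audio/<filename>
wav_bytes = requests.get(f"http://localhost:7860{audio_url}").content
with open("cloned.wav", "wb") as out:
    out.write(wav_bytes)
```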
backend/app/vocoder/audio.py ADDED
@@ -0,0 +1,108 @@
+ import math
+ import numpy as np
+ import librosa
+ from . import hparams as hp
+ from scipy.signal import lfilter
+ import soundfile as sf
+
+
+ def label_2_float(x, bits) :
+     return 2 * x / (2**bits - 1.) - 1.
+
+
+ def float_2_label(x, bits) :
+     assert abs(x).max() <= 1.0
+     x = (x + 1.) * (2**bits - 1) / 2
+     return x.clip(0, 2**bits - 1)
+
+
+ def load_wav(path) :
+     return librosa.load(str(path), sr=hp.sample_rate)[0]
+
+
+ def save_wav(x, path) :
+     sf.write(path, x.astype(np.float32), hp.sample_rate)
+
+
+ def split_signal(x) :
+     unsigned = x + 2**15
+     coarse = unsigned // 256
+     fine = unsigned % 256
+     return coarse, fine
+
+
+ def combine_signal(coarse, fine) :
+     return coarse * 256 + fine - 2**15
+
+
+ def encode_16bits(x) :
+     return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16)
+
+
+ mel_basis = None
+
+
+ def linear_to_mel(spectrogram):
+     global mel_basis
+     if mel_basis is None:
+         mel_basis = build_mel_basis()
+     return np.dot(mel_basis, spectrogram)
+
+
+ def build_mel_basis():
+     return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
+
+
+ def normalize(S):
+     return np.clip((S - hp.min_level_db) / -hp.min_level_db, 0, 1)
+
+
+ def denormalize(S):
+     return (np.clip(S, 0, 1) * -hp.min_level_db) + hp.min_level_db
+
+
+ def amp_to_db(x):
+     return 20 * np.log10(np.maximum(1e-5, x))
+
+
+ def db_to_amp(x):
+     return np.power(10.0, x * 0.05)
+
+
+ def spectrogram(y):
+     D = stft(y)
+     S = amp_to_db(np.abs(D)) - hp.ref_level_db
+     return normalize(S)
+
+
+ def melspectrogram(y):
+     D = stft(y)
+     S = amp_to_db(linear_to_mel(np.abs(D)))
+     return normalize(S)
+
+
+ def stft(y):
+     return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length)
+
+
+ def pre_emphasis(x):
+     return lfilter([1, -hp.preemphasis], [1], x)
+
+
+ def de_emphasis(x):
+     return lfilter([1], [1, -hp.preemphasis], x)
+
+
+ def encode_mu_law(x, mu) :
+     mu = mu - 1
+     fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
+     return np.floor((fx + 1) / 2 * mu + 0.5)
+
+
+ def decode_mu_law(y, mu, from_labels=True) :
+     if from_labels:
+         y = label_2_float(y, math.log2(mu))
+     mu = mu - 1
+     x = np.sign(y) / mu * ((1 + mu) ** np.abs(y) - 1)
+     return x
+
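The `encode_mu_law`/`decode_mu_law` pair above implements the 9-bit mu-law companding used when the vocoder runs in 'RAW' mode with `mu_law` enabled (`hp.bits = 9`, so `mu = 2**9 = 512`). A quick round-trip check, assuming the module is importable as `backend.app.vocoder.audio` (its hparams pull values from the synthesizer package, which must be on the path):

```python
import numpy as np

from backend.app.vocoder import audio  # assumed import path for the module above

x = np.linspace(-1.0, 1.0, 11)                  # floats in [-1, 1]
labels = audio.encode_mu_law(x, mu=2 ** 9)      # integer labels in [0, 511]
x_hat = audio.decode_mu_law(labels, mu=2 ** 9)  # back to floats in [-1, 1]
print(np.max(np.abs(x - x_hat)))                # quantisation error on the order of 1e-2
```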
backend/app/vocoder/display.py ADDED
@@ -0,0 +1,127 @@
+ import time
+ import numpy as np
+ import sys
+
+
+ def progbar(i, n, size=16):
+     done = (i * size) // n
+     bar = ''
+     for i in range(size):
+         bar += '█' if i <= done else '░'
+     return bar
+
+
+ def stream(message) :
+     try:
+         sys.stdout.write("\r{%s}" % message)
+     except:
+         # Remove non-ASCII characters from message
+         message = ''.join(i for i in message if ord(i) < 128)
+         sys.stdout.write("\r{%s}" % message)
+
+
+ def simple_table(item_tuples) :
+
+     border_pattern = '+---------------------------------------'
+     whitespace = ' '
+
+     headings, cells, = [], []
+
+     for item in item_tuples :
+
+         heading, cell = str(item[0]), str(item[1])
+
+         pad_head = True if len(heading) < len(cell) else False
+
+         pad = abs(len(heading) - len(cell))
+         pad = whitespace[:pad]
+
+         pad_left = pad[:len(pad)//2]
+         pad_right = pad[len(pad)//2:]
+
+         if pad_head :
+             heading = pad_left + heading + pad_right
+         else :
+             cell = pad_left + cell + pad_right
+
+         headings += [heading]
+         cells += [cell]
+
+     border, head, body = '', '', ''
+
+     for i in range(len(item_tuples)) :
+
+         temp_head = f'| {headings[i]} '
+         temp_body = f'| {cells[i]} '
+
+         border += border_pattern[:len(temp_head)]
+         head += temp_head
+         body += temp_body
+
+         if i == len(item_tuples) - 1 :
+             head += '|'
+             body += '|'
+             border += '+'
+
+     print(border)
+     print(head)
+     print(border)
+     print(body)
+     print(border)
+     print(' ')
+
+
+ def time_since(started) :
+     elapsed = time.time() - started
+     m = int(elapsed // 60)
+     s = int(elapsed % 60)
+     if m >= 60 :
+         h = int(m // 60)
+         m = m % 60
+         return f'{h}h {m}m {s}s'
+     else :
+         return f'{m}m {s}s'
+
+
+ def save_attention(attn, path):
+     import matplotlib.pyplot as plt
+
+     fig = plt.figure(figsize=(12, 6))
+     plt.imshow(attn.T, interpolation='nearest', aspect='auto')
+     fig.savefig(f'{path}.png', bbox_inches='tight')
+     plt.close(fig)
+
+
+ def save_spectrogram(M, path, length=None):
+     import matplotlib.pyplot as plt
+
+     M = np.flip(M, axis=0)
+     if length : M = M[:, :length]
+     fig = plt.figure(figsize=(12, 6))
+     plt.imshow(M, interpolation='nearest', aspect='auto')
+     fig.savefig(f'{path}.png', bbox_inches='tight')
+     plt.close(fig)
+
+
+ def plot(array):
+     import matplotlib.pyplot as plt
+
+     fig = plt.figure(figsize=(30, 5))
+     ax = fig.add_subplot(111)
+     ax.xaxis.label.set_color('grey')
+     ax.yaxis.label.set_color('grey')
+     ax.xaxis.label.set_fontsize(23)
+     ax.yaxis.label.set_fontsize(23)
+     ax.tick_params(axis='x', colors='grey', labelsize=23)
+     ax.tick_params(axis='y', colors='grey', labelsize=23)
+     plt.plot(array)
+
+
+ def plot_spec(M):
+     import matplotlib.pyplot as plt
+
+     M = np.flip(M, axis=0)
+     plt.figure(figsize=(18, 4))
+     plt.imshow(M, interpolation='nearest', aspect='auto')
+     plt.show()
+
backend/app/vocoder/distribution.py ADDED
@@ -0,0 +1,132 @@
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+
+
+ def log_sum_exp(x):
+     """ numerically stable log_sum_exp implementation that prevents overflow """
+     # TF ordering
+     axis = len(x.size()) - 1
+     m, _ = torch.max(x, dim=axis)
+     m2, _ = torch.max(x, dim=axis, keepdim=True)
+     return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis))
+
+
+ # It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py
+ def discretized_mix_logistic_loss(y_hat, y, num_classes=65536,
+                                   log_scale_min=None, reduce=True):
+     if log_scale_min is None:
+         log_scale_min = float(np.log(1e-14))
+     y_hat = y_hat.permute(0, 2, 1)
+     assert y_hat.dim() == 3
+     assert y_hat.size(1) % 3 == 0
+     nr_mix = y_hat.size(1) // 3
+
+     # (B x T x C)
+     y_hat = y_hat.transpose(1, 2)
+
+     # unpack parameters. (B, T, num_mixtures) x 3
+     logit_probs = y_hat[:, :, :nr_mix]
+     means = y_hat[:, :, nr_mix:2 * nr_mix]
+     log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min)
+
+     # B x T x 1 -> B x T x num_mixtures
+     y = y.expand_as(means)
+
+     centered_y = y - means
+     inv_stdv = torch.exp(-log_scales)
+     plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1))
+     cdf_plus = torch.sigmoid(plus_in)
+     min_in = inv_stdv * (centered_y - 1. / (num_classes - 1))
+     cdf_min = torch.sigmoid(min_in)
+
+     # log probability for edge case of 0 (before scaling)
+     # equivalent: torch.log(F.sigmoid(plus_in))
+     log_cdf_plus = plus_in - F.softplus(plus_in)
+
+     # log probability for edge case of 255 (before scaling)
+     # equivalent: (1 - F.sigmoid(min_in)).log()
+     log_one_minus_cdf_min = -F.softplus(min_in)
+
+     # probability for all other cases
+     cdf_delta = cdf_plus - cdf_min
+
+     mid_in = inv_stdv * centered_y
+     # log probability in the center of the bin, to be used in extreme cases
+     # (not actually used in our code)
+     log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in)
+
+     # tf equivalent
+     """
+     log_probs = tf.where(x < -0.999, log_cdf_plus,
+                          tf.where(x > 0.999, log_one_minus_cdf_min,
+                                   tf.where(cdf_delta > 1e-5,
+                                            tf.log(tf.maximum(cdf_delta, 1e-12)),
+                                            log_pdf_mid - np.log(127.5))))
+     """
+     # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value
+     # for num_classes=65536 case? 1e-7? not sure..
+     inner_inner_cond = (cdf_delta > 1e-5).float()
+
+     inner_inner_out = inner_inner_cond * \
+                       torch.log(torch.clamp(cdf_delta, min=1e-12)) + \
+                       (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2))
+     inner_cond = (y > 0.999).float()
+     inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out
+     cond = (y < -0.999).float()
+     log_probs = cond * log_cdf_plus + (1. - cond) * inner_out
+
+     log_probs = log_probs + F.log_softmax(logit_probs, -1)
+
+     if reduce:
+         return -torch.mean(log_sum_exp(log_probs))
+     else:
+         return -log_sum_exp(log_probs).unsqueeze(-1)
+
+
+ def sample_from_discretized_mix_logistic(y, log_scale_min=None):
+     """
+     Sample from discretized mixture of logistic distributions
+     Args:
+         y (Tensor): B x C x T
+         log_scale_min (float): Log scale minimum value
+     Returns:
+         Tensor: sample in range of [-1, 1].
+     """
+     if log_scale_min is None:
+         log_scale_min = float(np.log(1e-14))
+     assert y.size(1) % 3 == 0
+     nr_mix = y.size(1) // 3
+
+     # B x T x C
+     y = y.transpose(1, 2)
+     logit_probs = y[:, :, :nr_mix]
+
+     # sample mixture indicator from softmax
+     temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
+     temp = logit_probs.data - torch.log(- torch.log(temp))
+     _, argmax = temp.max(dim=-1)
+
+     # (B, T) -> (B, T, nr_mix)
+     one_hot = to_one_hot(argmax, nr_mix)
+     # select logistic parameters
+     means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1)
+     log_scales = torch.clamp(torch.sum(
+         y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min)
+     # sample from logistic & clip to interval
+     # we don't actually round to the nearest 8bit value when sampling
+     u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5)
+     x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u))
+
+     x = torch.clamp(torch.clamp(x, min=-1.), max=1.)
+
+     return x
+
+
+ def to_one_hot(tensor, n, fill_with=1.):
+     # we perform one hot encore with respect to the last axis
+     one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_()
+     if tensor.is_cuda:
+         one_hot = one_hot.cuda()
+     one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with)
+     return one_hot
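`sample_from_discretized_mix_logistic` expects the WaveRNN's MOL-mode output with mixture logits, means, and log-scales stacked along the channel dimension (B x 3*nr_mix x T) and returns samples in [-1, 1]. A shape sanity check with random parameters, assuming the module is importable as `backend.app.vocoder.distribution`:

```python
import torch

from backend.app.vocoder.distribution import sample_from_discretized_mix_logistic

B, nr_mix, T = 2, 10, 100
params = torch.randn(B, 3 * nr_mix, T)  # stand-in for a MOL-mode WaveRNN output
samples = sample_from_discretized_mix_logistic(params)
print(samples.shape)                    # torch.Size([2, 100]), values clipped to [-1, 1]
```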
backend/app/vocoder/hparams.py ADDED
@@ -0,0 +1,44 @@
+ from synthesizer.hparams import hparams as _syn_hp
+
+
+ # Audio settings------------------------------------------------------------------------
+ # Match the values of the synthesizer
+ sample_rate = _syn_hp.sample_rate
+ n_fft = _syn_hp.n_fft
+ num_mels = _syn_hp.num_mels
+ hop_length = _syn_hp.hop_size
+ win_length = _syn_hp.win_size
+ fmin = _syn_hp.fmin
+ min_level_db = _syn_hp.min_level_db
+ ref_level_db = _syn_hp.ref_level_db
+ mel_max_abs_value = _syn_hp.max_abs_value
+ preemphasis = _syn_hp.preemphasis
+ apply_preemphasis = _syn_hp.preemphasize
+
+ bits = 9                          # bit depth of signal
+ mu_law = True                     # Recommended to suppress noise if using raw bits in hp.voc_mode
+                                   # below
+
+
+ # WAVERNN / VOCODER --------------------------------------------------------------------------------
+ voc_mode = 'RAW'                  # either 'RAW' (softmax on raw bits) or 'MOL' (sample from
+                                   # mixture of logistics)
+ voc_upsample_factors = (5, 5, 8)  # NB - this needs to correctly factorise hop_length
+ voc_rnn_dims = 512
+ voc_fc_dims = 512
+ voc_compute_dims = 128
+ voc_res_out_dims = 128
+ voc_res_blocks = 10
+
+ # Training
+ voc_batch_size = 100
+ voc_lr = 1e-4
+ voc_gen_at_checkpoint = 5         # number of samples to generate at each checkpoint
+ voc_pad = 2                       # this will pad the input so that the resnet can 'see' wider
+                                   # than input length
+ voc_seq_len = hop_length * 5      # must be a multiple of hop_length
+
+ # Generating / Synthesizing
+ voc_gen_batched = True            # very fast (realtime+) single utterance batched generation
+ voc_target = 8000                 # target number of samples to be generated in each batch entry
+ voc_overlap = 400                 # number of samples for crossfading between batches
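As the comment notes, `voc_upsample_factors` must multiply out to the hop length inherited from the synthesizer (200 samples per the README's audio table, and 5 * 5 * 8 = 200). A quick sanity check, assuming the synthesizer package is importable so this hparams module loads:

```python
import math

from backend.app.vocoder import hparams as hp  # assumed import path

assert math.prod(hp.voc_upsample_factors) == hp.hop_length, \
    "upsample factors must factorise hop_length"
print(hp.voc_upsample_factors, "->", math.prod(hp.voc_upsample_factors), "==", hp.hop_length)
```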
backend/app/vocoder/inference.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .models.fatchord_version import WaveRNN
2
+ from . import hparams as hp
3
+ import torch
4
+
5
+
6
+ _model = None # type: WaveRNN
7
+
8
+ def load_model(weights_fpath, verbose=True):
9
+ global _model, _device
10
+
11
+ if verbose:
12
+ print("Building Wave-RNN")
13
+ _model = WaveRNN(
14
+ rnn_dims=hp.voc_rnn_dims,
15
+ fc_dims=hp.voc_fc_dims,
16
+ bits=hp.bits,
17
+ pad=hp.voc_pad,
18
+ upsample_factors=hp.voc_upsample_factors,
19
+ feat_dims=hp.num_mels,
20
+ compute_dims=hp.voc_compute_dims,
21
+ res_out_dims=hp.voc_res_out_dims,
22
+ res_blocks=hp.voc_res_blocks,
23
+ hop_length=hp.hop_length,
24
+ sample_rate=hp.sample_rate,
25
+ mode=hp.voc_mode
26
+ )
27
+
28
+ if torch.cuda.is_available():
29
+ _model = _model.cuda()
30
+ _device = torch.device('cuda')
31
+ else:
32
+ _device = torch.device('cpu')
33
+
34
+ if verbose:
35
+ print("Loading model weights at %s" % weights_fpath)
36
+ checkpoint = torch.load(weights_fpath, _device)
37
+ _model.load_state_dict(checkpoint['model_state'])
38
+ _model.eval()
39
+
40
+
41
+ def is_loaded():
42
+ return _model is not None
43
+
44
+
45
+ def infer_waveform(mel, normalize=True, batched=True, target=8000, overlap=800,
46
+ progress_callback=None):
47
+ """
48
+ Infers the waveform of a mel spectrogram output by the synthesizer (the format must match
49
+ that of the synthesizer!)
50
+
51
+ :param normalize:
52
+ :param batched:
53
+ :param target:
54
+ :param overlap:
55
+ :return:
56
+ """
57
+ import sys
58
+ if _model is None:
59
+ raise Exception("Please load Wave-RNN in memory before using it")
60
+
61
+ print(f"[Vocoder] Input mel-spectrogram shape: {mel.shape}")
62
+ print(f"[Vocoder] Normalize: {normalize}, Batched: {batched}, Target: {target}, Overlap: {overlap}")
63
+ print(f"[Vocoder] Device: {_device}, Model on: {next(_model.parameters()).device}")
64
+
65
+ try:
66
+ if normalize:
67
+ mel = mel / hp.mel_max_abs_value
68
+ mel = torch.from_numpy(mel[None, ...])
69
+ print(f"[Vocoder] Mel tensor shape after processing: {mel.shape}, dtype: {mel.dtype}")
70
+
71
+ print("[Vocoder] Starting waveform generation (this may take a while on CPU)...")
72
+ sys.stdout.flush()
73
+
74
+ wav = _model.generate(mel, batched, target, overlap, hp.mu_law, progress_callback)
75
+
76
+ print(f"[Vocoder] Waveform generated successfully, shape: {wav.shape}")
77
+ return wav
78
+ except Exception as e:
79
+ print(f"[Vocoder] ✗ Error during vocoding: {e}")
80
+ import traceback
81
+ traceback.print_exc()
82
+ sys.stdout.flush()
83
+ raise
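
A usage sketch for this module (the weights path is an assumption, and the random mel below is only a placeholder for a synthesizer output of shape (num_mels, frames)):

import numpy as np
from app.vocoder import inference as vocoder

vocoder.load_model("models/default/vocoder.pt")      # assumed location of the WaveRNN weights

# Placeholder mel in the synthesizer's symmetric range [-max_abs_value, max_abs_value]
mel = np.random.uniform(-4.0, 4.0, (80, 200)).astype(np.float32)

wav = vocoder.infer_waveform(mel, normalize=True, batched=True,
                             target=8000, overlap=400)
print(wav.shape)                                     # 1-D float waveform at 16 kHz
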
backend/app/vocoder/models/fatchord_version.py ADDED
@@ -0,0 +1,434 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from ..distribution import sample_from_discretized_mix_logistic
5
+ from ..display import *
6
+ from ..audio import *
7
+
8
+
9
+ class ResBlock(nn.Module):
10
+ def __init__(self, dims):
11
+ super().__init__()
12
+ self.conv1 = nn.Conv1d(dims, dims, kernel_size=1, bias=False)
13
+ self.conv2 = nn.Conv1d(dims, dims, kernel_size=1, bias=False)
14
+ self.batch_norm1 = nn.BatchNorm1d(dims)
15
+ self.batch_norm2 = nn.BatchNorm1d(dims)
16
+
17
+ def forward(self, x):
18
+ residual = x
19
+ x = self.conv1(x)
20
+ x = self.batch_norm1(x)
21
+ x = F.relu(x)
22
+ x = self.conv2(x)
23
+ x = self.batch_norm2(x)
24
+ return x + residual
25
+
26
+
27
+ class MelResNet(nn.Module):
28
+ def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad):
29
+ super().__init__()
30
+ k_size = pad * 2 + 1
31
+ self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False)
32
+ self.batch_norm = nn.BatchNorm1d(compute_dims)
33
+ self.layers = nn.ModuleList()
34
+ for i in range(res_blocks):
35
+ self.layers.append(ResBlock(compute_dims))
36
+ self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1)
37
+
38
+ def forward(self, x):
39
+ x = self.conv_in(x)
40
+ x = self.batch_norm(x)
41
+ x = F.relu(x)
42
+ for f in self.layers: x = f(x)
43
+ x = self.conv_out(x)
44
+ return x
45
+
46
+
47
+ class Stretch2d(nn.Module):
48
+ def __init__(self, x_scale, y_scale):
49
+ super().__init__()
50
+ self.x_scale = x_scale
51
+ self.y_scale = y_scale
52
+
53
+ def forward(self, x):
54
+ b, c, h, w = x.size()
55
+ x = x.unsqueeze(-1).unsqueeze(3)
56
+ x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale)
57
+ return x.view(b, c, h * self.y_scale, w * self.x_scale)
58
+
59
+
60
+ class UpsampleNetwork(nn.Module):
61
+ def __init__(self, feat_dims, upsample_scales, compute_dims,
62
+ res_blocks, res_out_dims, pad):
63
+ super().__init__()
64
+ total_scale = np.cumprod(upsample_scales)[-1]
65
+ self.indent = pad * total_scale
66
+ self.resnet = MelResNet(res_blocks, feat_dims, compute_dims, res_out_dims, pad)
67
+ self.resnet_stretch = Stretch2d(total_scale, 1)
68
+ self.up_layers = nn.ModuleList()
69
+ for scale in upsample_scales:
70
+ k_size = (1, scale * 2 + 1)
71
+ padding = (0, scale)
72
+ stretch = Stretch2d(scale, 1)
73
+ conv = nn.Conv2d(1, 1, kernel_size=k_size, padding=padding, bias=False)
74
+ conv.weight.data.fill_(1. / k_size[1])
75
+ self.up_layers.append(stretch)
76
+ self.up_layers.append(conv)
77
+
78
+ def forward(self, m):
79
+ aux = self.resnet(m).unsqueeze(1)
80
+ aux = self.resnet_stretch(aux)
81
+ aux = aux.squeeze(1)
82
+ m = m.unsqueeze(1)
83
+ for f in self.up_layers: m = f(m)
84
+ m = m.squeeze(1)[:, :, self.indent:-self.indent]
85
+ return m.transpose(1, 2), aux.transpose(1, 2)
86
+
87
+
88
+ class WaveRNN(nn.Module):
89
+ def __init__(self, rnn_dims, fc_dims, bits, pad, upsample_factors,
90
+ feat_dims, compute_dims, res_out_dims, res_blocks,
91
+ hop_length, sample_rate, mode='RAW'):
92
+ super().__init__()
93
+ self.mode = mode
94
+ self.pad = pad
95
+ if self.mode == 'RAW' :
96
+ self.n_classes = 2 ** bits
97
+ elif self.mode == 'MOL' :
98
+ self.n_classes = 30
99
+ else :
100
+ raise RuntimeError("Unknown model mode value - ", self.mode)
101
+
102
+ self.rnn_dims = rnn_dims
103
+ self.aux_dims = res_out_dims // 4
104
+ self.hop_length = hop_length
105
+ self.sample_rate = sample_rate
106
+
107
+ self.upsample = UpsampleNetwork(feat_dims, upsample_factors, compute_dims, res_blocks, res_out_dims, pad)
108
+ self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims)
109
+ self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True)
110
+ self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True)
111
+ self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims)
112
+ self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims)
113
+ self.fc3 = nn.Linear(fc_dims, self.n_classes)
114
+
115
+ self.step = nn.Parameter(torch.zeros(1).long(), requires_grad=False)
116
+ self.num_params()
117
+
118
+ def forward(self, x, mels):
119
+ self.step += 1
120
+ bsize = x.size(0)
121
+ if torch.cuda.is_available():
122
+ h1 = torch.zeros(1, bsize, self.rnn_dims).cuda()
123
+ h2 = torch.zeros(1, bsize, self.rnn_dims).cuda()
124
+ else:
125
+ h1 = torch.zeros(1, bsize, self.rnn_dims).cpu()
126
+ h2 = torch.zeros(1, bsize, self.rnn_dims).cpu()
127
+ mels, aux = self.upsample(mels)
128
+
129
+ aux_idx = [self.aux_dims * i for i in range(5)]
130
+ a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
131
+ a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
132
+ a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
133
+ a4 = aux[:, :, aux_idx[3]:aux_idx[4]]
134
+
135
+ x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2)
136
+ x = self.I(x)
137
+ res = x
138
+ x, _ = self.rnn1(x, h1)
139
+
140
+ x = x + res
141
+ res = x
142
+ x = torch.cat([x, a2], dim=2)
143
+ x, _ = self.rnn2(x, h2)
144
+
145
+ x = x + res
146
+ x = torch.cat([x, a3], dim=2)
147
+ x = F.relu(self.fc1(x))
148
+
149
+ x = torch.cat([x, a4], dim=2)
150
+ x = F.relu(self.fc2(x))
151
+ return self.fc3(x)
152
+
153
+ def generate(self, mels, batched, target, overlap, mu_law, progress_callback=None):
154
+ mu_law = mu_law if self.mode == 'RAW' else False
155
+ progress_callback = progress_callback or self.gen_display
156
+
157
+ self.eval()
158
+ output = []
159
+ start = time.time()
160
+ rnn1 = self.get_gru_cell(self.rnn1)
161
+ rnn2 = self.get_gru_cell(self.rnn2)
162
+
163
+ with torch.no_grad():
164
+ if torch.cuda.is_available():
165
+ mels = mels.cuda()
166
+ else:
167
+ mels = mels.cpu()
168
+ wave_len = (mels.size(-1) - 1) * self.hop_length
169
+ mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side='both')
170
+ mels, aux = self.upsample(mels.transpose(1, 2))
171
+
172
+ if batched:
173
+ mels = self.fold_with_overlap(mels, target, overlap)
174
+ aux = self.fold_with_overlap(aux, target, overlap)
175
+
176
+ b_size, seq_len, _ = mels.size()
177
+
178
+ if torch.cuda.is_available():
179
+ h1 = torch.zeros(b_size, self.rnn_dims).cuda()
180
+ h2 = torch.zeros(b_size, self.rnn_dims).cuda()
181
+ x = torch.zeros(b_size, 1).cuda()
182
+ else:
183
+ h1 = torch.zeros(b_size, self.rnn_dims).cpu()
184
+ h2 = torch.zeros(b_size, self.rnn_dims).cpu()
185
+ x = torch.zeros(b_size, 1).cpu()
186
+
187
+ d = self.aux_dims
188
+ aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]
189
+
190
+ for i in range(seq_len):
191
+
192
+ m_t = mels[:, i, :]
193
+
194
+ a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)
195
+
196
+ x = torch.cat([x, m_t, a1_t], dim=1)
197
+ x = self.I(x)
198
+ h1 = rnn1(x, h1)
199
+
200
+ x = x + h1
201
+ inp = torch.cat([x, a2_t], dim=1)
202
+ h2 = rnn2(inp, h2)
203
+
204
+ x = x + h2
205
+ x = torch.cat([x, a3_t], dim=1)
206
+ x = F.relu(self.fc1(x))
207
+
208
+ x = torch.cat([x, a4_t], dim=1)
209
+ x = F.relu(self.fc2(x))
210
+
211
+ logits = self.fc3(x)
212
+
213
+ if self.mode == 'MOL':
214
+ sample = sample_from_discretized_mix_logistic(logits.unsqueeze(0).transpose(1, 2))
215
+ output.append(sample.view(-1))
216
+ if torch.cuda.is_available():
217
+ # x = torch.FloatTensor([[sample]]).cuda()
218
+ x = sample.transpose(0, 1).cuda()
219
+ else:
220
+ x = sample.transpose(0, 1)
221
+
222
+ elif self.mode == 'RAW' :
223
+ posterior = F.softmax(logits, dim=1)
224
+ distrib = torch.distributions.Categorical(posterior)
225
+
226
+ sample = 2 * distrib.sample().float() / (self.n_classes - 1.) - 1.
227
+ output.append(sample)
228
+ x = sample.unsqueeze(-1)
229
+ else:
230
+ raise RuntimeError("Unknown model mode value - ", self.mode)
231
+
232
+ if i % 100 == 0:
233
+ gen_rate = (i + 1) / (time.time() - start) * b_size / 1000
234
+ progress_callback(i, seq_len, b_size, gen_rate)
235
+
236
+ output = torch.stack(output).transpose(0, 1)
237
+ output = output.cpu().numpy()
238
+ output = output.astype(np.float64)
239
+
240
+ if batched:
241
+ output = self.xfade_and_unfold(output, target, overlap)
242
+ else:
243
+ output = output[0]
244
+
245
+ if mu_law:
246
+ output = decode_mu_law(output, self.n_classes, False)
247
+ if hp.apply_preemphasis:
248
+ output = de_emphasis(output)
249
+
250
+ # Fade-out at the end to avoid signal cutting out suddenly
251
+ fade_out = np.linspace(1, 0, 20 * self.hop_length)
252
+ output = output[:wave_len]
253
+ output[-20 * self.hop_length:] *= fade_out
254
+
255
+ self.train()
256
+
257
+ return output
258
+
259
+
260
+ def gen_display(self, i, seq_len, b_size, gen_rate):
261
+ pbar = progbar(i, seq_len)
262
+ msg = f'| {pbar} {i*b_size}/{seq_len*b_size} | Batch Size: {b_size} | Gen Rate: {gen_rate:.1f}kHz | '
263
+ stream(msg)
264
+
265
+ def get_gru_cell(self, gru):
266
+ gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size)
267
+ gru_cell.weight_hh.data = gru.weight_hh_l0.data
268
+ gru_cell.weight_ih.data = gru.weight_ih_l0.data
269
+ gru_cell.bias_hh.data = gru.bias_hh_l0.data
270
+ gru_cell.bias_ih.data = gru.bias_ih_l0.data
271
+ return gru_cell
272
+
273
+ def pad_tensor(self, x, pad, side='both'):
274
+ # NB - this is just a quick method i need right now
275
+ # i.e., it won't generalise to other shapes/dims
276
+ b, t, c = x.size()
277
+ total = t + 2 * pad if side == 'both' else t + pad
278
+ if torch.cuda.is_available():
279
+ padded = torch.zeros(b, total, c).cuda()
280
+ else:
281
+ padded = torch.zeros(b, total, c).cpu()
282
+ if side == 'before' or side == 'both':
283
+ padded[:, pad:pad + t, :] = x
284
+ elif side == 'after':
285
+ padded[:, :t, :] = x
286
+ return padded
287
+
288
+ def fold_with_overlap(self, x, target, overlap):
289
+
290
+ ''' Fold the tensor with overlap for quick batched inference.
291
+ Overlap will be used for crossfading in xfade_and_unfold()
292
+
293
+ Args:
294
+ x (tensor) : Upsampled conditioning features.
295
+ shape=(1, timesteps, features)
296
+ target (int) : Target timesteps for each index of batch
297
+ overlap (int) : Timesteps for both xfade and rnn warmup
298
+
299
+ Return:
300
+ (tensor) : shape=(num_folds, target + 2 * overlap, features)
301
+
302
+ Details:
303
+ x = [[h1, h2, ... hn]]
304
+
305
+ Where each h is a vector of conditioning features
306
+
307
+ Eg: target=2, overlap=1 with x.size(1)=10
308
+
309
+ folded = [[h1, h2, h3, h4],
310
+ [h4, h5, h6, h7],
311
+ [h7, h8, h9, h10]]
312
+ '''
313
+
314
+ _, total_len, features = x.size()
315
+
316
+ # Calculate variables needed
317
+ num_folds = (total_len - overlap) // (target + overlap)
318
+ extended_len = num_folds * (overlap + target) + overlap
319
+ remaining = total_len - extended_len
320
+
321
+ # Pad if some time steps poking out
322
+ if remaining != 0:
323
+ num_folds += 1
324
+ padding = target + 2 * overlap - remaining
325
+ x = self.pad_tensor(x, padding, side='after')
326
+
327
+ if torch.cuda.is_available():
328
+ folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda()
329
+ else:
330
+ folded = torch.zeros(num_folds, target + 2 * overlap, features).cpu()
331
+
332
+ # Get the values for the folded tensor
333
+ for i in range(num_folds):
334
+ start = i * (target + overlap)
335
+ end = start + target + 2 * overlap
336
+ folded[i] = x[:, start:end, :]
337
+
338
+ return folded
339
+
340
+ def xfade_and_unfold(self, y, target, overlap):
341
+
342
+ ''' Applies a crossfade and unfolds into a 1d array.
343
+
344
+ Args:
345
+ y (ndarray) : Batched sequences of audio samples
346
+ shape=(num_folds, target + 2 * overlap)
347
+ dtype=np.float64
348
+ overlap (int) : Timesteps for both xfade and rnn warmup
349
+
350
+ Return:
351
+ (ndarray) : audio samples in a 1d array
352
+ shape=(total_len)
353
+ dtype=np.float64
354
+
355
+ Details:
356
+ y = [[seq1],
357
+ [seq2],
358
+ [seq3]]
359
+
360
+ Apply a gain envelope at both ends of the sequences
361
+
362
+ y = [[seq1_in, seq1_target, seq1_out],
363
+ [seq2_in, seq2_target, seq2_out],
364
+ [seq3_in, seq3_target, seq3_out]]
365
+
366
+ Stagger and add up the groups of samples:
367
+
368
+ [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
369
+
370
+ '''
371
+
372
+ num_folds, length = y.shape
373
+ target = length - 2 * overlap
374
+ total_len = num_folds * (target + overlap) + overlap
375
+
376
+ # Need some silence for the rnn warmup
377
+ silence_len = overlap // 2
378
+ fade_len = overlap - silence_len
379
+ silence = np.zeros((silence_len), dtype=np.float64)
380
+
381
+ # Equal power crossfade
382
+ t = np.linspace(-1, 1, fade_len, dtype=np.float64)
383
+ fade_in = np.sqrt(0.5 * (1 + t))
384
+ fade_out = np.sqrt(0.5 * (1 - t))
385
+
386
+ # Concat the silence to the fades
387
+ fade_in = np.concatenate([silence, fade_in])
388
+ fade_out = np.concatenate([fade_out, silence])
389
+
390
+ # Apply the gain to the overlap samples
391
+ y[:, :overlap] *= fade_in
392
+ y[:, -overlap:] *= fade_out
393
+
394
+ unfolded = np.zeros((total_len), dtype=np.float64)
395
+
396
+ # Loop to add up all the samples
397
+ for i in range(num_folds):
398
+ start = i * (target + overlap)
399
+ end = start + target + 2 * overlap
400
+ unfolded[start:end] += y[i]
401
+
402
+ return unfolded
403
+
404
+ def get_step(self) :
405
+ return self.step.data.item()
406
+
407
+ def checkpoint(self, model_dir, optimizer) :
408
+ k_steps = self.get_step() // 1000
409
+ self.save(model_dir.joinpath("checkpoint_%dk_steps.pt" % k_steps), optimizer)
410
+
411
+ def log(self, path, msg) :
412
+ with open(path, 'a') as f:
413
+ print(msg, file=f)
414
+
415
+ def load(self, path, optimizer) :
416
+ checkpoint = torch.load(path)
417
+ if "optimizer_state" in checkpoint:
418
+ self.load_state_dict(checkpoint["model_state"])
419
+ optimizer.load_state_dict(checkpoint["optimizer_state"])
420
+ else:
421
+ # Backwards compatibility
422
+ self.load_state_dict(checkpoint)
423
+
424
+ def save(self, path, optimizer) :
425
+ torch.save({
426
+ "model_state": self.state_dict(),
427
+ "optimizer_state": optimizer.state_dict(),
428
+ }, path)
429
+
430
+ def num_params(self, print_out=True):
431
+ parameters = filter(lambda p: p.requires_grad, self.parameters())
432
+ parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
433
+ if print_out :
434
+ print('Trainable Parameters: %.3fM' % parameters)
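
The batching scheme in generate() relies on fold_with_overlap() and xfade_and_unfold(); the arithmetic can be checked standalone with the numbers from the fold_with_overlap() docstring (target=2, overlap=1, 10 timesteps fold into three windows of length 4):

# Standalone illustration of the windowing used by fold_with_overlap().
target, overlap, total_len = 2, 1, 10

num_folds = (total_len - overlap) // (target + overlap)    # 3
extended_len = num_folds * (overlap + target) + overlap    # 10 -> nothing left over, no padding
assert total_len - extended_len == 0

windows = [(i * (target + overlap), i * (target + overlap) + target + 2 * overlap)
           for i in range(num_folds)]
print(windows)   # [(0, 4), (3, 7), (6, 10)]  i.e. [h1..h4], [h4..h7], [h7..h10]
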
backend/app/voice_cloning.py ADDED
@@ -0,0 +1,108 @@
1
+ """Core voice cloning logic shared by the API routes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import shutil
6
+ import gc
7
+ import torch
8
+ from pathlib import Path
9
+ from typing import Dict, Tuple
10
+
11
+ import numpy as np
12
+ import soundfile as sf
13
+ from huggingface_hub import hf_hub_download
14
+
15
+ from encoder import inference as encoder_infer
16
+ from synthesizer import inference as synthesizer_infer
17
+ from synthesizer.hparams import hparams as syn_hp
18
+ from app.vocoder import inference as vocoder_infer
19
+
20
+
21
+ MODEL_SPECS: Dict[str, Tuple[str, str]] = {
22
+ "encoder.pt": ("AJ50/voice-clone-encoder", "encoder.pt"),
23
+ "synthesizer.pt": ("AJ50/voice-clone-synthesizer", "synthesizer.pt"),
24
+ "vocoder.pt": ("AJ50/voice-clone-vocoder", "vocoder.pt"),
25
+ }
26
+
27
+
28
+ def ensure_default_models(models_dir: Path) -> None:
29
+ """Download the required pretrained weights if they are missing."""
30
+
31
+ target_dir = models_dir / "default"
32
+ target_dir.mkdir(parents=True, exist_ok=True)
33
+
34
+ for filename, (repo_id, repo_filename) in MODEL_SPECS.items():
35
+ destination = target_dir / filename
36
+ if destination.exists():
37
+ continue
38
+
39
+ print(f"[Models] Downloading {filename} from {repo_id}...")
40
+ downloaded_path = Path(
41
+ hf_hub_download(repo_id=repo_id, filename=repo_filename)
42
+ )
43
+ shutil.copy2(downloaded_path, destination)
44
+ print(f"[Models] Saved to {destination}")
45
+
46
+
47
+ def synthesize(voice_path: Path, text: str, models_dir: Path, out_path: Path) -> Path:
48
+ """Run end-to-end voice cloning and return the generated audio path."""
49
+
50
+ ensure_default_models(models_dir)
51
+
52
+ enc_path = models_dir / "default" / "encoder.pt"
53
+ syn_path = models_dir / "default" / "synthesizer.pt"
54
+ voc_path = models_dir / "default" / "vocoder.pt"
55
+
56
+ for model_path in (enc_path, syn_path, voc_path):
57
+ if not model_path.exists():
58
+ raise RuntimeError(f"Model file missing: {model_path}")
59
+
60
+ print("[VoiceCloning] Loading encoder...")
61
+ encoder_infer.load_model(enc_path)
62
+ print("[VoiceCloning] Loading synthesizer...")
63
+ synthesizer = synthesizer_infer.Synthesizer(syn_path)
64
+ print("[VoiceCloning] Loading vocoder...")
65
+ vocoder_infer.load_model(voc_path)
66
+
67
+ if not voice_path.exists():
68
+ raise RuntimeError(f"Reference voice file not found: {voice_path}")
69
+
70
+ print("[VoiceCloning] Preprocessing reference audio...")
71
+ wav = encoder_infer.preprocess_wav(voice_path)
72
+ embed = encoder_infer.embed_utterance(wav)
73
+
74
+ print("[VoiceCloning] Generating mel-spectrogram...")
75
+ mels = synthesizer.synthesize_spectrograms([text], [embed])
76
+ mel = mels[0]
77
+
78
+ print("[VoiceCloning] Vocoding waveform...")
79
+ try:
80
+ waveform = synthesizer.griffin_lim(mel).astype(np.float32)
81
+ except Exception:
82
+ waveform = vocoder_infer.infer_waveform(
83
+ mel, normalize=True, batched=False, target=8000, overlap=800
84
+ ).astype(np.float32)
85
+
86
+ out_path.parent.mkdir(parents=True, exist_ok=True)
87
+ sf.write(out_path.as_posix(), waveform, syn_hp.sample_rate)
88
+ print(f"[VoiceCloning] Audio saved to {out_path}")
89
+
90
+ # Memory optimization for Render free tier
91
+ print("[VoiceCloning] Cleaning up models to free memory...")
92
+ try:
93
+ # Clear model caches
94
+ if hasattr(encoder_infer, '_model'):
95
+ encoder_infer._model = None
96
+ if hasattr(synthesizer_infer, '_model'):
97
+ synthesizer_infer._model = None
98
+ if hasattr(vocoder_infer, '_model'):
99
+ vocoder_infer._model = None
100
+
101
+ # Force garbage collection
102
+ gc.collect()
103
+ if torch.cuda.is_available():
104
+ torch.cuda.empty_cache()
105
+ except Exception as e:
106
+ print(f"[VoiceCloning] Warning during cleanup: {e}")
107
+
108
+ return out_path
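
End-to-end usage sketch for synthesize() (the paths are assumptions; the weights are fetched on first use by ensure_default_models, and out_path's parent directory is created automatically):

from pathlib import Path
from app.voice_cloning import synthesize

out = synthesize(
    voice_path=Path("enrolled_voices/voice_26bfa1ef.mp3"),   # reference speaker clip
    text="Hello, this is a cloned voice speaking.",
    models_dir=Path("models"),                               # weights end up in models/default/
    out_path=Path("outputs/cloned.wav"),
)
print("wrote", out)
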
backend/download_models.py ADDED
@@ -0,0 +1,54 @@
1
+ """
2
+ Download models from HuggingFace on startup
3
+ Run this once or on container startup for Render
4
+ """
5
+
6
+ from pathlib import Path
7
+ from huggingface_hub import hf_hub_download
8
+ import shutil
9
+ import sys
10
+
11
+ MODEL_SPECS = {
12
+ "encoder.pt": ("AJ50/voice-clone-encoder", "encoder.pt"),
13
+ "synthesizer.pt": ("AJ50/voice-clone-synthesizer", "synthesizer.pt"),
14
+ "vocoder.pt": ("AJ50/voice-clone-vocoder", "vocoder.pt"),
15
+ }
16
+
17
+ def download_models(models_dir: Path) -> bool:
18
+ """Download required models from HuggingFace if missing"""
19
+
20
+ target_dir = models_dir / "default"
21
+ target_dir.mkdir(parents=True, exist_ok=True)
22
+
23
+ print(f"[Models] Target directory: {target_dir}")
24
+
25
+ for filename, (repo_id, repo_filename) in MODEL_SPECS.items():
26
+ destination = target_dir / filename
27
+
28
+ # Skip if already exists
29
+ if destination.exists():
30
+ size_mb = destination.stat().st_size / (1024 * 1024)
31
+ print(f"✓ {filename} already exists ({size_mb:.1f} MB)")
32
+ continue
33
+
34
+ print(f"[Models] Downloading {filename} from {repo_id}...")
35
+ try:
36
+ downloaded_path = Path(
37
+ hf_hub_download(repo_id=repo_id, filename=repo_filename)
38
+ )
39
+ shutil.copy2(downloaded_path, destination)
40
+ size_mb = destination.stat().st_size / (1024 * 1024)
41
+ print(f"✓ Saved {filename} ({size_mb:.1f} MB) to {destination}")
42
+ except Exception as e:
43
+ print(f"✗ Failed to download {filename}: {e}")
44
+ return False
45
+
46
+ print("[Models] All models downloaded successfully!")
47
+ return True
48
+
49
+ if __name__ == "__main__":
50
+ backend_dir = Path(__file__).parent
51
+ models_dir = backend_dir / "models"
52
+
53
+ success = download_models(models_dir)
54
+ sys.exit(0 if success else 1)
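
The same downloader can also be called programmatically, for example from a deployment hook; a sketch assuming it runs from the backend/ directory:

from pathlib import Path
from download_models import download_models

ok = download_models(Path("models"))   # fills models/default/{encoder,synthesizer,vocoder}.pt
print("models ready" if ok else "download failed")
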
backend/encoder/__init__.py ADDED
File without changes
backend/encoder/audio.py ADDED
@@ -0,0 +1,117 @@
1
+ from scipy.ndimage import binary_dilation
2
+ from encoder.params_data import *
3
+ from pathlib import Path
4
+ from typing import Optional, Union
5
+ from warnings import warn
6
+ import numpy as np
7
+ import librosa
8
+ import struct
9
+
10
+ try:
11
+ import webrtcvad
12
+ except Exception:
13
+ warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
14
+ webrtcvad=None
15
+
16
+ int16_max = (2 ** 15) - 1
17
+
18
+
19
+ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
20
+ source_sr: Optional[int] = None,
21
+ normalize: Optional[bool] = True,
22
+ trim_silence: Optional[bool] = True):
23
+ """
24
+ Applies the preprocessing operations used in training the Speaker Encoder to a waveform
25
+ either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
26
+
27
+ :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
28
+ just .wav), or the waveform as a numpy array of floats.
29
+ :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
30
+ preprocessing. After preprocessing, the waveform's sampling rate will match the data
31
+ hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
32
+ this argument will be ignored.
33
+ """
34
+ # Load the wav from disk if needed
35
+ if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
36
+ wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
37
+ else:
38
+ wav = fpath_or_wav
39
+
40
+ # Resample the wav if needed
41
+ if source_sr is not None and source_sr != sampling_rate:
42
+ wav = librosa.resample(y=wav, orig_sr=source_sr, target_sr=sampling_rate)
43
+
44
+ # Apply the preprocessing: normalize volume and shorten long silences
45
+ if normalize:
46
+ wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
47
+ if webrtcvad and trim_silence:
48
+ wav = trim_long_silences(wav)
49
+
50
+ return wav
51
+
52
+
53
+ def wav_to_mel_spectrogram(wav):
54
+ """
55
+ Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
56
+ Note: this is not a log-mel spectrogram.
57
+ """
58
+ frames = librosa.feature.melspectrogram(
59
+ y=wav,
60
+ sr=sampling_rate,
61
+ n_fft=int(sampling_rate * mel_window_length / 1000),
62
+ hop_length=int(sampling_rate * mel_window_step / 1000),
63
+ n_mels=mel_n_channels
64
+ )
65
+ return frames.astype(np.float32).T
66
+
67
+
68
+ def trim_long_silences(wav):
69
+ """
70
+ Ensures that segments without voice in the waveform remain no longer than a
71
+ threshold determined by the VAD parameters in params.py.
72
+
73
+ :param wav: the raw waveform as a numpy array of floats
74
+ :return: the same waveform with silences trimmed away (length <= original wav length)
75
+ """
76
+ # Compute the voice detection window size
77
+ samples_per_window = (vad_window_length * sampling_rate) // 1000
78
+
79
+ # Trim the end of the audio to have a multiple of the window size
80
+ wav = wav[:len(wav) - (len(wav) % samples_per_window)]
81
+
82
+ # Convert the float waveform to 16-bit mono PCM
83
+ pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
84
+
85
+ # Perform voice activation detection
86
+ voice_flags = []
87
+ vad = webrtcvad.Vad(mode=3)
88
+ for window_start in range(0, len(wav), samples_per_window):
89
+ window_end = window_start + samples_per_window
90
+ voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
91
+ sample_rate=sampling_rate))
92
+ voice_flags = np.array(voice_flags)
93
+
94
+ # Smooth the voice detection with a moving average
95
+ def moving_average(array, width):
96
+ array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
97
+ ret = np.cumsum(array_padded, dtype=float)
98
+ ret[width:] = ret[width:] - ret[:-width]
99
+ return ret[width - 1:] / width
100
+
101
+ audio_mask = moving_average(voice_flags, vad_moving_average_width)
102
+ audio_mask = np.round(audio_mask).astype(bool)
103
+
104
+ # Dilate the voiced regions
105
+ audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
106
+ audio_mask = np.repeat(audio_mask, samples_per_window)
107
+
108
+ return wav[audio_mask == True]
109
+
110
+
111
+ def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
112
+ if increase_only and decrease_only:
113
+ raise ValueError("Both increase only and decrease only are set")
114
+ dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
115
+ if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
116
+ return wav
117
+ return wav * (10 ** (dBFS_change / 20))
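
Sketch of the preprocessing chain above on one enrolled clip (the path is an assumption): load, resample to 16 kHz, normalize volume, trim long silences, then derive the 40-channel mel frames the encoder consumes:

from encoder.audio import preprocess_wav, wav_to_mel_spectrogram

wav = preprocess_wav("enrolled_voices/voice_26bfa1ef.mp3")   # float waveform at 16 kHz
frames = wav_to_mel_spectrogram(wav)                         # shape (n_frames, 40)
print(wav.shape, frames.shape)
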
backend/encoder/inference.py ADDED
@@ -0,0 +1,178 @@
1
+ from encoder.params_data import *
2
+ from encoder.model import SpeakerEncoder
3
+ from encoder.audio import preprocess_wav # We want to expose this function from here
4
+ from matplotlib import cm
5
+ from encoder import audio
6
+ from pathlib import Path
7
+ import numpy as np
8
+ import torch
9
+
10
+ _model = None # type: SpeakerEncoder
11
+ _device = None # type: torch.device
12
+
13
+
14
+ def load_model(weights_fpath: Path, device=None):
15
+ """
16
+ Loads the model in memory. If this function is not explicitly called, it will be run on the
17
+ first call to embed_frames() with the default weights file.
18
+
19
+ :param weights_fpath: the path to saved model weights.
20
+ :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
21
+ model will be loaded and will run on this device. Outputs will however always be on the cpu.
22
+ If None, will default to your GPU if it's available, otherwise your CPU.
23
+ """
24
+ # TODO: I think the slow loading of the encoder might have something to do with the device it
25
+ # was saved on. Worth investigating.
26
+ global _model, _device
27
+ if device is None:
28
+ _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
29
+ else:
30
+ _device = torch.device(device)
31
+ _model = SpeakerEncoder(_device, torch.device("cpu"))
32
+ checkpoint = torch.load(weights_fpath, _device)
33
+ _model.load_state_dict(checkpoint["model_state"])
34
+ _model.eval()
35
+ print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
36
+
37
+
38
+ def is_loaded():
39
+ return _model is not None
40
+
41
+
42
+ def embed_frames_batch(frames_batch):
43
+ """
44
+ Computes embeddings for a batch of mel spectrograms.
45
+
46
+ :param frames_batch: a batch of mel spectrograms as a numpy array of float32 of shape
47
+ (batch_size, n_frames, n_channels)
48
+ :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
49
+ """
50
+ if _model is None:
51
+ raise Exception("Model was not loaded. Call load_model() before inference.")
52
+
53
+ frames = torch.from_numpy(frames_batch).to(_device)
54
+ embed = _model.forward(frames).detach().cpu().numpy()
55
+ return embed
56
+
57
+
58
+ def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
59
+ min_pad_coverage=0.75, overlap=0.5):
60
+ """
61
+ Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
62
+ partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
63
+ spectrogram slices are returned, so as to make each partial utterance waveform correspond to
64
+ its spectrogram. This function assumes that the mel spectrogram parameters used are those
65
+ defined in params_data.py.
66
+
67
+ The returned ranges may index past the end of the waveform. It is
68
+ recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
69
+
70
+ :param n_samples: the number of samples in the waveform
71
+ :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
72
+ utterance
73
+ :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
74
+ enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
75
+ then the last partial utterance will be considered, as if we padded the audio. Otherwise,
76
+ it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
77
+ utterance, this parameter is ignored so that the function always returns at least 1 slice.
78
+ :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
79
+ utterances are entirely disjoint.
80
+ :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
81
+ respectively the waveform and the mel spectrogram with these slices to obtain the partial
82
+ utterances.
83
+ """
84
+ assert 0 <= overlap < 1
85
+ assert 0 < min_pad_coverage <= 1
86
+
87
+ samples_per_frame = int((sampling_rate * mel_window_step / 1000))
88
+ n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
89
+ frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
90
+
91
+ # Compute the slices
92
+ wav_slices, mel_slices = [], []
93
+ steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
94
+ for i in range(0, steps, frame_step):
95
+ mel_range = np.array([i, i + partial_utterance_n_frames])
96
+ wav_range = mel_range * samples_per_frame
97
+ mel_slices.append(slice(*mel_range))
98
+ wav_slices.append(slice(*wav_range))
99
+
100
+ # Evaluate whether extra padding is warranted or not
101
+ last_wav_range = wav_slices[-1]
102
+ coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
103
+ if coverage < min_pad_coverage and len(mel_slices) > 1:
104
+ mel_slices = mel_slices[:-1]
105
+ wav_slices = wav_slices[:-1]
106
+
107
+ return wav_slices, mel_slices
108
+
109
+
110
+ def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
111
+ """
112
+ Computes an embedding for a single utterance.
113
+
114
+ # TODO: handle multiple wavs to benefit from batching on GPU
115
+ :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
116
+ :param using_partials: if True, then the utterance is split in partial utterances of
117
+ <partial_utterance_n_frames> frames and the utterance embedding is computed from their
118
+ normalized average. If False, the utterance is instead computed from feeding the entire
119
+ spectrogram to the network.
120
+ :param return_partials: if True, the partial embeddings will also be returned along with the
121
+ wav slices that correspond to the partial embeddings.
122
+ :param kwargs: additional arguments to compute_partial_slices()
123
+ :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
124
+ <return_partials> is True, the partial utterances as a numpy array of float32 of shape
125
+ (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
126
+ returned. If <using_partials> is simultaneously set to False, both these values will be None
127
+ instead.
128
+ """
129
+ # Process the entire utterance if not using partials
130
+ if not using_partials:
131
+ frames = audio.wav_to_mel_spectrogram(wav)
132
+ embed = embed_frames_batch(frames[None, ...])[0]
133
+ if return_partials:
134
+ return embed, None, None
135
+ return embed
136
+
137
+ # Compute where to split the utterance into partials and pad if necessary
138
+ wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
139
+ max_wave_length = wave_slices[-1].stop
140
+ if max_wave_length >= len(wav):
141
+ wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
142
+
143
+ # Split the utterance into partials
144
+ frames = audio.wav_to_mel_spectrogram(wav)
145
+ frames_batch = np.array([frames[s] for s in mel_slices])
146
+ partial_embeds = embed_frames_batch(frames_batch)
147
+
148
+ # Compute the utterance embedding from the partial embeddings
149
+ raw_embed = np.mean(partial_embeds, axis=0)
150
+ embed = raw_embed / np.linalg.norm(raw_embed, 2)
151
+
152
+ if return_partials:
153
+ return embed, partial_embeds, wave_slices
154
+ return embed
155
+
156
+
157
+ def embed_speaker(wavs, **kwargs):
158
+ raise NotImplementedError()
159
+
160
+
161
+ def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
162
+ import matplotlib.pyplot as plt
163
+ if ax is None:
164
+ ax = plt.gca()
165
+
166
+ if shape is None:
167
+ height = int(np.sqrt(len(embed)))
168
+ shape = (height, -1)
169
+ embed = embed.reshape(shape)
170
+
171
+ cmap = cm.get_cmap()
172
+ mappable = ax.imshow(embed, cmap=cmap)
173
+ cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
174
+ sm = cm.ScalarMappable(cmap=cmap)
175
+ sm.set_clim(*color_range)
176
+
177
+ ax.set_xticks([]), ax.set_yticks([])
178
+ ax.set_title(title)
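
Typical embedding flow with this module (the weights path is an assumption; the clip is one of the enrolled voices included in this commit):

from pathlib import Path
from encoder import inference as encoder

encoder.load_model(Path("models/default/encoder.pt"))

wav = encoder.preprocess_wav("enrolled_voices/voice_26bfa1ef.mp3")
embed = encoder.embed_utterance(wav)            # L2-normalized, shape (256,)
print(embed.shape, float((embed ** 2).sum()))   # squared norm is ~1.0
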
backend/encoder/model.py ADDED
@@ -0,0 +1,135 @@
1
+ from encoder.params_model import *
2
+ from encoder.params_data import *
3
+ from scipy.interpolate import interp1d
4
+ from sklearn.metrics import roc_curve
5
+ from torch.nn.utils import clip_grad_norm_
6
+ from scipy.optimize import brentq
7
+ from torch import nn
8
+ import numpy as np
9
+ import torch
10
+
11
+
12
+ class SpeakerEncoder(nn.Module):
13
+ def __init__(self, device, loss_device):
14
+ super().__init__()
15
+ self.loss_device = loss_device
16
+
17
+ # Network definition
18
+ self.lstm = nn.LSTM(input_size=mel_n_channels,
19
+ hidden_size=model_hidden_size,
20
+ num_layers=model_num_layers,
21
+ batch_first=True).to(device)
22
+ self.linear = nn.Linear(in_features=model_hidden_size,
23
+ out_features=model_embedding_size).to(device)
24
+ self.relu = torch.nn.ReLU().to(device)
25
+
26
+ # Cosine similarity scaling (with fixed initial parameter values)
27
+ self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
28
+ self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
29
+
30
+ # Loss
31
+ self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
32
+
33
+ def do_gradient_ops(self):
34
+ # Gradient scale
35
+ self.similarity_weight.grad *= 0.01
36
+ self.similarity_bias.grad *= 0.01
37
+
38
+ # Gradient clipping
39
+ clip_grad_norm_(self.parameters(), 3, norm_type=2)
40
+
41
+ def forward(self, utterances, hidden_init=None):
42
+ """
43
+ Computes the embeddings of a batch of utterance spectrograms.
44
+
45
+ :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
46
+ (batch_size, n_frames, n_channels)
47
+ :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
48
+ batch_size, hidden_size). Will default to a tensor of zeros if None.
49
+ :return: the embeddings as a tensor of shape (batch_size, embedding_size)
50
+ """
51
+ # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
52
+ # and the final cell state.
53
+ out, (hidden, cell) = self.lstm(utterances, hidden_init)
54
+
55
+ # We take only the hidden state of the last layer
56
+ embeds_raw = self.relu(self.linear(hidden[-1]))
57
+
58
+ # L2-normalize it
59
+ embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)
60
+
61
+ return embeds
62
+
63
+ def similarity_matrix(self, embeds):
64
+ """
65
+ Computes the similarity matrix according to section 2.1 of GE2E.
66
+
67
+ :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
68
+ utterances_per_speaker, embedding_size)
69
+ :return: the similarity matrix as a tensor of shape (speakers_per_batch,
70
+ utterances_per_speaker, speakers_per_batch)
71
+ """
72
+ speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
73
+
74
+ # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
75
+ centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
76
+ centroids_incl = centroids_incl.clone() / (torch.norm(centroids_incl, dim=2, keepdim=True) + 1e-5)
77
+
78
+ # Exclusive centroids (1 per utterance)
79
+ centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
80
+ centroids_excl /= (utterances_per_speaker - 1)
81
+ centroids_excl = centroids_excl.clone() / (torch.norm(centroids_excl, dim=2, keepdim=True) + 1e-5)
82
+
83
+ # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
84
+ # product of these vectors (which is just an element-wise multiplication reduced by a sum).
85
+ # We vectorize the computation for efficiency.
86
+ sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
87
+ speakers_per_batch).to(self.loss_device)
88
+ mask_matrix = 1 - np.eye(speakers_per_batch, dtype=int)
89
+ for j in range(speakers_per_batch):
90
+ mask = np.where(mask_matrix[j])[0]
91
+ sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
92
+ sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)
93
+
94
+ ## Even more vectorized version (slower maybe because of transpose)
95
+ # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
96
+ # ).to(self.loss_device)
97
+ # eye = np.eye(speakers_per_batch, dtype=np.int)
98
+ # mask = np.where(1 - eye)
99
+ # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
100
+ # mask = np.where(eye)
101
+ # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
102
+ # sim_matrix2 = sim_matrix2.transpose(1, 2)
103
+
104
+ sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
105
+ return sim_matrix
106
+
107
+ def loss(self, embeds):
108
+ """
109
+ Computes the softmax loss according to section 2.1 of GE2E.
110
+
111
+ :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
112
+ utterances_per_speaker, embedding_size)
113
+ :return: the loss and the EER for this batch of embeddings.
114
+ """
115
+ speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
116
+
117
+ # Loss
118
+ sim_matrix = self.similarity_matrix(embeds)
119
+ sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
120
+ speakers_per_batch))
121
+ ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
122
+ target = torch.from_numpy(ground_truth).long().to(self.loss_device)
123
+ loss = self.loss_fn(sim_matrix, target)
124
+
125
+ # EER (not backpropagated)
126
+ with torch.no_grad():
127
+ inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=int)[0]
128
+ labels = np.array([inv_argmax(i) for i in ground_truth])
129
+ preds = sim_matrix.detach().cpu().numpy()
130
+
131
+ # Snippet from https://yangcha.github.io/EER-ROC/
132
+ fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
133
+ eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
134
+
135
+ return loss, eer
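
The similarity matrix and loss are training-time code; a quick CPU-only shape check with random, L2-normalized embeddings (batch sizes are arbitrary):

import torch
from encoder.model import SpeakerEncoder

device = torch.device("cpu")
model = SpeakerEncoder(device, loss_device=device)

speakers, utterances, dim = 4, 5, 256            # dim matches model_embedding_size
embeds = torch.randn(speakers, utterances, dim)
embeds = embeds / embeds.norm(dim=2, keepdim=True)

sim = model.similarity_matrix(embeds)            # (speakers, utterances, speakers)
loss, eer = model.loss(embeds)
print(sim.shape, float(loss), eer)
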
backend/encoder/params_data.py ADDED
@@ -0,0 +1,29 @@
1
+
2
+ ## Mel-filterbank
3
+ mel_window_length = 25 # In milliseconds
4
+ mel_window_step = 10 # In milliseconds
5
+ mel_n_channels = 40
6
+
7
+
8
+ ## Audio
9
+ sampling_rate = 16000
10
+ # Number of spectrogram frames in a partial utterance
11
+ partials_n_frames = 160 # 1600 ms
12
+ # Number of spectrogram frames at inference
13
+ inference_n_frames = 80 # 800 ms
14
+
15
+
16
+ ## Voice Activation Detection
17
+ # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
18
+ # This sets the granularity of the VAD. Should not need to be changed.
19
+ vad_window_length = 30 # In milliseconds
20
+ # Number of frames to average together when performing the moving average smoothing.
21
+ # The larger this value, the larger the VAD variations must be to not get smoothed out.
22
+ vad_moving_average_width = 8
23
+ # Maximum number of consecutive silent frames a segment can have.
24
+ vad_max_silence_length = 6
25
+
26
+
27
+ ## Audio volume normalization
28
+ audio_norm_target_dBFS = -30
29
+
backend/encoder/params_model.py ADDED
@@ -0,0 +1,11 @@
1
+
2
+ ## Model parameters
3
+ model_hidden_size = 256
4
+ model_embedding_size = 256
5
+ model_num_layers = 3
6
+
7
+
8
+ ## Training parameters
9
+ learning_rate_init = 1e-4
10
+ speakers_per_batch = 64
11
+ utterances_per_speaker = 10
backend/enrolled_voices/voice_26bfa1ef.mp3 ADDED
Binary file (20.6 kB).
 
backend/enrolled_voices/voice_72beeda9.mp3 ADDED
Binary file (20.6 kB).
 
backend/enrolled_voices/voices.json ADDED
@@ -0,0 +1,100 @@
1
+ [
2
+ {
3
+ "id": "voice_705f524b",
4
+ "name": "Pragyan",
5
+ "filename": "voice_705f524b.wav",
6
+ "filepath": "enrolled_voices\\voice_705f524b.wav",
7
+ "createdAt": "2025-11-05T11:15:58.834934"
8
+ },
9
+ {
10
+ "id": "voice_5b7e198d",
11
+ "name": "Pragyan",
12
+ "filename": "voice_5b7e198d.wav",
13
+ "filepath": "enrolled_voices\\voice_5b7e198d.wav",
14
+ "createdAt": "2025-11-05T11:23:18.943413"
15
+ },
16
+ {
17
+ "id": "voice_e0a7c06e",
18
+ "name": "Pragyan",
19
+ "filename": "voice_e0a7c06e.mp3",
20
+ "filepath": "enrolled_voices\\voice_e0a7c06e.mp3",
21
+ "createdAt": "2025-11-05T11:31:33.094765"
22
+ },
23
+ {
24
+ "id": "voice_7d278c5f",
25
+ "name": "mY",
26
+ "filename": "voice_7d278c5f.mp3",
27
+ "filepath": "enrolled_voices\\voice_7d278c5f.mp3",
28
+ "createdAt": "2025-11-05T11:49:35.933861"
29
+ },
30
+ {
31
+ "id": "voice_44c22d65",
32
+ "name": "My1",
33
+ "filename": "voice_44c22d65.mp3",
34
+ "filepath": "enrolled_voices\\voice_44c22d65.mp3",
35
+ "createdAt": "2025-11-05T11:49:52.844973"
36
+ },
37
+ {
38
+ "id": "voice_eb54f62d",
39
+ "name": "MY2",
40
+ "filename": "voice_eb54f62d.mp3",
41
+ "filepath": "enrolled_voices\\voice_eb54f62d.mp3",
42
+ "createdAt": "2025-11-05T11:50:13.886497"
43
+ },
44
+ {
45
+ "id": "voice_ecb824ec",
46
+ "name": "Monu",
47
+ "filename": "voice_ecb824ec.wav",
48
+ "filepath": "enrolled_voices\\voice_ecb824ec.wav",
49
+ "createdAt": "2025-11-06T10:28:22.279407"
50
+ },
51
+ {
52
+ "id": "voice_0adf8594",
53
+ "name": "Pragyan1",
54
+ "filename": "voice_0adf8594.wav",
55
+ "filepath": "enrolled_voices\\voice_0adf8594.wav",
56
+ "createdAt": "2025-11-06T14:22:06.737234"
57
+ },
58
+ {
59
+ "id": "voice_fd577924",
60
+ "name": "MY3",
61
+ "filename": "voice_fd577924.wav",
62
+ "filepath": "enrolled_voices\\voice_fd577924.wav",
63
+ "createdAt": "2025-11-20T15:15:40.488404"
64
+ },
65
+ {
66
+ "id": "voice_a51275b7",
67
+ "name": "Testing Voice",
68
+ "filename": "voice_a51275b7.wav",
69
+ "filepath": "enrolled_voices\\voice_a51275b7.wav",
70
+ "createdAt": "2025-11-20T15:23:43.665441"
71
+ },
72
+ {
73
+ "id": "voice_ea85f251",
74
+ "name": "test",
75
+ "filename": "voice_ea85f251.wav",
76
+ "filepath": "enrolled_voices\\voice_ea85f251.wav",
77
+ "createdAt": "2025-11-25T09:47:22.148753"
78
+ },
79
+ {
80
+ "id": "voice_a4e34f00",
81
+ "name": "Class",
82
+ "filename": "voice_a4e34f00.wav",
83
+ "filepath": "enrolled_voices\\voice_a4e34f00.wav",
84
+ "createdAt": "2025-11-25T10:32:08.525704"
85
+ },
86
+ {
87
+ "id": "voice_26bfa1ef",
88
+ "name": "Saksham voice",
89
+ "filename": "voice_26bfa1ef.mp3",
90
+ "filepath": "E:\\Sem 5\\mini proejct main\\pragyan branch\\backend\\enrolled_voices\\voice_26bfa1ef.mp3",
91
+ "createdAt": "2025-11-28T11:08:59.773738"
92
+ },
93
+ {
94
+ "id": "voice_72beeda9",
95
+ "name": "Saksham voice",
96
+ "filename": "voice_72beeda9.mp3",
97
+ "filepath": "E:\\Sem 5\\mini proejct main\\pragyan branch\\backend\\enrolled_voices\\voice_72beeda9.mp3",
98
+ "createdAt": "2025-11-28T11:16:33.409663"
99
+ }
100
+ ]
backend/requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ flask==2.3.3
2
+ flask-cors==4.0.0
3
+ gunicorn==21.2.0
4
+ torch>=2.5.0
5
+ librosa>=0.10.0
6
+ soundfile>=0.12.0
7
+ numpy>=1.21.0
8
+ huggingface_hub>=0.19.0
9
+ matplotlib>=3.5.0
10
+ webrtcvad==2.0.10
11
+ scipy>=1.6.0
12
+ scikit-learn>=1.1.0
13
+ unidecode>=1.2.0
14
+ inflect>=6.0.0
backend/runtime.txt ADDED
@@ -0,0 +1 @@
1
+ python-3.10.0
backend/synthesizer/__init__.py ADDED
@@ -0,0 +1 @@
1
+ #
backend/synthesizer/audio.py ADDED
@@ -0,0 +1,211 @@
1
+ import librosa
2
+ import librosa.filters
3
+ import numpy as np
4
+ from scipy import signal
5
+ from scipy.io import wavfile
6
+ import soundfile as sf
7
+
8
+
9
+ def load_wav(path, sr):
10
+ return librosa.core.load(path, sr=sr)[0]
11
+
12
+ def save_wav(wav, path, sr):
13
+ wav *= 32767 / max(0.01, np.max(np.abs(wav)))
14
+ #proposed by @dsmiller
15
+ wavfile.write(path, sr, wav.astype(np.int16))
16
+
17
+ def save_wavenet_wav(wav, path, sr):
18
+ sf.write(path, wav.astype(np.float32), sr)
19
+
20
+ def preemphasis(wav, k, preemphasize=True):
21
+ if preemphasize:
22
+ return signal.lfilter([1, -k], [1], wav)
23
+ return wav
24
+
25
+ def inv_preemphasis(wav, k, inv_preemphasize=True):
26
+ if inv_preemphasize:
27
+ return signal.lfilter([1], [1, -k], wav)
28
+ return wav
29
+
30
+ #From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
31
+ def start_and_end_indices(quantized, silence_threshold=2):
32
+ for start in range(quantized.size):
33
+ if abs(quantized[start] - 127) > silence_threshold:
34
+ break
35
+ for end in range(quantized.size - 1, 1, -1):
36
+ if abs(quantized[end] - 127) > silence_threshold:
37
+ break
38
+
39
+ assert abs(quantized[start] - 127) > silence_threshold
40
+ assert abs(quantized[end] - 127) > silence_threshold
41
+
42
+ return start, end
43
+
44
+ def get_hop_size(hparams):
45
+ hop_size = hparams.hop_size
46
+ if hop_size is None:
47
+ assert hparams.frame_shift_ms is not None
48
+ hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
49
+ return hop_size
50
+
51
+ def linearspectrogram(wav, hparams):
52
+ D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
53
+ S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db
54
+
55
+ if hparams.signal_normalization:
56
+ return _normalize(S, hparams)
57
+ return S
58
+
59
+ def melspectrogram(wav, hparams):
60
+ D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
61
+ S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db
62
+
63
+ if hparams.signal_normalization:
64
+ return _normalize(S, hparams)
65
+ return S
66
+
67
+ def inv_linear_spectrogram(linear_spectrogram, hparams):
68
+ """Converts linear spectrogram to waveform using librosa"""
69
+ if hparams.signal_normalization:
70
+ D = _denormalize(linear_spectrogram, hparams)
71
+ else:
72
+ D = linear_spectrogram
73
+
74
+ S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear
75
+
76
+ if hparams.use_lws:
77
+ processor = _lws_processor(hparams)
78
+ D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
79
+ y = processor.istft(D).astype(np.float32)
80
+ return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
81
+ else:
82
+ return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
83
+
84
+ def inv_mel_spectrogram(mel_spectrogram, hparams):
85
+ """Converts mel spectrogram to waveform using librosa"""
86
+ if hparams.signal_normalization:
87
+ D = _denormalize(mel_spectrogram, hparams)
88
+ else:
89
+ D = mel_spectrogram
90
+
91
+ S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams) # Convert back to linear
92
+
93
+ if hparams.use_lws:
94
+ processor = _lws_processor(hparams)
95
+ D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
96
+ y = processor.istft(D).astype(np.float32)
97
+ return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
98
+ else:
99
+ return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
100
+
101
+ def _lws_processor(hparams):
102
+ import lws
103
+ return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech")
104
+
105
+ def _griffin_lim(S, hparams):
106
+ """librosa implementation of Griffin-Lim
107
+ Based on https://github.com/librosa/librosa/issues/434
108
+ """
109
+ angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
110
+ S_complex = np.abs(S).astype(np.complex128)
111
+ y = _istft(S_complex * angles, hparams)
112
+ for i in range(hparams.griffin_lim_iters):
113
+ angles = np.exp(1j * np.angle(_stft(y, hparams)))
114
+ y = _istft(S_complex * angles, hparams)
115
+ return y
116
+
117
+ def _stft(y, hparams):
118
+ if hparams.use_lws:
119
+ return _lws_processor(hparams).stft(y).T
120
+ else:
121
+ return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
122
+
123
+ def _istft(y, hparams):
124
+ return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
125
+
126
+ ##########################################################
127
+ #Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
128
+ def num_frames(length, fsize, fshift):
129
+ """Compute number of time frames of spectrogram
130
+ """
131
+ pad = (fsize - fshift)
132
+ if length % fshift == 0:
133
+ M = (length + pad * 2 - fsize) // fshift + 1
134
+ else:
135
+ M = (length + pad * 2 - fsize) // fshift + 2
136
+ return M
137
+
138
+
139
+ def pad_lr(x, fsize, fshift):
140
+ """Compute left and right padding
141
+ """
142
+ M = num_frames(len(x), fsize, fshift)
143
+ pad = (fsize - fshift)
144
+ T = len(x) + 2 * pad
145
+ r = (M - 1) * fshift + fsize - T
146
+ return pad, pad + r
147
+ ##########################################################
148
+ #Librosa correct padding
149
+ def librosa_pad_lr(x, fsize, fshift):
150
+ return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0]
151
+
152
+ # Conversions
153
+ _mel_basis = None
154
+ _inv_mel_basis = None
155
+
156
+ def _linear_to_mel(spectogram, hparams):
157
+ global _mel_basis
158
+ if _mel_basis is None:
159
+ _mel_basis = _build_mel_basis(hparams)
160
+ return np.dot(_mel_basis, spectogram)
161
+
162
+ def _mel_to_linear(mel_spectrogram, hparams):
163
+ global _inv_mel_basis
164
+ if _inv_mel_basis is None:
165
+ _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
166
+ return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))
167
+
168
+ def _build_mel_basis(hparams):
169
+ assert hparams.fmax <= hparams.sample_rate // 2
170
+ return librosa.filters.mel(
171
+ sr=hparams.sample_rate,
172
+ n_fft=hparams.n_fft,
173
+ n_mels=hparams.num_mels,
174
+ fmin=hparams.fmin,
175
+ fmax=hparams.fmax
176
+ )
177
+
178
+ def _amp_to_db(x, hparams):
179
+ min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
180
+ return 20 * np.log10(np.maximum(min_level, x))
181
+
182
+ def _db_to_amp(x):
183
+ return np.power(10.0, (x) * 0.05)
184
+
185
+ def _normalize(S, hparams):
186
+ if hparams.allow_clipping_in_normalization:
187
+ if hparams.symmetric_mels:
188
+ return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,
189
+ -hparams.max_abs_value, hparams.max_abs_value)
190
+ else:
191
+ return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value)
192
+
193
+ assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
194
+ if hparams.symmetric_mels:
195
+ return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value
196
+ else:
197
+ return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db))
198
+
199
+ def _denormalize(D, hparams):
200
+ if hparams.allow_clipping_in_normalization:
201
+ if hparams.symmetric_mels:
202
+ return (((np.clip(D, -hparams.max_abs_value,
203
+ hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
204
+ + hparams.min_level_db)
205
+ else:
206
+ return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
207
+
208
+ if hparams.symmetric_mels:
209
+ return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db)
210
+ else:
211
+ return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
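
Round-trip sketch with the helpers above and the project hparams (input clip and output filename are assumptions): waveform to mel and back through Griffin-Lim, since use_lws is False in this configuration:

from synthesizer import audio
from synthesizer.hparams import hparams

wav = audio.load_wav("enrolled_voices/voice_26bfa1ef.mp3", sr=hparams.sample_rate)
mel = audio.melspectrogram(wav, hparams)            # (num_mels, frames), values in [-4, 4]
approx = audio.inv_mel_spectrogram(mel, hparams)    # Griffin-Lim reconstruction
audio.save_wav(approx, "griffin_lim_demo.wav", sr=hparams.sample_rate)
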
backend/synthesizer/hparams.py ADDED
@@ -0,0 +1,92 @@
1
+ import ast
2
+ import pprint
3
+
4
+ class HParams(object):
5
+ def __init__(self, **kwargs): self.__dict__.update(kwargs)
6
+ def __setitem__(self, key, value): setattr(self, key, value)
7
+ def __getitem__(self, key): return getattr(self, key)
8
+ def __repr__(self): return pprint.pformat(self.__dict__)
9
+
10
+ def parse(self, string):
11
+ # Overrides hparams from a comma-separated string of name=value pairs
12
+ if len(string) > 0:
13
+ overrides = [s.split("=") for s in string.split(",")]
14
+ keys, values = zip(*overrides)
15
+ keys = list(map(str.strip, keys))
16
+ values = list(map(str.strip, values))
17
+ for k in keys:
18
+ self.__dict__[k] = ast.literal_eval(values[keys.index(k)])
19
+ return self
20
+
21
+ hparams = HParams(
22
+ ### Signal Processing (used in both synthesizer and vocoder)
23
+ sample_rate = 16000,
24
+ n_fft = 800,
25
+ num_mels = 80,
26
+ hop_size = 200, # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)
27
+ win_size = 800, # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)
28
+ fmin = 55,
29
+ min_level_db = -100,
30
+ ref_level_db = 20,
31
+ max_abs_value = 4., # Gradient explodes if too big, premature convergence if too small.
32
+ preemphasis = 0.97, # Filter coefficient to use if preemphasize is True
33
+ preemphasize = True,
34
+
35
+ ### Tacotron Text-to-Speech (TTS)
36
+ tts_embed_dims = 512, # Embedding dimension for the graphemes/phoneme inputs
37
+ tts_encoder_dims = 256,
38
+ tts_decoder_dims = 128,
39
+ tts_postnet_dims = 512,
40
+ tts_encoder_K = 5,
41
+ tts_lstm_dims = 1024,
42
+ tts_postnet_K = 5,
43
+ tts_num_highways = 4,
44
+ tts_dropout = 0.5,
45
+ tts_cleaner_names = ["english_cleaners"],
46
+ tts_stop_threshold = -3.4, # Value below which audio generation ends.
47
+ # For example, for a range of [-4, 4], this
48
+ # will terminate the sequence at the first
49
+ # frame that has all values < -3.4
50
+
51
+ ### Tacotron Training
52
+ tts_schedule = [(2, 1e-3, 20_000, 12), # Progressive training schedule
53
+ (2, 5e-4, 40_000, 12), # (r, lr, step, batch_size)
54
+ (2, 2e-4, 80_000, 12), #
55
+ (2, 1e-4, 160_000, 12), # r = reduction factor (# of mel frames
56
+ (2, 3e-5, 320_000, 12), # synthesized for each decoder iteration)
57
+ (2, 1e-5, 640_000, 12)], # lr = learning rate
58
+
59
+ tts_clip_grad_norm = 1.0, # clips the gradient norm to prevent explosion - set to None if not needed
60
+ tts_eval_interval = 500, # Number of steps between model evaluation (sample generation)
61
+ # Set to -1 to generate after completing epoch, or 0 to disable
62
+
63
+ tts_eval_num_samples = 1, # Makes this number of samples
64
+
65
+ ### Data Preprocessing
66
+ max_mel_frames = 900,
67
+ rescale = True,
68
+ rescaling_max = 0.9,
69
+ synthesis_batch_size = 16, # For vocoder preprocessing and inference.
70
+
71
+ ### Mel Visualization and Griffin-Lim
72
+ signal_normalization = True,
73
+ power = 1.5,
74
+ griffin_lim_iters = 60,
75
+
76
+ ### Audio processing options
77
+ fmax = 7600, # Should not exceed (sample_rate // 2)
78
+ allow_clipping_in_normalization = True, # Used when signal_normalization = True
79
+ clip_mels_length = True, # If true, discards samples exceeding max_mel_frames
80
+ use_lws = False, # "Fast spectrogram phase recovery using local weighted sums"
81
+ symmetric_mels = True, # Sets mel range to [-max_abs_value, max_abs_value] if True,
82
+ # and [0, max_abs_value] if False
83
+ trim_silence = True, # Use with sample_rate of 16000 for best results
84
+
85
+ ### SV2TTS
86
+ speaker_embedding_size = 256, # Dimension for the speaker embedding
87
+ silence_min_duration_split = 0.4, # Duration in seconds of a silence for an utterance to be split
88
+ utterance_min_duration = 1.6, # Duration in seconds below which utterances are discarded
89
+ )
90
+
91
+ def hparams_debug_string():
92
+ return str(hparams)
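A small usage sketch for the `HParams` container above, assuming the `backend/` directory is on the Python path so the `synthesizer` package imports resolve:

```python
from synthesizer.hparams import hparams, hparams_debug_string

# parse() takes a comma-separated list of name=value overrides,
# each value being evaluated with ast.literal_eval.
hparams.parse("sample_rate=22050,rescale=False")
assert hparams.sample_rate == 22050 and hparams.rescale is False

# Dict-style access goes through __getitem__/__setitem__.
hparams["griffin_lim_iters"] = 30
print(hparams_debug_string())
```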
backend/synthesizer/inference.py ADDED
@@ -0,0 +1,165 @@
1
+ import torch
2
+ from synthesizer import audio
3
+ from synthesizer.hparams import hparams
4
+ from synthesizer.models.tacotron import Tacotron
5
+ from synthesizer.utils.symbols import symbols
6
+ from synthesizer.utils.text import text_to_sequence
7
+ from app.vocoder.display import simple_table
8
+ from pathlib import Path
9
+ from typing import Union, List
10
+ import numpy as np
11
+ import librosa
12
+
13
+
14
+ class Synthesizer:
15
+ sample_rate = hparams.sample_rate
16
+ hparams = hparams
17
+
18
+ def __init__(self, model_fpath: Path, verbose=True):
19
+ """
20
+ The model isn't instantiated and loaded in memory until needed or until load() is called.
21
+
22
+ :param model_fpath: path to the trained model file
23
+ :param verbose: if False, prints less information when using the model
24
+ """
25
+ self.model_fpath = model_fpath
26
+ self.verbose = verbose
27
+
28
+ # Check for GPU
29
+ if torch.cuda.is_available():
30
+ self.device = torch.device("cuda")
31
+ else:
32
+ self.device = torch.device("cpu")
33
+ if self.verbose:
34
+ print("Synthesizer using device:", self.device)
35
+
36
+ # Tacotron model will be instantiated later on first use.
37
+ self._model = None
38
+
39
+ def is_loaded(self):
40
+ """
41
+ Whether the model is loaded in memory.
42
+ """
43
+ return self._model is not None
44
+
45
+ def load(self):
46
+ """
47
+ Instantiates and loads the model given the weights file that was passed in the constructor.
48
+ """
49
+ self._model = Tacotron(embed_dims=hparams.tts_embed_dims,
50
+ num_chars=len(symbols),
51
+ encoder_dims=hparams.tts_encoder_dims,
52
+ decoder_dims=hparams.tts_decoder_dims,
53
+ n_mels=hparams.num_mels,
54
+ fft_bins=hparams.num_mels,
55
+ postnet_dims=hparams.tts_postnet_dims,
56
+ encoder_K=hparams.tts_encoder_K,
57
+ lstm_dims=hparams.tts_lstm_dims,
58
+ postnet_K=hparams.tts_postnet_K,
59
+ num_highways=hparams.tts_num_highways,
60
+ dropout=hparams.tts_dropout,
61
+ stop_threshold=hparams.tts_stop_threshold,
62
+ speaker_embedding_size=hparams.speaker_embedding_size).to(self.device)
63
+
64
+ self._model.load(self.model_fpath)
65
+ self._model.eval()
66
+
67
+ if self.verbose:
68
+ print("Loaded synthesizer \"%s\" trained to step %d" % (self.model_fpath.name, self._model.state_dict()["step"]))
69
+
70
+ def synthesize_spectrograms(self, texts: List[str],
71
+ embeddings: Union[np.ndarray, List[np.ndarray]],
72
+ return_alignments=False):
73
+ """
74
+ Synthesizes mel spectrograms from texts and speaker embeddings.
75
+
76
+ :param texts: a list of N text prompts to be synthesized
77
+ :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
78
+ :param return_alignments: if True, a matrix representing the alignments between the
79
+ characters
80
+ and each decoder output step will be returned for each spectrogram
81
+ :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
82
+ sequence length of spectrogram i, and possibly the alignments.
83
+ """
84
+ # Load the model on the first request.
85
+ if not self.is_loaded():
86
+ self.load()
87
+
88
+ # Preprocess text inputs
89
+ inputs = [text_to_sequence(text.strip(), hparams.tts_cleaner_names) for text in texts]
90
+ if not isinstance(embeddings, list):
91
+ embeddings = [embeddings]
92
+
93
+ # Batch inputs
94
+ batched_inputs = [inputs[i:i+hparams.synthesis_batch_size]
95
+ for i in range(0, len(inputs), hparams.synthesis_batch_size)]
96
+ batched_embeds = [embeddings[i:i+hparams.synthesis_batch_size]
97
+ for i in range(0, len(embeddings), hparams.synthesis_batch_size)]
98
+
99
+ specs = []
100
+ for i, batch in enumerate(batched_inputs, 1):
101
+ if self.verbose:
102
+ print(f"\n| Generating {i}/{len(batched_inputs)}")
103
+
104
+ # Pad texts so they are all the same length
105
+ text_lens = [len(text) for text in batch]
106
+ max_text_len = max(text_lens)
107
+ chars = [pad1d(text, max_text_len) for text in batch]
108
+ chars = np.stack(chars)
109
+
110
+ # Stack speaker embeddings into 2D array for batch processing
111
+ speaker_embeds = np.stack(batched_embeds[i-1])
112
+
113
+ # Convert to tensor
114
+ chars = torch.tensor(chars).long().to(self.device)
115
+ speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)
116
+
117
+ # Inference
118
+ _, mels, alignments = self._model.generate(chars, speaker_embeddings)
119
+ mels = mels.detach().cpu().numpy()
120
+ for m in mels:
121
+ # Trim silence from end of each spectrogram
122
+ while np.max(m[:, -1]) < hparams.tts_stop_threshold:
123
+ m = m[:, :-1]
124
+ specs.append(m)
125
+
126
+ if self.verbose:
127
+ print("\n\nDone.\n")
128
+ return (specs, alignments) if return_alignments else specs
129
+
130
+ @staticmethod
131
+ def load_preprocess_wav(fpath):
132
+ """
133
+ Loads and preprocesses an audio file under the same conditions the audio files were used to
134
+ train the synthesizer.
135
+ """
136
+ wav = librosa.load(str(fpath), sr=hparams.sample_rate)[0]
137
+ if hparams.rescale:
138
+ wav = wav / np.abs(wav).max() * hparams.rescaling_max
139
+ return wav
140
+
141
+ @staticmethod
142
+ def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
143
+ """
144
+ Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
145
+ were fed to the synthesizer when training.
146
+ """
147
+ if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
148
+ wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
149
+ else:
150
+ wav = fpath_or_wav
151
+
152
+ mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
153
+ return mel_spectrogram
154
+
155
+ @staticmethod
156
+ def griffin_lim(mel):
157
+ """
158
+ Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
159
+ with the same parameters present in hparams.py.
160
+ """
161
+ return audio.inv_mel_spectrogram(mel, hparams)
162
+
163
+
164
+ def pad1d(x, max_len, pad_value=0):
165
+ return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)
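A hedged end-to-end sketch of the `Synthesizer` API above. The checkpoint path is an assumption (it depends on where the model weights are downloaded to), the random vector only stands in for a real speaker embedding produced by the encoder, and the script assumes it is run from `backend/` with the pretrained synthesizer weights available:

```python
import numpy as np
from pathlib import Path
from synthesizer.inference import Synthesizer

synth = Synthesizer(Path("models/default/synthesizer.pt"))  # hypothetical path

texts = ["Hello world.", "This is a cloned voice."]
embed = np.random.rand(256).astype(np.float32)  # stand-in for an encoder embedding
embed /= np.linalg.norm(embed)

specs = synth.synthesize_spectrograms(texts, [embed, embed])
print([s.shape for s in specs])  # each spectrogram is (80, Mi)

# Rough audio preview without the neural vocoder:
wav = Synthesizer.griffin_lim(specs[0])
```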
backend/synthesizer/models/tacotron.py ADDED
@@ -0,0 +1,542 @@
1
+ import os
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from pathlib import Path
7
+ from typing import Union
8
+
9
+
10
+ class HighwayNetwork(nn.Module):
11
+ def __init__(self, size):
12
+ super().__init__()
13
+ self.W1 = nn.Linear(size, size)
14
+ self.W2 = nn.Linear(size, size)
15
+ self.W1.bias.data.fill_(0.)
16
+
17
+ def forward(self, x):
18
+ x1 = self.W1(x)
19
+ x2 = self.W2(x)
20
+ g = torch.sigmoid(x2)
21
+ y = g * F.relu(x1) + (1. - g) * x
22
+ return y
23
+
24
+
25
+ class Encoder(nn.Module):
26
+ def __init__(self, embed_dims, num_chars, encoder_dims, K, num_highways, dropout):
27
+ super().__init__()
28
+ prenet_dims = (encoder_dims, encoder_dims)
29
+ cbhg_channels = encoder_dims
30
+ self.embedding = nn.Embedding(num_chars, embed_dims)
31
+ self.pre_net = PreNet(embed_dims, fc1_dims=prenet_dims[0], fc2_dims=prenet_dims[1],
32
+ dropout=dropout)
33
+ self.cbhg = CBHG(K=K, in_channels=cbhg_channels, channels=cbhg_channels,
34
+ proj_channels=[cbhg_channels, cbhg_channels],
35
+ num_highways=num_highways)
36
+
37
+ def forward(self, x, speaker_embedding=None):
38
+ x = self.embedding(x)
39
+ x = self.pre_net(x)
40
+ x.transpose_(1, 2)
41
+ x = self.cbhg(x)
42
+ if speaker_embedding is not None:
43
+ x = self.add_speaker_embedding(x, speaker_embedding)
44
+ return x
45
+
46
+ def add_speaker_embedding(self, x, speaker_embedding):
47
+ # SV2TTS
48
+ # The input x is the encoder output and is a 3D tensor with size (batch_size, num_chars, tts_embed_dims)
49
+ # When training, speaker_embedding is also a 2D tensor with size (batch_size, speaker_embedding_size)
50
+ # (for inference, speaker_embedding is a 1D tensor with size (speaker_embedding_size))
51
+ # This concats the speaker embedding for each char in the encoder output
52
+
53
+ # Save the dimensions as human-readable names
54
+ batch_size = x.size()[0]
55
+ num_chars = x.size()[1]
56
+
57
+ if speaker_embedding.dim() == 1:
58
+ idx = 0
59
+ else:
60
+ idx = 1
61
+
62
+ # Start by making a copy of each speaker embedding to match the input text length
63
+ # The output of this has size (batch_size, num_chars * tts_embed_dims)
64
+ speaker_embedding_size = speaker_embedding.size()[idx]
65
+ e = speaker_embedding.repeat_interleave(num_chars, dim=idx)
66
+
67
+ # Reshape it and transpose
68
+ e = e.reshape(batch_size, speaker_embedding_size, num_chars)
69
+ e = e.transpose(1, 2)
70
+
71
+ # Concatenate the tiled speaker embedding with the encoder output
72
+ x = torch.cat((x, e), 2)
73
+ return x
74
+
75
+
76
+ class BatchNormConv(nn.Module):
77
+ def __init__(self, in_channels, out_channels, kernel, relu=True):
78
+ super().__init__()
79
+ self.conv = nn.Conv1d(in_channels, out_channels, kernel, stride=1, padding=kernel // 2, bias=False)
80
+ self.bnorm = nn.BatchNorm1d(out_channels)
81
+ self.relu = relu
82
+
83
+ def forward(self, x):
84
+ x = self.conv(x)
85
+ x = F.relu(x) if self.relu is True else x
86
+ return self.bnorm(x)
87
+
88
+
89
+ class CBHG(nn.Module):
90
+ def __init__(self, K, in_channels, channels, proj_channels, num_highways):
91
+ super().__init__()
92
+
93
+ # List of all rnns to call `flatten_parameters()` on
94
+ self._to_flatten = []
95
+
96
+ self.bank_kernels = [i for i in range(1, K + 1)]
97
+ self.conv1d_bank = nn.ModuleList()
98
+ for k in self.bank_kernels:
99
+ conv = BatchNormConv(in_channels, channels, k)
100
+ self.conv1d_bank.append(conv)
101
+
102
+ self.maxpool = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)
103
+
104
+ self.conv_project1 = BatchNormConv(len(self.bank_kernels) * channels, proj_channels[0], 3)
105
+ self.conv_project2 = BatchNormConv(proj_channels[0], proj_channels[1], 3, relu=False)
106
+
107
+ # Fix the highway input if necessary
108
+ if proj_channels[-1] != channels:
109
+ self.highway_mismatch = True
110
+ self.pre_highway = nn.Linear(proj_channels[-1], channels, bias=False)
111
+ else:
112
+ self.highway_mismatch = False
113
+
114
+ self.highways = nn.ModuleList()
115
+ for i in range(num_highways):
116
+ hn = HighwayNetwork(channels)
117
+ self.highways.append(hn)
118
+
119
+ self.rnn = nn.GRU(channels, channels // 2, batch_first=True, bidirectional=True)
120
+ self._to_flatten.append(self.rnn)
121
+
122
+ # Avoid fragmentation of RNN parameters and associated warning
123
+ self._flatten_parameters()
124
+
125
+ def forward(self, x):
126
+ # Although we `_flatten_parameters()` on init, when using DataParallel
127
+ # the model gets replicated, making it no longer guaranteed that the
128
+ # weights are contiguous in GPU memory. Hence, we must call it again
129
+ self._flatten_parameters()
130
+
131
+ # Save these for later
132
+ residual = x
133
+ seq_len = x.size(-1)
134
+ conv_bank = []
135
+
136
+ # Convolution Bank
137
+ for conv in self.conv1d_bank:
138
+ c = conv(x) # Convolution
139
+ conv_bank.append(c[:, :, :seq_len])
140
+
141
+ # Stack along the channel axis
142
+ conv_bank = torch.cat(conv_bank, dim=1)
143
+
144
+ # dump the last padding to fit residual
145
+ x = self.maxpool(conv_bank)[:, :, :seq_len]
146
+
147
+ # Conv1d projections
148
+ x = self.conv_project1(x)
149
+ x = self.conv_project2(x)
150
+
151
+ # Residual Connect
152
+ x = x + residual
153
+
154
+ # Through the highways
155
+ x = x.transpose(1, 2)
156
+ if self.highway_mismatch is True:
157
+ x = self.pre_highway(x)
158
+ for h in self.highways: x = h(x)
159
+
160
+ # And then the RNN
161
+ x, _ = self.rnn(x)
162
+ return x
163
+
164
+ def _flatten_parameters(self):
165
+ """Calls `flatten_parameters` on all the rnns used by the WaveRNN. Used
166
+ to improve efficiency and avoid PyTorch yelling at us."""
167
+ [m.flatten_parameters() for m in self._to_flatten]
168
+
169
+ class PreNet(nn.Module):
170
+ def __init__(self, in_dims, fc1_dims=256, fc2_dims=128, dropout=0.5):
171
+ super().__init__()
172
+ self.fc1 = nn.Linear(in_dims, fc1_dims)
173
+ self.fc2 = nn.Linear(fc1_dims, fc2_dims)
174
+ self.p = dropout
175
+
176
+ def forward(self, x):
177
+ x = self.fc1(x)
178
+ x = F.relu(x)
179
+ x = F.dropout(x, self.p, training=True)
180
+ x = self.fc2(x)
181
+ x = F.relu(x)
182
+ x = F.dropout(x, self.p, training=True)
183
+ return x
184
+
185
+
186
+ class Attention(nn.Module):
187
+ def __init__(self, attn_dims):
188
+ super().__init__()
189
+ self.W = nn.Linear(attn_dims, attn_dims, bias=False)
190
+ self.v = nn.Linear(attn_dims, 1, bias=False)
191
+
192
+ def forward(self, encoder_seq_proj, query, t):
193
+
194
+ # print(encoder_seq_proj.shape)
195
+ # Transform the query vector
196
+ query_proj = self.W(query).unsqueeze(1)
197
+
198
+ # Compute the scores
199
+ u = self.v(torch.tanh(encoder_seq_proj + query_proj))
200
+ scores = F.softmax(u, dim=1)
201
+
202
+ return scores.transpose(1, 2)
203
+
204
+
205
+ class LSA(nn.Module):
206
+ def __init__(self, attn_dim, kernel_size=31, filters=32):
207
+ super().__init__()
208
+ self.conv = nn.Conv1d(1, filters, padding=(kernel_size - 1) // 2, kernel_size=kernel_size, bias=True)
209
+ self.L = nn.Linear(filters, attn_dim, bias=False)
210
+ self.W = nn.Linear(attn_dim, attn_dim, bias=True) # Include the attention bias in this term
211
+ self.v = nn.Linear(attn_dim, 1, bias=False)
212
+ self.cumulative = None
213
+ self.attention = None
214
+
215
+ def init_attention(self, encoder_seq_proj):
216
+ device = next(self.parameters()).device # use same device as parameters
217
+ b, t, c = encoder_seq_proj.size()
218
+ self.cumulative = torch.zeros(b, t, device=device)
219
+ self.attention = torch.zeros(b, t, device=device)
220
+
221
+ def forward(self, encoder_seq_proj, query, t, chars):
222
+
223
+ if t == 0: self.init_attention(encoder_seq_proj)
224
+
225
+ processed_query = self.W(query).unsqueeze(1)
226
+
227
+ location = self.cumulative.unsqueeze(1)
228
+ processed_loc = self.L(self.conv(location).transpose(1, 2))
229
+
230
+ u = self.v(torch.tanh(processed_query + encoder_seq_proj + processed_loc))
231
+ u = u.squeeze(-1)
232
+
233
+ # Mask zero padding chars
234
+ u = u * (chars != 0).float()
235
+
236
+ # Smooth Attention
237
+ # scores = torch.sigmoid(u) / torch.sigmoid(u).sum(dim=1, keepdim=True)
238
+ scores = F.softmax(u, dim=1)
239
+ self.attention = scores
240
+ self.cumulative = self.cumulative + self.attention
241
+
242
+ return scores.unsqueeze(-1).transpose(1, 2)
243
+
244
+
245
+ class Decoder(nn.Module):
246
+ # Class variable because its value doesn't change between instances,
247
+ # yet it ought to be scoped to the class because it's a property of a Decoder
248
+ max_r = 20
249
+ def __init__(self, n_mels, encoder_dims, decoder_dims, lstm_dims,
250
+ dropout, speaker_embedding_size):
251
+ super().__init__()
252
+ self.register_buffer("r", torch.tensor(1, dtype=torch.int))
253
+ self.n_mels = n_mels
254
+ prenet_dims = (decoder_dims * 2, decoder_dims * 2)
255
+ self.prenet = PreNet(n_mels, fc1_dims=prenet_dims[0], fc2_dims=prenet_dims[1],
256
+ dropout=dropout)
257
+ self.attn_net = LSA(decoder_dims)
258
+ self.attn_rnn = nn.GRUCell(encoder_dims + prenet_dims[1] + speaker_embedding_size, decoder_dims)
259
+ self.rnn_input = nn.Linear(encoder_dims + decoder_dims + speaker_embedding_size, lstm_dims)
260
+ self.res_rnn1 = nn.LSTMCell(lstm_dims, lstm_dims)
261
+ self.res_rnn2 = nn.LSTMCell(lstm_dims, lstm_dims)
262
+ self.mel_proj = nn.Linear(lstm_dims, n_mels * self.max_r, bias=False)
263
+ self.stop_proj = nn.Linear(encoder_dims + speaker_embedding_size + lstm_dims, 1)
264
+
265
+ def zoneout(self, prev, current, p=0.1):
266
+ device = next(self.parameters()).device # Use same device as parameters
267
+ mask = torch.zeros(prev.size(), device=device).bernoulli_(p)
268
+ return prev * mask + current * (1 - mask)
269
+
270
+ def forward(self, encoder_seq, encoder_seq_proj, prenet_in,
271
+ hidden_states, cell_states, context_vec, t, chars):
272
+
273
+ # Need this for reshaping mels
274
+ batch_size = encoder_seq.size(0)
275
+
276
+ # Unpack the hidden and cell states
277
+ attn_hidden, rnn1_hidden, rnn2_hidden = hidden_states
278
+ rnn1_cell, rnn2_cell = cell_states
279
+
280
+ # PreNet for the Attention RNN
281
+ prenet_out = self.prenet(prenet_in)
282
+
283
+ # Compute the Attention RNN hidden state
284
+ attn_rnn_in = torch.cat([context_vec, prenet_out], dim=-1)
285
+ attn_hidden = self.attn_rnn(attn_rnn_in.squeeze(1), attn_hidden)
286
+
287
+ # Compute the attention scores
288
+ scores = self.attn_net(encoder_seq_proj, attn_hidden, t, chars)
289
+
290
+ # Dot product to create the context vector
291
+ context_vec = scores @ encoder_seq
292
+ context_vec = context_vec.squeeze(1)
293
+
294
+ # Concat Attention RNN output w. Context Vector & project
295
+ x = torch.cat([context_vec, attn_hidden], dim=1)
296
+ x = self.rnn_input(x)
297
+
298
+ # Compute first Residual RNN
299
+ rnn1_hidden_next, rnn1_cell = self.res_rnn1(x, (rnn1_hidden, rnn1_cell))
300
+ if self.training:
301
+ rnn1_hidden = self.zoneout(rnn1_hidden, rnn1_hidden_next)
302
+ else:
303
+ rnn1_hidden = rnn1_hidden_next
304
+ x = x + rnn1_hidden
305
+
306
+ # Compute second Residual RNN
307
+ rnn2_hidden_next, rnn2_cell = self.res_rnn2(x, (rnn2_hidden, rnn2_cell))
308
+ if self.training:
309
+ rnn2_hidden = self.zoneout(rnn2_hidden, rnn2_hidden_next)
310
+ else:
311
+ rnn2_hidden = rnn2_hidden_next
312
+ x = x + rnn2_hidden
313
+
314
+ # Project Mels
315
+ mels = self.mel_proj(x)
316
+ mels = mels.view(batch_size, self.n_mels, self.max_r)[:, :, :self.r]
317
+ hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
318
+ cell_states = (rnn1_cell, rnn2_cell)
319
+
320
+ # Stop token prediction
321
+ s = torch.cat((x, context_vec), dim=1)
322
+ s = self.stop_proj(s)
323
+ stop_tokens = torch.sigmoid(s)
324
+
325
+ return mels, scores, hidden_states, cell_states, context_vec, stop_tokens
326
+
327
+
328
+ class Tacotron(nn.Module):
329
+ def __init__(self, embed_dims, num_chars, encoder_dims, decoder_dims, n_mels,
330
+ fft_bins, postnet_dims, encoder_K, lstm_dims, postnet_K, num_highways,
331
+ dropout, stop_threshold, speaker_embedding_size):
332
+ super().__init__()
333
+ self.n_mels = n_mels
334
+ self.lstm_dims = lstm_dims
335
+ self.encoder_dims = encoder_dims
336
+ self.decoder_dims = decoder_dims
337
+ self.speaker_embedding_size = speaker_embedding_size
338
+ self.encoder = Encoder(embed_dims, num_chars, encoder_dims,
339
+ encoder_K, num_highways, dropout)
340
+ self.encoder_proj = nn.Linear(encoder_dims + speaker_embedding_size, decoder_dims, bias=False)
341
+ self.decoder = Decoder(n_mels, encoder_dims, decoder_dims, lstm_dims,
342
+ dropout, speaker_embedding_size)
343
+ self.postnet = CBHG(postnet_K, n_mels, postnet_dims,
344
+ [postnet_dims, fft_bins], num_highways)
345
+ self.post_proj = nn.Linear(postnet_dims, fft_bins, bias=False)
346
+
347
+ self.init_model()
348
+ self.num_params()
349
+
350
+ self.register_buffer("step", torch.zeros(1, dtype=torch.long))
351
+ self.register_buffer("stop_threshold", torch.tensor(stop_threshold, dtype=torch.float32))
352
+
353
+ @property
354
+ def r(self):
355
+ return self.decoder.r.item()
356
+
357
+ @r.setter
358
+ def r(self, value):
359
+ self.decoder.r = self.decoder.r.new_tensor(value, requires_grad=False)
360
+
361
+ def forward(self, x, m, speaker_embedding):
362
+ device = next(self.parameters()).device # use same device as parameters
363
+
364
+ self.step += 1
365
+ batch_size, _, steps = m.size()
366
+
367
+ # Initialise all hidden states and pack into tuple
368
+ attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
369
+ rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
370
+ rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
371
+ hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
372
+
373
+ # Initialise all lstm cell states and pack into tuple
374
+ rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
375
+ rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
376
+ cell_states = (rnn1_cell, rnn2_cell)
377
+
378
+ # <GO> Frame for start of decoder loop
379
+ go_frame = torch.zeros(batch_size, self.n_mels, device=device)
380
+
381
+ # Need an initial context vector
382
+ context_vec = torch.zeros(batch_size, self.encoder_dims + self.speaker_embedding_size, device=device)
383
+
384
+ # SV2TTS: Run the encoder with the speaker embedding
385
+ # The projection avoids unnecessary matmuls in the decoder loop
386
+ encoder_seq = self.encoder(x, speaker_embedding)
387
+ encoder_seq_proj = self.encoder_proj(encoder_seq)
388
+
389
+ # Need a couple of lists for outputs
390
+ mel_outputs, attn_scores, stop_outputs = [], [], []
391
+
392
+ # Run the decoder loop
393
+ for t in range(0, steps, self.r):
394
+ prenet_in = m[:, :, t - 1] if t > 0 else go_frame
395
+ mel_frames, scores, hidden_states, cell_states, context_vec, stop_tokens = \
396
+ self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
397
+ hidden_states, cell_states, context_vec, t, x)
398
+ mel_outputs.append(mel_frames)
399
+ attn_scores.append(scores)
400
+ stop_outputs.extend([stop_tokens] * self.r)
401
+
402
+ # Concat the mel outputs into sequence
403
+ mel_outputs = torch.cat(mel_outputs, dim=2)
404
+
405
+ # Post-Process for Linear Spectrograms
406
+ postnet_out = self.postnet(mel_outputs)
407
+ linear = self.post_proj(postnet_out)
408
+ linear = linear.transpose(1, 2)
409
+
410
+ # For easy visualisation
411
+ attn_scores = torch.cat(attn_scores, 1)
412
+ # attn_scores = attn_scores.cpu().data.numpy()
413
+ stop_outputs = torch.cat(stop_outputs, 1)
414
+
415
+ return mel_outputs, linear, attn_scores, stop_outputs
416
+
417
+ def generate(self, x, speaker_embedding=None, steps=2000):
418
+ import sys
419
+
420
+ self.eval()
421
+ device = next(self.parameters()).device # use same device as parameters
422
+
423
+ batch_size, _ = x.size()
424
+
425
+ # Need to initialise all hidden states and pack into tuple for tidiness
426
+ attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
427
+ rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
428
+ rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
429
+ hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
430
+
431
+ # Need to initialise all lstm cell states and pack into tuple for tidiness
432
+ rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
433
+ rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
434
+ cell_states = (rnn1_cell, rnn2_cell)
435
+
436
+ # Need a <GO> Frame for start of decoder loop
437
+ go_frame = torch.zeros(batch_size, self.n_mels, device=device)
438
+
439
+ # Need an initial context vector
440
+ context_vec = torch.zeros(batch_size, self.encoder_dims + self.speaker_embedding_size, device=device)
441
+
442
+ # SV2TTS: Run the encoder with the speaker embedding
443
+ # The projection avoids unnecessary matmuls in the decoder loop
444
+ print(" [Tacotron] Running encoder...", end='', flush=True)
445
+ sys.stdout.flush()
446
+ encoder_seq = self.encoder(x, speaker_embedding)
447
+ encoder_seq_proj = self.encoder_proj(encoder_seq)
448
+ print(" OK")
449
+ sys.stdout.flush()
450
+
451
+ # Need a couple of lists for outputs
452
+ mel_outputs, attn_scores, stop_outputs = [], [], []
453
+
454
+ # Run the decoder loop
455
+ print(f" [Tacotron] Decoder loop: 0/{steps} steps", end='')
456
+ sys.stdout.flush()
457
+ for t in range(0, steps, self.r):
458
+ prenet_in = mel_outputs[-1][:, :, -1] if t > 0 else go_frame
459
+ mel_frames, scores, hidden_states, cell_states, context_vec, stop_tokens = \
460
+ self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
461
+ hidden_states, cell_states, context_vec, t, x)
462
+ mel_outputs.append(mel_frames)
463
+ attn_scores.append(scores)
464
+ stop_outputs.extend([stop_tokens] * self.r)
465
+
466
+ # Progress every 100 steps
467
+ if t % 100 == 0:
468
+ print(f"\r [Tacotron] Decoder loop: {t}/{steps} steps", end='')
469
+ sys.stdout.flush()
470
+
471
+ # Stop the loop when all stop tokens in batch exceed threshold
472
+ if (stop_tokens > 0.5).all() and t > 10:
473
+ print(f"\r [Tacotron] Decoder loop: {t}/{steps} steps (stopped early)")
474
+ sys.stdout.flush()
475
+ break
476
+
477
+ print(f"\r [Tacotron] Decoder loop: {len(mel_outputs) * self.r}/{steps} steps (complete)")
478
+ sys.stdout.flush()
479
+
480
+ # Concat the mel outputs into sequence
481
+ print(" [Tacotron] Concatenating and post-processing...", end='', flush=True)
482
+ sys.stdout.flush()
483
+ mel_outputs = torch.cat(mel_outputs, dim=2)
484
+
485
+ # Post-Process for Linear Spectrograms
486
+ postnet_out = self.postnet(mel_outputs)
487
+ linear = self.post_proj(postnet_out)
488
+
489
+ linear = linear.transpose(1, 2)
490
+
491
+ # For easy visualisation
492
+ attn_scores = torch.cat(attn_scores, 1)
493
+ stop_outputs = torch.cat(stop_outputs, 1)
494
+
495
+ print(" OK")
496
+ sys.stdout.flush()
497
+ self.train()
498
+
499
+ return mel_outputs, linear, attn_scores
500
+
501
+ def init_model(self):
502
+ for p in self.parameters():
503
+ if p.dim() > 1: nn.init.xavier_uniform_(p)
504
+
505
+ def get_step(self):
506
+ return self.step.data.item()
507
+
508
+ def reset_step(self):
509
+ # assignment to parameters or buffers is overloaded, updates internal dict entry
510
+ self.step = self.step.data.new_tensor(1)
511
+
512
+ def log(self, path, msg):
513
+ with open(path, "a") as f:
514
+ print(msg, file=f)
515
+
516
+ def load(self, path, optimizer=None):
517
+ # Use device of model params as location for loaded state
518
+ device = next(self.parameters()).device
519
+ checkpoint = torch.load(str(path), map_location=device)
520
+ self.load_state_dict(checkpoint["model_state"])
521
+
522
+ if "optimizer_state" in checkpoint and optimizer is not None:
523
+ optimizer.load_state_dict(checkpoint["optimizer_state"])
524
+
525
+ def save(self, path, optimizer=None):
526
+ if optimizer is not None:
527
+ torch.save({
528
+ "model_state": self.state_dict(),
529
+ "optimizer_state": optimizer.state_dict(),
530
+ }, str(path))
531
+ else:
532
+ torch.save({
533
+ "model_state": self.state_dict(),
534
+ }, str(path))
535
+
536
+
537
+ def num_params(self, print_out=True):
538
+ parameters = filter(lambda p: p.requires_grad, self.parameters())
539
+ parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
540
+ if print_out:
541
+ print("Trainable Parameters: %.3fM" % parameters)
542
+ return parameters
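A minimal smoke-test sketch for the model above: it builds a Tacotron from the hparams in this commit and runs `generate()` with random (untrained) weights, purely to illustrate the constructor signature and expected tensor shapes. No pretrained checkpoint is involved, and it assumes `backend/` is on the Python path:

```python
import torch
from synthesizer.hparams import hparams
from synthesizer.models.tacotron import Tacotron
from synthesizer.utils.symbols import symbols

model = Tacotron(embed_dims=hparams.tts_embed_dims, num_chars=len(symbols),
                 encoder_dims=hparams.tts_encoder_dims, decoder_dims=hparams.tts_decoder_dims,
                 n_mels=hparams.num_mels, fft_bins=hparams.num_mels,
                 postnet_dims=hparams.tts_postnet_dims, encoder_K=hparams.tts_encoder_K,
                 lstm_dims=hparams.tts_lstm_dims, postnet_K=hparams.tts_postnet_K,
                 num_highways=hparams.tts_num_highways, dropout=hparams.tts_dropout,
                 stop_threshold=hparams.tts_stop_threshold,
                 speaker_embedding_size=hparams.speaker_embedding_size)

model.r = 2  # reduction factor: mel frames emitted per decoder iteration

chars = torch.randint(1, len(symbols), (1, 20))        # fake character IDs
embed = torch.rand(1, hparams.speaker_embedding_size)  # fake speaker embedding
mels, linear, attn = model.generate(chars, embed, steps=200)
print(mels.shape)  # (1, 80, <=200); untrained weights produce noise, but the shapes hold
```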
backend/synthesizer/utils/__init__.py ADDED
@@ -0,0 +1,45 @@
1
+ import torch
2
+
3
+
4
+ _output_ref = None
5
+ _replicas_ref = None
6
+
7
+ def data_parallel_workaround(model, *input):
8
+ global _output_ref
9
+ global _replicas_ref
10
+ device_ids = list(range(torch.cuda.device_count()))
11
+ output_device = device_ids[0]
12
+ replicas = torch.nn.parallel.replicate(model, device_ids)
13
+ # input.shape = (num_args, batch, ...)
14
+ inputs = torch.nn.parallel.scatter(input, device_ids)
15
+ # inputs.shape = (num_gpus, num_args, batch/num_gpus, ...)
16
+ replicas = replicas[:len(inputs)]
17
+ outputs = torch.nn.parallel.parallel_apply(replicas, inputs)
18
+ y_hat = torch.nn.parallel.gather(outputs, output_device)
19
+ _output_ref = outputs
20
+ _replicas_ref = replicas
21
+ return y_hat
22
+
23
+
24
+ class ValueWindow():
25
+ def __init__(self, window_size=100):
26
+ self._window_size = window_size
27
+ self._values = []
28
+
29
+ def append(self, x):
30
+ self._values = self._values[-(self._window_size - 1):] + [x]
31
+
32
+ @property
33
+ def sum(self):
34
+ return sum(self._values)
35
+
36
+ @property
37
+ def count(self):
38
+ return len(self._values)
39
+
40
+ @property
41
+ def average(self):
42
+ return self.sum / max(1, self.count)
43
+
44
+ def reset(self):
45
+ self._values = []
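`ValueWindow` is a tiny running-average helper (typically used for smoothing training losses). A quick sketch of its behaviour:

```python
from synthesizer.utils import ValueWindow

loss_window = ValueWindow(window_size=3)
for loss in [4.0, 3.0, 2.0, 1.0]:
    loss_window.append(loss)

print(loss_window.count)    # 3 -- only the last window_size values are kept
print(loss_window.average)  # (3.0 + 2.0 + 1.0) / 3 == 2.0
```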
backend/synthesizer/utils/cleaners.py ADDED
@@ -0,0 +1,88 @@
1
+ """
2
+ Cleaners are transformations that run over the input text at both training and eval time.
3
+
4
+ Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
5
+ hyperparameter. Some cleaners are English-specific. You'll typically want to use:
6
+ 1. "english_cleaners" for English text
7
+ 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
8
+ the Unidecode library (https://pypi.python.org/pypi/Unidecode)
9
+ 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
10
+ the symbols in symbols.py to match your data).
11
+ """
12
+ import re
13
+ from unidecode import unidecode
14
+ from .numbers import normalize_numbers
15
+
16
+
17
+ # Regular expression matching whitespace:
18
+ _whitespace_re = re.compile(r"\s+")
19
+
20
+ # List of (regular expression, replacement) pairs for abbreviations:
21
+ _abbreviations = [(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [
22
+ ("mrs", "misess"),
23
+ ("mr", "mister"),
24
+ ("dr", "doctor"),
25
+ ("st", "saint"),
26
+ ("co", "company"),
27
+ ("jr", "junior"),
28
+ ("maj", "major"),
29
+ ("gen", "general"),
30
+ ("drs", "doctors"),
31
+ ("rev", "reverend"),
32
+ ("lt", "lieutenant"),
33
+ ("hon", "honorable"),
34
+ ("sgt", "sergeant"),
35
+ ("capt", "captain"),
36
+ ("esq", "esquire"),
37
+ ("ltd", "limited"),
38
+ ("col", "colonel"),
39
+ ("ft", "fort"),
40
+ ]]
41
+
42
+
43
+ def expand_abbreviations(text):
44
+ for regex, replacement in _abbreviations:
45
+ text = re.sub(regex, replacement, text)
46
+ return text
47
+
48
+
49
+ def expand_numbers(text):
50
+ return normalize_numbers(text)
51
+
52
+
53
+ def lowercase(text):
54
+ """lowercase input tokens."""
55
+ return text.lower()
56
+
57
+
58
+ def collapse_whitespace(text):
59
+ return re.sub(_whitespace_re, " ", text)
60
+
61
+
62
+ def convert_to_ascii(text):
63
+ return unidecode(text)
64
+
65
+
66
+ def basic_cleaners(text):
67
+ """Basic pipeline that lowercases and collapses whitespace without transliteration."""
68
+ text = lowercase(text)
69
+ text = collapse_whitespace(text)
70
+ return text
71
+
72
+
73
+ def transliteration_cleaners(text):
74
+ """Pipeline for non-English text that transliterates to ASCII."""
75
+ text = convert_to_ascii(text)
76
+ text = lowercase(text)
77
+ text = collapse_whitespace(text)
78
+ return text
79
+
80
+
81
+ def english_cleaners(text):
82
+ """Pipeline for English text, including number and abbreviation expansion."""
83
+ text = convert_to_ascii(text)
84
+ text = lowercase(text)
85
+ text = expand_numbers(text)
86
+ text = expand_abbreviations(text)
87
+ text = collapse_whitespace(text)
88
+ return text
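The cleaner pipeline above transliterates, lowercases, and expands numbers and abbreviations before collapsing whitespace. A rough example; the exact wording of the number expansion depends on the installed `inflect` version:

```python
from synthesizer.utils.cleaners import english_cleaners

print(english_cleaners("Dr. Smith lives at 221 Baker St."))
# -> roughly: "doctor smith lives at two hundred twenty-one baker saint"
```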
backend/synthesizer/utils/numbers.py ADDED
@@ -0,0 +1,69 @@
1
+ import re
2
+ import inflect
3
+
4
+
5
+ _inflect = inflect.engine()
6
+ _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
7
+ _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
8
+ _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
9
+ _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
10
+ _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
11
+ _number_re = re.compile(r"[0-9]+")
12
+
13
+
14
+ def _remove_commas(m):
15
+ return m.group(1).replace(",", "")
16
+
17
+
18
+ def _expand_decimal_point(m):
19
+ return m.group(1).replace(".", " point ")
20
+
21
+
22
+ def _expand_dollars(m):
23
+ match = m.group(1)
24
+ parts = match.split(".")
25
+ if len(parts) > 2:
26
+ return match + " dollars" # Unexpected format
27
+ dollars = int(parts[0]) if parts[0] else 0
28
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
29
+ if dollars and cents:
30
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
31
+ cent_unit = "cent" if cents == 1 else "cents"
32
+ return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
33
+ elif dollars:
34
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
35
+ return "%s %s" % (dollars, dollar_unit)
36
+ elif cents:
37
+ cent_unit = "cent" if cents == 1 else "cents"
38
+ return "%s %s" % (cents, cent_unit)
39
+ else:
40
+ return "zero dollars"
41
+
42
+
43
+ def _expand_ordinal(m):
44
+ return _inflect.number_to_words(m.group(0))
45
+
46
+
47
+ def _expand_number(m):
48
+ num = int(m.group(0))
49
+ if num > 1000 and num < 3000:
50
+ if num == 2000:
51
+ return "two thousand"
52
+ elif num > 2000 and num < 2010:
53
+ return "two thousand " + _inflect.number_to_words(num % 100)
54
+ elif num % 100 == 0:
55
+ return _inflect.number_to_words(num // 100) + " hundred"
56
+ else:
57
+ return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
58
+ else:
59
+ return _inflect.number_to_words(num, andword="")
60
+
61
+
62
+ def normalize_numbers(text):
63
+ text = re.sub(_comma_number_re, _remove_commas, text)
64
+ text = re.sub(_pounds_re, r"\1 pounds", text)
65
+ text = re.sub(_dollars_re, _expand_dollars, text)
66
+ text = re.sub(_decimal_number_re, _expand_decimal_point, text)
67
+ text = re.sub(_ordinal_re, _expand_ordinal, text)
68
+ text = re.sub(_number_re, _expand_number, text)
69
+ return text
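A quick illustration of `normalize_numbers`; the exact phrasing comes from `inflect`, so treat the expected output as approximate:

```python
from synthesizer.utils.numbers import normalize_numbers

print(normalize_numbers("The ticket cost $5.50 on the 3rd of May, 2019."))
# -> roughly: "The ticket cost 5 dollars, 50 cents on the third of May, twenty nineteen."
```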
backend/synthesizer/utils/symbols.py ADDED
@@ -0,0 +1,17 @@
1
+ """
2
+ Defines the set of symbols used in text input to the model.
3
+
4
+ The default is a set of ASCII characters that works well for English or text that has been run
5
+ through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
6
+ """
7
+ # from . import cmudict
8
+
9
+ _pad = "_"
10
+ _eos = "~"
11
+ _characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'\"(),-.:;? "
12
+
13
+ # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
14
+ #_arpabet = ["@" + s for s in cmudict.valid_symbols]
15
+
16
+ # Export all symbols:
17
+ symbols = [_pad, _eos] + list(_characters) #+ _arpabet
backend/synthesizer/utils/text.py ADDED
@@ -0,0 +1,75 @@
1
+ from .symbols import symbols
2
+ from . import cleaners
3
+ import re
4
+
5
+
6
+ # Mappings from symbol to numeric ID and vice versa:
7
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
8
+ _id_to_symbol = {i: s for i, s in enumerate(symbols)}
9
+
10
+ # Regular expression matching text enclosed in curly braces:
11
+ _curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
12
+
13
+
14
+ def text_to_sequence(text, cleaner_names):
15
+ """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
16
+
17
+ The text can optionally have ARPAbet sequences enclosed in curly braces embedded
18
+ in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
19
+
20
+ Args:
21
+ text: string to convert to a sequence
22
+ cleaner_names: names of the cleaner functions to run the text through
23
+
24
+ Returns:
25
+ List of integers corresponding to the symbols in the text
26
+ """
27
+ sequence = []
28
+
29
+ # Check for curly braces and treat their contents as ARPAbet:
30
+ while len(text):
31
+ m = _curly_re.match(text)
32
+ if not m:
33
+ sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
34
+ break
35
+ sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
36
+ sequence += _arpabet_to_sequence(m.group(2))
37
+ text = m.group(3)
38
+
39
+ # Append EOS token
40
+ sequence.append(_symbol_to_id["~"])
41
+ return sequence
42
+
43
+
44
+ def sequence_to_text(sequence):
45
+ """Converts a sequence of IDs back to a string"""
46
+ result = ""
47
+ for symbol_id in sequence:
48
+ if symbol_id in _id_to_symbol:
49
+ s = _id_to_symbol[symbol_id]
50
+ # Enclose ARPAbet back in curly braces:
51
+ if len(s) > 1 and s[0] == "@":
52
+ s = "{%s}" % s[1:]
53
+ result += s
54
+ return result.replace("}{", " ")
55
+
56
+
57
+ def _clean_text(text, cleaner_names):
58
+ for name in cleaner_names:
59
+ cleaner = getattr(cleaners, name)
60
+ if not cleaner:
61
+ raise Exception("Unknown cleaner: %s" % name)
62
+ text = cleaner(text)
63
+ return text
64
+
65
+
66
+ def _symbols_to_sequence(symbols):
67
+ return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
68
+
69
+
70
+ def _arpabet_to_sequence(text):
71
+ return _symbols_to_sequence(["@" + s for s in text.split()])
72
+
73
+
74
+ def _should_keep_symbol(s):
75
+ return s in _symbol_to_id and s not in ("_", "~")
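A round-trip sketch of the text front end, assuming `backend/` is on the Python path:

```python
from synthesizer.utils.text import text_to_sequence, sequence_to_text

seq = text_to_sequence("Hello, world!", ["english_cleaners"])
print(seq)                    # symbol IDs, ending with the EOS id for "~"
print(sequence_to_text(seq))  # "hello, world!~" after cleaning
```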
backend/wsgi.py ADDED
@@ -0,0 +1,15 @@
1
+ """Gunicorn entry point for the voice cloning backend."""
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ # Ensure backend directory is in the path for imports
7
+ backend_dir = Path(__file__).parent
8
+ if str(backend_dir) not in sys.path:
9
+ sys.path.insert(0, str(backend_dir))
10
+
11
+ from app import app
12
+
13
+
14
+ if __name__ == "__main__":
15
+ app.run()
frontend/.env.development ADDED
@@ -0,0 +1,4 @@
1
+ # Local development
2
+ VITE_API_URL=http://localhost:5000
3
+ FLASK_ENV=development
4
+ DEBUG=true
frontend/.env.production ADDED
@@ -0,0 +1,2 @@
1
+ # Production deployment
2
+ VITE_API_URL=https://voice-cloning-personalized-speech.onrender.com
frontend/.gitignore ADDED
@@ -0,0 +1,99 @@
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ pnpm-debug.log*
8
+ lerna-debug.log*
9
+
10
+ # Dependencies
11
+ /node_modules
12
+ /.pnp
13
+ .pnp.js
14
+
15
+ # Testing
16
+ /coverage
17
+
18
+ # Next.js
19
+ /.next/
20
+ /out/
21
+
22
+ # Production
23
+ /build
24
+ /dist
25
+ /dist-ssr
26
+
27
+ # Local env files
28
+ .env*.local
29
+ .env
30
+
31
+ # Debug logs
32
+ npm-debug.log*
33
+ yarn-debug.log*
34
+ yarn-error.log*
35
+ pnpm-debug.log*
36
+
37
+ # Editor directories and files
38
+ .idea
39
+ .vscode/*
40
+ !.vscode/extensions.json
41
+ .DS_Store
42
+ *.suo
43
+ *.ntvs*
44
+ *.njsproj
45
+ *.sln
46
+ *.sw?
47
+
48
+ # System Files
49
+ .DS_Store
50
+ Thumbs.db
51
+
52
+ # Cache
53
+ .cache/
54
+ .temp/
55
+ .tmp/
56
+
57
+ # Misc
58
+ .vercel
59
+ .next
60
+ .vercel_build_output
61
+
62
+ # Local Netlify folder
63
+ .netlify
64
+
65
+ # Optional npm cache directory
66
+ .npm
67
+
68
+ # Optional eslint cache
69
+ .eslintcache
70
+
71
+ # Optional REPL history
72
+ .node_repl_history
73
+
74
+ # Output of 'npm pack'
75
+ *.tgz
76
+
77
+ # Yarn Integrity file
78
+ .yarn-integrity
79
+
80
+ # dotenv environment variables file
81
+ .env*.local
82
+ .env
83
+
84
+ # parcel-bundler cache (https://parceljs.org/)
85
+ .parcel-cache
86
+
87
+ # Next.js build output
88
+ .next
89
+ out
90
+
91
+ # Vercel
92
+ .vercel
93
+
94
+ # TypeScript
95
+ *.tsbuildinfo
96
+ next-env.d.ts
97
+
98
+ # Optional stylelint cache
99
+ .stylelintcache
frontend/README.md ADDED
@@ -0,0 +1,111 @@
1
+ # Voice Cloning – Personalized Speech Synthesis (Frontend)
2
+
3
+ > Note: On first load, please allow 2–3 minutes. The app initializes several 3D elements that can take time to fetch and compile in the browser, such as:
4
+ > - Spline-powered scenes and backgrounds
5
+ > - Interactive Orb (Three.js) with real-time interaction
6
+ > - Particle Field and Floating Elements
7
+ > - Speaker/Microphone 3D scenes and visualizers
8
+
9
+ This repository contains the fully custom-built frontend for a Voice Cloning and Personalized Speech Synthesis application.
10
+
11
+ - Modern, responsive UI with smooth 3D visuals and an accessible design system.
12
+
13
+ ---
14
+
15
+ ## Overview
16
+
17
+ The frontend provides:
18
+
19
+ - A clean interface to enroll voice samples and synthesize speech.
20
+ - Real-time audio recording, waveform visualization, and playback controls.
21
+ - Rich 3D/animated visuals to enhance the user experience (Spline and Three.js).
22
+ - A component-driven architecture for maintainability and reusability.
23
+
24
+ ---
25
+
26
+ ## Features
27
+
28
+ - Audio
29
+ - Audio recorder and waveform visualization
30
+ - Error boundaries and robust UI states
31
+
32
+ - 3D & Visuals
33
+ - Spline background scenes
34
+ - Interactive Orb, Particle Field, Floating Elements
35
+ - Speaker/Microphone scenes and animated transitions
36
+
37
+ - UI/UX
38
+ - shadcn/ui components with Tailwind CSS
39
+ - Responsive, accessible design
40
+ - Theming and utility-first styling
41
+
42
+ ---
43
+
44
+ ## Tech Stack
45
+
46
+ - Vite (bundler & dev server)
47
+ - React (UI) + TypeScript
48
+ - Tailwind CSS + PostCSS
49
+ - shadcn/ui component library
50
+ - Three.js & Spline (3D scenes and interactions)
51
+ - ESLint (code quality) and modern TS configs
52
+
53
+ ---
54
+
55
+ ## Getting Started
56
+
57
+ Prerequisites:
58
+
59
+ - Node.js and npm installed (recommend using nvm)
60
+
61
+ Install and run:
62
+
63
+ ```bash
64
+ npm install
65
+ npm run dev
66
+ ```
67
+
68
+ Open the local URL printed in the terminal. First load may take 2–3 minutes due to 3D assets.
69
+
70
+ ---
71
+
72
+ ## Available Scripts
73
+
74
+ - `npm run dev` – Start the development server
75
+ - `npm run build` – Build for production into `dist/`
76
+ - `npm run preview` – Preview the production build locally
77
+
78
+ ---
79
+
80
+ ## Project Structure (high level)
81
+
82
+ - `src/`
83
+ - `components/`
84
+ - `audio/` – Recorder, waveform, audio UI
85
+ - `three/` – Interactive Orb, Particle Field, Speaker/Mic scenes, Spline background
86
+ - `ui/` – shadcn/ui component wrappers and utilities
87
+ - `pages/` – App pages and routing
88
+ - `lib/` – Utility functions
89
+
90
+ - `public/` – Static assets (icons, placeholders, robots.txt)
91
+ - `tailwind.config.ts`, `postcss.config.js` – Styling configuration
92
+ - `eslint.config.js` – Linting configuration
93
+
94
+ ---
95
+
96
+ ## Deployment
97
+
98
+ Build a production bundle:
99
+
100
+ ```bash
101
+ npm run build
102
+ npm run preview
103
+ ```
104
+
105
+ Deploy the contents of `dist/` to your hosting of choice (e.g., Netlify, Vercel, GitHub Pages, or a static server).
106
+
107
+ ---
108
+
109
+ ## License
110
+
111
+ Copyright © the project owner. All rights reserved.
frontend/components.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "$schema": "https://ui.shadcn.com/schema.json",
3
+ "style": "default",
4
+ "rsc": false,
5
+ "tsx": true,
6
+ "tailwind": {
7
+ "config": "tailwind.config.ts",
8
+ "css": "src/index.css",
9
+ "baseColor": "slate",
10
+ "cssVariables": true,
11
+ "prefix": ""
12
+ },
13
+ "aliases": {
14
+ "components": "@/components",
15
+ "utils": "@/lib/utils",
16
+ "ui": "@/components/ui",
17
+ "lib": "@/lib",
18
+ "hooks": "@/hooks"
19
+ }
20
+ }
frontend/eslint.config.js ADDED
@@ -0,0 +1,29 @@
1
+ import js from "@eslint/js";
2
+ import globals from "globals";
3
+ import reactHooks from "eslint-plugin-react-hooks";
4
+ import reactRefresh from "eslint-plugin-react-refresh";
5
+ import tseslint from "typescript-eslint";
6
+
7
+ export default tseslint.config(
8
+ { ignores: ["dist"] },
9
+ {
10
+ extends: [js.configs.recommended, ...tseslint.configs.recommended],
11
+ files: ["**/*.{ts,tsx}"],
12
+ languageOptions: {
13
+ ecmaVersion: 2020,
14
+ globals: globals.browser,
15
+ },
16
+ plugins: {
17
+ "react-hooks": reactHooks,
18
+ "react-refresh": reactRefresh,
19
+ },
20
+ rules: {
21
+ ...reactHooks.configs.recommended.rules,
22
+ "react-refresh/only-export-components": [
23
+ "warn",
24
+ { allowConstantExport: true },
25
+ ],
26
+ "@typescript-eslint/no-unused-vars": "off",
27
+ },
28
+ }
29
+ );
frontend/index.html ADDED
@@ -0,0 +1,24 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>Dhwanii Voice Cloning AI</title>
7
+ <meta name="description" content="Voice cloning and speech synthesis demo" />
8
+ <meta name="author" content="Dhwanii Voice Cloning AI" />
9
+
10
+ <meta property="og:title" content="Dhwanii Voice Cloning AI" />
11
+ <meta property="og:description" content="Voice cloning and speech synthesis demo" />
12
+ <meta property="og:type" content="website" />
13
+ <meta property="og:image" content="/favicon.ico" />
14
+
15
+ <meta name="twitter:card" content="summary" />
16
+ <meta name="twitter:site" content="@Arjitsharma00074" />
17
+ <meta name="twitter:image" content="/favicon.ico" />
18
+ </head>
19
+
20
+ <body>
21
+ <div id="root"></div>
22
+ <script type="module" src="/src/main.tsx"></script>
23
+ </body>
24
+ </html>
frontend/package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
frontend/package.json ADDED
@@ -0,0 +1,88 @@
1
+ {
2
+ "name": "vite_react_shadcn_ts",
3
+ "private": true,
4
+ "version": "0.0.0",
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite",
8
+ "build": "vite build",
9
+ "build:dev": "vite build --mode development",
10
+ "lint": "eslint .",
11
+ "preview": "vite preview"
12
+ },
13
+ "dependencies": {
14
+ "@hookform/resolvers": "^3.10.0",
15
+ "@radix-ui/react-accordion": "^1.2.11",
16
+ "@radix-ui/react-alert-dialog": "^1.1.14",
17
+ "@radix-ui/react-aspect-ratio": "^1.1.7",
18
+ "@radix-ui/react-avatar": "^1.1.10",
19
+ "@radix-ui/react-checkbox": "^1.3.2",
20
+ "@radix-ui/react-collapsible": "^1.1.11",
21
+ "@radix-ui/react-context-menu": "^2.2.15",
22
+ "@radix-ui/react-dialog": "^1.1.14",
23
+ "@radix-ui/react-dropdown-menu": "^2.1.15",
24
+ "@radix-ui/react-hover-card": "^1.1.14",
25
+ "@radix-ui/react-label": "^2.1.7",
26
+ "@radix-ui/react-menubar": "^1.1.15",
27
+ "@radix-ui/react-navigation-menu": "^1.2.13",
28
+ "@radix-ui/react-popover": "^1.1.14",
29
+ "@radix-ui/react-progress": "^1.1.7",
30
+ "@radix-ui/react-radio-group": "^1.3.7",
31
+ "@radix-ui/react-scroll-area": "^1.2.9",
32
+ "@radix-ui/react-select": "^2.2.5",
33
+ "@radix-ui/react-separator": "^1.1.7",
34
+ "@radix-ui/react-slider": "^1.3.5",
35
+ "@radix-ui/react-slot": "^1.2.3",
36
+ "@radix-ui/react-switch": "^1.2.5",
37
+ "@radix-ui/react-tabs": "^1.1.12",
38
+ "@radix-ui/react-toast": "^1.2.14",
39
+ "@radix-ui/react-toggle": "^1.1.9",
40
+ "@radix-ui/react-toggle-group": "^1.1.10",
41
+ "@radix-ui/react-tooltip": "^1.2.7",
42
+ "@react-three/drei": "^9.122.0",
43
+ "@react-three/fiber": "^8.18.0",
44
+ "@react-three/postprocessing": "^2.19.1",
45
+ "@splinetool/react-spline": "^4.1.0",
46
+ "@splinetool/runtime": "^1.10.55",
47
+ "@tanstack/react-query": "^5.83.0",
48
+ "class-variance-authority": "^0.7.1",
49
+ "clsx": "^2.1.1",
50
+ "cmdk": "^1.1.1",
51
+ "date-fns": "^3.6.0",
52
+ "embla-carousel-react": "^8.6.0",
53
+ "input-otp": "^1.4.2",
54
+ "lucide-react": "^0.462.0",
55
+ "next-themes": "^0.3.0",
56
+ "react": "^18.3.1",
57
+ "react-day-picker": "^8.10.1",
58
+ "react-dom": "^18.3.1",
59
+ "react-hook-form": "^7.61.1",
60
+ "react-resizable-panels": "^2.1.9",
61
+ "react-router-dom": "^6.30.1",
62
+ "recharts": "^2.15.4",
63
+ "sonner": "^1.7.4",
64
+ "tailwind-merge": "^2.6.0",
65
+ "tailwindcss-animate": "^1.0.7",
66
+ "three": "^0.169.0",
67
+ "vaul": "^0.9.9",
68
+ "zod": "^3.25.76"
69
+ },
70
+ "devDependencies": {
71
+ "@eslint/js": "^9.32.0",
72
+ "@tailwindcss/typography": "^0.5.16",
73
+ "@types/node": "^22.16.5",
74
+ "@types/react": "^18.3.23",
75
+ "@types/react-dom": "^18.3.7",
76
+ "@vitejs/plugin-react-swc": "^3.11.0",
77
+ "autoprefixer": "^10.4.21",
78
+ "eslint": "^9.32.0",
79
+ "eslint-plugin-react-hooks": "^5.2.0",
80
+ "eslint-plugin-react-refresh": "^0.4.20",
81
+ "globals": "^15.15.0",
82
+ "postcss": "^8.5.6",
83
+ "tailwindcss": "^3.4.17",
84
+ "typescript": "^5.8.3",
85
+ "typescript-eslint": "^8.38.0",
86
+ "vite": "^7.1.4"
87
+ }
88
+ }
frontend/postcss.config.js ADDED
@@ -0,0 +1,6 @@
1
+ export default {
2
+ plugins: {
3
+ tailwindcss: {},
4
+ autoprefixer: {},
5
+ },
6
+ }
frontend/public/placeholder.svg ADDED
frontend/public/robots.txt ADDED
@@ -0,0 +1,14 @@
1
+ User-agent: Googlebot
2
+ Allow: /
3
+
4
+ User-agent: Bingbot
5
+ Allow: /
6
+
7
+ User-agent: Twitterbot
8
+ Allow: /
9
+
10
+ User-agent: facebookexternalhit
11
+ Allow: /
12
+
13
+ User-agent: *
14
+ Allow: /