Initial voice cloning backend with all dependencies
This view is limited to 50 files because it contains too many changes.
- .dockerignore +18 -0
- .gitignore +62 -0
- Dockerfile +25 -0
- README.md +570 -10
- backend/.env.example +15 -0
- backend/__init__.py +1 -0
- backend/app/__init__.py +34 -0
- backend/app/routes.py +409 -0
- backend/app/vocoder/audio.py +108 -0
- backend/app/vocoder/display.py +127 -0
- backend/app/vocoder/distribution.py +132 -0
- backend/app/vocoder/hparams.py +44 -0
- backend/app/vocoder/inference.py +83 -0
- backend/app/vocoder/models/fatchord_version.py +434 -0
- backend/app/voice_cloning.py +108 -0
- backend/download_models.py +54 -0
- backend/encoder/__init__.py +0 -0
- backend/encoder/audio.py +117 -0
- backend/encoder/inference.py +178 -0
- backend/encoder/model.py +135 -0
- backend/encoder/params_data.py +29 -0
- backend/encoder/params_model.py +11 -0
- backend/enrolled_voices/voice_26bfa1ef.mp3 +0 -0
- backend/enrolled_voices/voice_72beeda9.mp3 +0 -0
- backend/enrolled_voices/voices.json +100 -0
- backend/requirements.txt +14 -0
- backend/runtime.txt +1 -0
- backend/synthesizer/__init__.py +1 -0
- backend/synthesizer/audio.py +211 -0
- backend/synthesizer/hparams.py +92 -0
- backend/synthesizer/inference.py +165 -0
- backend/synthesizer/models/tacotron.py +542 -0
- backend/synthesizer/utils/__init__.py +45 -0
- backend/synthesizer/utils/cleaners.py +88 -0
- backend/synthesizer/utils/numbers.py +69 -0
- backend/synthesizer/utils/symbols.py +17 -0
- backend/synthesizer/utils/text.py +75 -0
- backend/wsgi.py +15 -0
- frontend/.env.development +4 -0
- frontend/.env.production +2 -0
- frontend/.gitignore +99 -0
- frontend/README.md +111 -0
- frontend/components.json +20 -0
- frontend/eslint.config.js +29 -0
- frontend/index.html +24 -0
- frontend/package-lock.json +0 -0
- frontend/package.json +88 -0
- frontend/postcss.config.js +6 -0
- frontend/public/placeholder.svg +1 -0
- frontend/public/robots.txt +14 -0
.dockerignore
ADDED
@@ -0,0 +1,18 @@
__pycache__
*.pyc
.git
.env
.env.local
node_modules
dist
build
.DS_Store
*.log
.vscode
.idea
*.egg-info
.pytest_cache
frontend/node_modules
.next
.nuxt
.cache
.gitignore
ADDED
@@ -0,0 +1,62 @@
# Model files - downloaded at build time, not stored in git
backend/models/default/*.pt
models/default/*.pt
*.pt

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
venv/
ENV/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Environment variables
.env
.env.local
.env.*.local
backend/.env

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
Thumbs.db

# Node/Frontend
node_modules/
dist/
.next/
out/

# Build artifacts
outputs/
temp_uploads/
enrolled_voices/*.wav
enrolled_voices/*.mp3

# Cache
.cache/
.pytest_cache/
Dockerfile
ADDED
@@ -0,0 +1,25 @@
FROM python:3.10-slim

# Install system dependencies
RUN apt-get update && apt-get install -y \
    libsndfile1 libsndfile1-dev \
    ffmpeg \
    git \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy entire project
COPY . .

# Install Python dependencies
RUN pip install --no-cache-dir -r backend/requirements.txt

# Download models during build
RUN cd backend && python download_models.py

# Expose port (HF Spaces uses 7860)
EXPOSE 7860

# Start the application
CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "1", "--timeout", "300", "backend.wsgi:app"]
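The CMD line points gunicorn at `backend.wsgi:app`. That module is listed in the file summary above but its body is not shown in this view; a minimal version compatible with this entrypoint would look roughly like the sketch below (an assumption, not the actual file contents).

```python
# Hypothetical sketch of backend/wsgi.py (the real file is not shown in this view).
# gunicorn resolves "backend.wsgi:app", so the module only needs to expose `app`;
# backend/app/__init__.py already builds one via its create_app() factory.
from backend.app import app

if __name__ == "__main__":
    # Fallback for local debugging; the container runs gunicorn on port 7860.
    app.run(host="0.0.0.0", port=7860)
```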
README.md
CHANGED
@@ -1,12 +1,572 @@
----
-title: Voice Cloning Backend
-emoji: 💻
-colorFrom: red
-colorTo: green
-sdk: docker
-pinned: false
-license: mit
-short_description: 'AI-powered Voice Cloning '
----

# Real-Time Voice Cloning (RTVC)

A complete full-stack voice cloning application with a React frontend and a PyTorch backend that can synthesize speech in anyone's voice from just a few seconds of reference audio.

[](https://www.python.org/downloads/)
[](https://pytorch.org/)
[](https://reactjs.org/)
[](https://www.typescriptlang.org/)
[](LICENSE)

## Features

- **Full Stack Application**: Modern React UI + Flask API + PyTorch backend
- **Voice Enrollment**: Record or upload voice samples directly in the browser
- **Speech Synthesis**: Generate cloned speech with an intuitive interface
- **Voice Cloning**: Clone any voice with just 3-10 seconds of audio
- **Real-Time Generation**: Generate speech at 2-3x real-time speed on CPU
- **High Quality**: Natural-sounding synthetic speech using state-of-the-art models
- **Easy to Use**: Beautiful UI with 3D visualizations and audio waveforms
- **Multiple Formats**: Supports WAV, MP3, M4A, FLAC input audio
- **Multi-Language**: Supports English and Hindi text-to-speech

## Table of Contents

- [Demo](#demo)
- [Quick Start (Full Stack)](#quick-start-full-stack)
- [Deployment](#deployment)
- [How It Works](#how-it-works)
- [Installation](#installation)
- [Project Structure](#project-structure)
- [Usage Examples](#usage-examples)
- [API Documentation](#api-documentation)
- [Troubleshooting](#troubleshooting)
- [Technical Details](#technical-details)
- [Credits](#credits)

## Demo

**Frontend UI**: Modern React interface with 3D visualizations
**Voice Enrollment**: Record/upload voice samples → Backend saves to database
**Speech Synthesis**: Select voice + Enter text → Backend generates cloned speech
**Playback**: Listen to generated audio directly in browser or download

## Quick Start (Full Stack)

### Option 1: Using the Startup Script (Easiest)

```powershell
# Windows PowerShell
cd rtvc
.\start_app.ps1
```

This will:
1. Start the Backend API server (port 5000)
2. Start the Frontend dev server (port 8080)
3. Open your browser to http://localhost:8080

### Option 2: Manual Start

**Terminal 1 - Backend API:**
```bash
cd rtvc
python api_server.py
```

**Terminal 2 - Frontend:**
```bash
cd "rtvc/Frontend Voice Cloning"
npm run dev
```

Then open http://localhost:8080 in your browser.

## Deployment

### Production Deployment Stack

**Frontend**: Netlify (Free tier)
**Backend**: Render (Free tier)
**Models**: HuggingFace Hub (Free)

See [DEPLOYMENT.md](DEPLOYMENT.md) for the complete deployment guide.

#### Quick Deployment

1. **Deploy Backend to Render**
   - Push to GitHub
   - Connect Render to GitHub repo
   - Use `render.yaml` configuration
   - Models auto-download on first deploy (~10 minutes)

2. **Deploy Frontend to Netlify**
   - Connect Netlify to GitHub repo
   - Set base directory: `frontend`
   - Environment: `VITE_API_URL=your-render-backend-url`

3. **Test**
   - Visit your Netlify URL
   - API calls automatically route to Render backend

**Pricing**: Free tier for both (with optional paid upgrades)

### Using the Application

1. **Enroll a Voice**:
   - Go to "Voice Enrollment" section
   - Enter a voice name
   - Record audio (3-10 seconds) or upload a file
   - Click "Enroll Voice"

2. **Generate Speech**:
   - Go to "Speech Synthesis" section
   - Select your enrolled voice
   - Enter text to synthesize
   - Click "Generate Speech"
   - Play or download the result

For detailed integration information, see [INTEGRATION_GUIDE.md](INTEGRATION_GUIDE.md).

## How It Works

The system uses a 3-stage pipeline based on the SV2TTS (Speaker Verification to Text-to-Speech) architecture:

```
Reference Audio → [Encoder] → Speaker Embedding (256-d vector)
                                      ↓
Text Input → [Synthesizer (Tacotron)] → Mel-Spectrogram
                                      ↓
                  [Vocoder (WaveRNN)] → Audio Output
```

### Pipeline Stages:

1. **Speaker Encoder** - Extracts a unique voice "fingerprint" from reference audio
2. **Synthesizer** - Generates mel-spectrograms from text conditioned on the speaker embedding
3. **Vocoder** - Converts mel-spectrograms to high-quality audio waveforms
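For readers who prefer code to the diagram above, the three stages map onto the repository's `encoder/`, `synthesizer/`, and `vocoder/` packages roughly as follows. This is a hedged sketch, not project documentation: the function and class names are assumed from the upstream Real-Time Voice Cloning project that this code vendors, so check them against the respective `inference.py` files before relying on them.

```python
# Hedged sketch of the 3-stage SV2TTS pipeline. Names assume the upstream
# RTVC-style inference API exposed by encoder/, synthesizer/ and vocoder/.
from pathlib import Path
import soundfile as sf

from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder

models = Path("models/default")
encoder.load_model(models / "encoder.pt")
synthesizer = Synthesizer(models / "synthesizer.pt")
vocoder.load_model(models / "vocoder.pt")

# Stage 1: reference audio -> 256-d speaker embedding
wav = encoder.preprocess_wav(Path("sample/your_voice.mp3"))
embedding = encoder.embed_utterance(wav)

# Stage 2: text + embedding -> mel-spectrogram
specs = synthesizer.synthesize_spectrograms(
    ["Hello, this is my cloned voice!"], [embedding])

# Stage 3: mel-spectrogram -> waveform
audio = vocoder.infer_waveform(specs[0])
sf.write("outputs/cloned_voice.wav", audio, Synthesizer.sample_rate)
```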
## Installation

### Prerequisites

- Python 3.11 or higher
- Windows/Linux/macOS
- ~2 GB disk space for models
- 4 GB RAM minimum (8 GB recommended)

### Step 1: Clone the Repository

```bash
git clone https://github.com/yourusername/rtvc.git
cd rtvc
```

### Step 2: Install Dependencies

```bash
pip install torch numpy librosa scipy soundfile webrtcvad tqdm unidecode inflect matplotlib numba
```

Or install PyTorch with CUDA for GPU acceleration:

```bash
pip install torch --index-url https://download.pytorch.org/whl/cu118
pip install numpy librosa scipy soundfile webrtcvad tqdm unidecode inflect matplotlib numba
```

### Step 3: Download Pretrained Models

Download the pretrained models from [Google Drive](https://drive.google.com/drive/folders/1fU6umc5uQAVR2udZdHX-lDgXYzTyqG_j):

| Model | Size | Description |
|-------|------|-------------|
| encoder.pt | 17 MB | Speaker encoder model |
| synthesizer.pt | 370 MB | Tacotron synthesizer model |
| vocoder.pt | 53 MB | WaveRNN vocoder model |

Place all three files in the `models/default/` directory.

### Step 4: Verify Installation

```bash
python clone_my_voice.py
```

If you see errors about missing models, check that all three `.pt` files are in `models/default/`.

## Quick Start

### Method 1: Simple Script (Recommended)

1. Open `clone_my_voice.py`
2. Edit these lines:

```python
# Your voice sample file
VOICE_FILE = r"sample\your_voice.mp3"

# The text you want to be spoken
TEXT_TO_CLONE = """
Your text here. Can be multiple sentences or even paragraphs!
"""

# Output location
OUTPUT_FILE = r"outputs\cloned_voice.wav"
```

3. Run it:

```bash
python clone_my_voice.py
```

### Method 2: Command Line

```bash
python run_cli.py --voice "path/to/voice.wav" --text "Text to synthesize" --out "output.wav"
```

### Method 3: Advanced Runner Script

```bash
python run_voice_cloning.py
```

Edit the paths and text inside the script before running.

## Project Structure

```
rtvc/
├── clone_my_voice.py        # Simple script - EDIT THIS to clone your voice!
├── run_cli.py               # Command-line interface
│
├── encoder/                 # Speaker Encoder Module
│   ├── __init__.py
│   ├── audio.py             # Audio preprocessing for encoder
│   ├── inference.py         # Encoder inference functions
│   ├── model.py             # SpeakerEncoder neural network
│   ├── params_data.py       # Data hyperparameters
│   └── params_model.py      # Model hyperparameters
│
├── synthesizer/             # Tacotron Synthesizer Module
│   ├── __init__.py
│   ├── audio.py             # Audio processing for synthesizer
│   ├── hparams.py           # All synthesizer hyperparameters
│   ├── inference.py         # Synthesizer inference class
│   │
│   ├── models/
│   │   └── tacotron.py      # Tacotron 2 architecture
│   │
│   └── utils/
│       ├── cleaners.py      # Text cleaning functions
│       ├── numbers.py       # Number-to-text conversion
│       ├── symbols.py       # Character/phoneme symbols
│       └── text.py          # Text-to-sequence conversion
│
├── vocoder/                 # WaveRNN Vocoder Module
│   ├── audio.py             # Audio utilities for vocoder
│   ├── display.py           # Progress display utilities
│   ├── distribution.py      # Probability distributions
│   ├── hparams.py           # Vocoder hyperparameters
│   ├── inference.py         # Vocoder inference functions
│   │
│   └── models/
│       └── fatchord_version.py  # WaveRNN architecture
│
├── utils/
│   └── default_models.py    # Model download utilities
│
├── models/
│   └── default/             # Pretrained models go here
│       ├── encoder.pt       # (17 MB)
│       ├── synthesizer.pt   # (370 MB) - Must download!
│       └── vocoder.pt       # (53 MB)
│
├── sample/                  # Put your voice samples here
│   └── your_voice.mp3
│
└── outputs/                 # Generated audio outputs
    └── cloned_voice.wav
```

### Key Files Explained

| File | Purpose |
|------|---------|
| `clone_my_voice.py` | **START HERE** - Simplest way to clone your voice |
| `run_cli.py` | Command-line tool for voice cloning |
| `encoder/inference.py` | Loads encoder and extracts speaker embeddings |
| `synthesizer/inference.py` | Loads synthesizer and generates mel-spectrograms |
| `vocoder/inference.py` | Loads vocoder and generates waveforms |
| `**/hparams.py` | Configuration files for each module |

## Usage Examples

### Example 1: Basic Voice Cloning

```bash
python clone_my_voice.py
```

Edit `clone_my_voice.py` first:
```python
VOICE_FILE = r"sample\my_voice.mp3"
TEXT_TO_CLONE = "Hello, this is my cloned voice!"
```

### Example 2: Multiple Outputs

```bash
# Generate first output
python run_cli.py --voice "voice.wav" --text "First message" --out "output1.wav"

# Generate second output with same voice
python run_cli.py --voice "voice.wav" --text "Second message" --out "output2.wav"
```

### Example 3: Long Text

```bash
python run_cli.py --voice "voice.wav" --text "This is a very long text that spans multiple sentences. The voice cloning system will synthesize all of it in the reference voice. You can make it as long as you need."
```

### Example 4: Different Voice Samples

```bash
# Clone voice A
python run_cli.py --voice "person_a.wav" --text "Message from person A"

# Clone voice B
python run_cli.py --voice "person_b.wav" --text "Message from person B"
```

## Troubleshooting

### Common Issues

#### "Model file not found"

**Solution**: Download the models from Google Drive and place them in `models/default/`:
- https://drive.google.com/drive/folders/1fU6umc5uQAVR2udZdHX-lDgXYzTyqG_j

Verify file sizes:
```bash
# Windows
dir models\default\*.pt

# Linux/Mac
ls -lh models/default/*.pt
```

Expected sizes:
- encoder.pt: 17,090,379 bytes (17 MB)
- synthesizer.pt: 370,554,559 bytes (370 MB) - Most common issue!
- vocoder.pt: 53,845,290 bytes (53 MB)

#### "Reference voice file not found"

**Solution**: Use absolute paths or check current directory:
```python
# Use absolute path
VOICE_FILE = r"C:\Users\YourName\Desktop\voice.mp3"

# Or relative from project root
VOICE_FILE = r"sample\voice.mp3"
```

#### Output sounds robotic or unclear

**Solutions**:
- Use a higher quality voice sample (16kHz+ sample rate)
- Ensure voice sample is 3-10 seconds long
- Remove background noise from voice sample
- Speak clearly and naturally in the reference audio

#### "AttributeError: module 'numpy' has no attribute 'cumproduct'"

**Solution**: This is already fixed in the code. If you see this:
```bash
pip install --upgrade numpy
```

#### Slow generation on CPU

**Solutions**:
- Normal speed: 2-3x real-time on modern CPUs
- For faster generation, install PyTorch with CUDA:
```bash
pip install torch --index-url https://download.pytorch.org/whl/cu118
```

Then the system will automatically use GPU if available.

### Getting Help

If you encounter other issues:
1. Check the `HOW_TO_RUN.md` file for detailed instructions
2. Verify all models are downloaded correctly
3. Ensure Python 3.11+ is installed
4. Check that all dependencies are installed

## Technical Details

### Audio Specifications

| Parameter | Value |
|-----------|-------|
| Sample Rate | 16,000 Hz |
| Channels | Mono |
| Bit Depth | 16-bit |
| FFT Size | 800 samples (50ms) |
| Hop Size | 200 samples (12.5ms) |
| Mel Channels | 80 (synthesizer/vocoder), 40 (encoder) |

### Model Architectures

#### Speaker Encoder
- **Type**: LSTM + Linear Projection
- **Input**: 40-channel mel-spectrogram
- **Output**: 256-dimensional speaker embedding
- **Parameters**: ~5M

#### Synthesizer (Tacotron 2)
- **Encoder**: CBHG (Convolution Bank + Highway + GRU)
- **Decoder**: Attention-based LSTM
- **PostNet**: 5-layer Residual CNN
- **Parameters**: ~31M

#### Vocoder (WaveRNN)
- **Type**: Recurrent Neural Vocoder
- **Mode**: Raw 9-bit with mu-law
- **Upsample Factors**: (5, 5, 8)
- **Parameters**: ~4.5M

### Text Processing

The system includes sophisticated text normalization:
- **Numbers**: "123" → "one hundred twenty three"
- **Currency**: "$5.50" → "five dollars, fifty cents"
- **Ordinals**: "1st" → "first"
- **Abbreviations**: "Dr." → "doctor"
- **Unicode**: Automatic transliteration to ASCII
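This normalization step can be exercised on its own. The example below is hedged: the module path comes from the project tree above, but the `english_cleaners` function name is assumed from the standard Tacotron-style cleaners this project vendors, so verify it against `synthesizer/utils/cleaners.py`.

```python
# Hedged example of the text normalization step; `english_cleaners` is assumed
# from the standard Tacotron-style utilities under synthesizer/utils/.
from synthesizer.utils.cleaners import english_cleaners

print(english_cleaners("Dr. Smith paid $5.50 for the 1st of 123 copies."))
# Expected output along the lines of:
# "doctor smith paid five dollars, fifty cents for the first of one hundred twenty three copies."
```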
### Performance

| Hardware | Generation Speed |
|----------|------------------|
| CPU (Intel i7) | 2-3x real-time |
| GPU (GTX 1060) | 10-15x real-time |
| GPU (RTX 3080) | 30-50x real-time |

Example: Generating 10 seconds of audio takes ~3-5 seconds on CPU.

## How to Use for Different Applications

### Podcast/Narration
```python
TEXT_TO_CLONE = """
Welcome to today's episode. In this podcast, we'll be discussing
the fascinating world of artificial intelligence and voice synthesis.
Let's dive right in!
"""
```

### Audiobook
```python
TEXT_TO_CLONE = """
Chapter One: The Beginning.
It was a dark and stormy night when everything changed.
The old house stood alone on the hill, its windows dark and unwelcoming.
"""
```

### Voiceover
```python
TEXT_TO_CLONE = """
Introducing the all-new product that will change your life.
With advanced features and intuitive design, it's the perfect solution.
"""
```

### Multiple Languages
The system supports English out of the box. For other languages:
1. Use English transliteration for best results
2. Or modify `synthesizer/utils/cleaners.py` for your language

## Comparison with Other Methods

| Method | Quality | Speed | Setup |
|--------|---------|-------|-------|
| Traditional TTS | Low | Fast | Easy |
| Commercial APIs | High | Fast | API Key Required |
| **This Project** | High | Medium | One-time Setup |
| Training from Scratch | High | Slow | Very Complex |

## Best Practices

### For Best Voice Quality:

1. **Reference Audio**:
   - 3-10 seconds long
   - Clear speech, no background noise
   - Natural speaking tone (not reading/singing)
   - 16kHz+ sample rate if possible

2. **Text Input**:
   - Use proper punctuation for natural pauses
   - Break very long texts into paragraphs
   - Avoid excessive special characters

3. **Output**:
   - Generate shorter clips for better quality
   - Concatenate multiple clips if needed
   - Post-process with audio editing software for polish

## Known Limitations

- Works best with English text
- Requires good quality reference audio
- May not perfectly capture very unique voice characteristics
- Background noise in reference affects output quality
- Very short reference audio (<3 seconds) may produce inconsistent results

## Future Improvements

- [ ] Add GUI interface
- [ ] Support for multiple languages
- [ ] Real-time streaming mode
- [ ] Voice mixing/morphing capabilities
- [ ] Fine-tuning on custom datasets
- [ ] Mobile app version

## Credits

This implementation is based on:
- **SV2TTS**: Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
- **Tacotron 2**: Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions
- **WaveRNN**: Efficient Neural Audio Synthesis

Original research papers:
- [SV2TTS Paper](https://arxiv.org/abs/1806.04558)
- [Tacotron 2 Paper](https://arxiv.org/abs/1712.05884)
- [WaveRNN Paper](https://arxiv.org/abs/1802.08435)

## License

This project is licensed under the MIT License - see the LICENSE file for details.

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

1. Fork the repository
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request

## Show Your Support

If this project helped you, please give it a star!

## Contact

For questions or support, please open an issue on GitHub.

---

**Made with love by the Voice Cloning Community**

*Last Updated: October 30, 2025*
backend/.env.example
ADDED
@@ -0,0 +1,15 @@
# Flask backend environment variables
FLASK_APP=backend.app
FLASK_ENV=production
DEBUG=false

# HuggingFace configuration
HF_HOME=.cache/huggingface

# CORS configuration for production
CORS_ORIGINS=https://your-netlify-site.netlify.app

# Model configuration
MODEL_REPO_ENCODER=AJ50/voice-clone-encoder
MODEL_REPO_SYNTHESIZER=AJ50/voice-clone-synthesizer
MODEL_REPO_VOCODER=AJ50/voice-clone-vocoder
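These are plain environment variables. The sketch below shows one way the `MODEL_REPO_*` values could be used to pull weights from the Hugging Face Hub; it is illustrative only, since the actual download logic lives in `backend/download_models.py`, which is not included in this 50-file view, and the `encoder.pt` filename inside the repo is a hypothetical placeholder.

```python
# Illustrative sketch only: consuming the MODEL_REPO_* variables above.
# The real logic is in backend/download_models.py (not shown in this view);
# the "encoder.pt" filename is a hypothetical placeholder.
import os
from huggingface_hub import hf_hub_download

encoder_repo = os.getenv("MODEL_REPO_ENCODER", "AJ50/voice-clone-encoder")
local_path = hf_hub_download(repo_id=encoder_repo, filename="encoder.pt")
print(f"Encoder weights cached at: {local_path}")
```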
backend/__init__.py
ADDED
@@ -0,0 +1 @@
"""Backend package root to support relative imports."""
backend/app/__init__.py
ADDED
@@ -0,0 +1,34 @@
"""Application factory for the voice cloning backend."""

import os
from flask import Flask
from flask_cors import CORS


def create_app():
    """Create and configure the Flask application."""

    app = Flask(__name__)

    # CORS configuration - allow specific frontend URL or all origins
    allowed_origins = os.getenv('FRONTEND_URL', '*').split(',')
    cors_config = {
        "origins": allowed_origins if allowed_origins != ['*'] else '*',
        "methods": ["GET", "POST", "DELETE", "OPTIONS"],
        "allow_headers": ["Content-Type", "Authorization"]
    }
    CORS(app, resources={r"/api/*": cors_config})

    from .routes import bp

    app.register_blueprint(bp)

    # Root endpoint
    @app.route('/')
    def index():
        return {'message': 'Voice Cloning API', 'status': 'running', 'api_prefix': '/api'}

    return app


app = create_app()
backend/app/routes.py
ADDED
@@ -0,0 +1,409 @@
"""
Flask API Backend for Voice Cloning
Integrates the Python voice cloning backend with the React frontend
"""

from flask import Blueprint, request, jsonify, send_file
from pathlib import Path
import uuid
import json
from datetime import datetime
import sys

from .voice_cloning import synthesize

bp = Blueprint('voice_cloning', __name__, url_prefix='/api')

BASE_DIR = Path(__file__).resolve().parents[1]

# Configuration
UPLOAD_FOLDER = BASE_DIR / 'enrolled_voices'
OUTPUT_FOLDER = BASE_DIR / 'outputs'
MODELS_DIR = BASE_DIR / 'models'
VOICES_DB = UPLOAD_FOLDER / 'voices.json'

# Create directories with parents
try:
    UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
    OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
    VOICES_DB.parent.mkdir(parents=True, exist_ok=True)
except Exception as e:
    print(f"Failed to create directories: {e}")
    sys.exit(1)

# Allowed audio extensions
ALLOWED_EXTENSIONS = {'mp3', 'wav', 'm4a', 'flac', 'ogg', 'webm'}

def allowed_file(filename):
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

def load_voices_db():
    """Load the voices database"""
    if VOICES_DB.exists():
        with open(VOICES_DB, 'r') as f:
            return json.load(f)
    return []

def save_voices_db(voices):
    """Save the voices database"""
    with open(VOICES_DB, 'w') as f:
        json.dump(voices, f, indent=2)

@bp.route('/health', methods=['GET'])
def health_check():
    """Health check endpoint"""
    return jsonify({
        'status': 'healthy',
        'message': 'Voice Cloning API is running'
    })

@bp.route('/enroll', methods=['POST'])
def enroll_voice():
    """
    Enroll a new voice by accepting audio file and voice name
    Frontend sends: FormData with 'audio' (File) and 'voice_name' (string)
    """
    try:
        # Check if audio file is present
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file provided'}), 400

        audio_file = request.files['audio']
        voice_name = request.form.get('voice_name', 'Unnamed Voice').strip()

        if audio_file.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        if not allowed_file(audio_file.filename):
            return jsonify({'error': 'Invalid file type. Supported: mp3, wav, m4a, flac, ogg, webm'}), 400

        # Ensure upload folder exists
        UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)

        # Generate unique ID and secure filename
        voice_id = f"voice_{uuid.uuid4().hex[:8]}"
        file_extension = audio_file.filename.rsplit('.', 1)[1].lower()
        filename = f"{voice_id}.{file_extension}"
        filepath = UPLOAD_FOLDER / filename

        # Save the audio file with error handling
        try:
            audio_file.save(str(filepath))
            print(f"✓ Audio file saved: {filepath}")
        except Exception as file_err:
            print(f"✗ Failed to save audio file: {file_err}")
            return jsonify({'error': f'Failed to save audio: {str(file_err)}'}), 500

        # Create voice entry
        voice_entry = {
            'id': voice_id,
            'name': voice_name,
            'filename': filename,
            'createdAt': datetime.now().isoformat()
        }

        # Update voices database with error handling
        try:
            VOICES_DB.parent.mkdir(parents=True, exist_ok=True)
            voices = load_voices_db()
            voices.append(voice_entry)
            save_voices_db(voices)
            print(f"✓ Voice '{voice_name}' (ID: {voice_id}) enrolled successfully")
        except Exception as db_err:
            print(f"✗ Failed to update voices DB: {db_err}")
            return jsonify({'error': f'Failed to save voice metadata: {str(db_err)}'}), 500

        return jsonify({
            'success': True,
            'message': f'Voice "{voice_name}" enrolled successfully',
            'voice_id': voice_id,
            'voice_name': voice_name,
            'created_at': voice_entry['createdAt']
        }), 201

    except Exception as e:
        print(f"✗ Error enrolling voice: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': f'Failed to enroll voice: {str(e)}'}), 500

@bp.route('/voices', methods=['GET'])
def get_voices():
    """
    Get list of all enrolled voices
    Frontend uses this to populate the voice selection dropdown
    """
    try:
        voices = load_voices_db()
        # Return only necessary info for frontend
        voices_list = [
            {
                'id': v['id'],
                'name': v['name'],
                'createdAt': v['createdAt']
            }
            for v in voices
        ]
        return jsonify({'voices': voices_list}), 200
    except Exception as e:
        print(f"Error getting voices: {e}")
        return jsonify({'error': f'Failed to get voices: {str(e)}'}), 500

@bp.route('/synthesize', methods=['POST'])
def synthesize_speech():
    """
    Synthesize speech from text using enrolled voice
    Frontend sends: { "text": "...", "voice_id": "voice_xxx" }
    """
    try:
        data = request.get_json()

        if not data:
            return jsonify({'error': 'No data provided'}), 400

        text = data.get('text', '').strip()
        voice_id = data.get('voice_id', '')  # Changed from 'voiceId' to 'voice_id'

        if not text:
            return jsonify({'error': 'No text provided'}), 400

        if not voice_id:
            return jsonify({'error': 'No voice selected'}), 400

        # Find the voice in database
        voices = load_voices_db()
        voice = next((v for v in voices if v['id'] == voice_id), None)

        if not voice:
            return jsonify({'error': 'Voice not found'}), 404

        # Reconstruct path from UPLOAD_FOLDER (server-agnostic)
        voice_filepath = UPLOAD_FOLDER / voice['filename']

        if not voice_filepath.exists():
            return jsonify({'error': f'Voice file not found: {voice_filepath}'}), 404

        # Generate unique output filename
        output_filename = f"synthesis_{uuid.uuid4().hex[:8]}.wav"
        output_path = OUTPUT_FOLDER / output_filename

        # Call the voice cloning synthesis function
        print(f"Synthesizing: '{text}' with voice '{voice['name']}'")
        print(f"Voice file: {voice_filepath}")
        print(f"Output path: {output_path}")
        print(f"Models dir: {MODELS_DIR}")
        print("Starting synthesis... This may take 30-60 seconds...")

        try:
            # Flush output to see logs immediately
            sys.stdout.flush()

            synthesize(
                voice_path=voice_filepath,
                text=text,
                models_dir=MODELS_DIR,
                out_path=output_path
            )

            print(f"Synthesis completed! Output saved to: {output_path}")
            sys.stdout.flush()
        except Exception as synth_error:
            print(f"Synthesis error: {synth_error}")
            import traceback
            traceback.print_exc()
            sys.stdout.flush()
            return jsonify({'error': f'Synthesis failed: {str(synth_error)}'}), 500

        if not output_path.exists():
            error_msg = 'Synthesis failed - output not generated'
            return jsonify({'error': error_msg}), 500

        # Return the audio file URL
        return jsonify({
            'success': True,
            'message': 'Speech synthesized successfully',
            'audio_url': f'/api/audio/{output_filename}'
        }), 200

    except Exception as e:
        print(f"Error synthesizing speech: {e}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': f'Failed to synthesize speech: {str(e)}'}), 500

@bp.route('/audio/<filename>', methods=['GET'])
def get_audio(filename):
    """
    Serve synthesized audio files
    Frontend uses this URL to play/download the generated audio
    """
    try:
        filepath = OUTPUT_FOLDER / filename
        if not filepath.exists():
            return jsonify({'error': 'Audio file not found'}), 404

        return send_file(
            str(filepath),
            mimetype='audio/wav',
            as_attachment=False,
            download_name=filename
        )
    except Exception as e:
        print(f"Error serving audio: {e}")
        return jsonify({'error': f'Failed to serve audio: {str(e)}'}), 500

@bp.route('/voices/<voice_id>', methods=['DELETE'])
def delete_voice(voice_id):
    """
    Delete an enrolled voice
    Optional: Frontend can call this to remove voices
    """
    try:
        voices = load_voices_db()
        voice = next((v for v in voices if v['id'] == voice_id), None)

        if not voice:
            return jsonify({'error': 'Voice not found'}), 404

        # Delete the audio file
        voice_filepath = UPLOAD_FOLDER / voice['filename']
        if voice_filepath.exists():
            voice_filepath.unlink()

        # Remove from database
        voices = [v for v in voices if v['id'] != voice_id]
        save_voices_db(voices)

        return jsonify({
            'success': True,
            'message': f'Voice "{voice["name"]}" deleted successfully'
        }), 200

    except Exception as e:
        print(f"Error deleting voice: {e}")
        return jsonify({'error': f'Failed to delete voice: {str(e)}'}), 500

@bp.route('/spectrogram/<audio_filename>', methods=['GET'])
def get_spectrogram(audio_filename):
    """
    Generate and return mel-spectrogram data for visualization
    Frontend can use this to display real-time mel-spectrogram
    """
    try:
        print(f"[Spectrogram] Requested file: {audio_filename}")
        filepath = OUTPUT_FOLDER / audio_filename
        print(f"[Spectrogram] Full path: {filepath}")
        print(f"[Spectrogram] File exists: {filepath.exists()}")

        if not filepath.exists():
            print(f"[Spectrogram] ERROR: File not found: {filepath}")
            return jsonify({'error': f'Audio file {audio_filename} not found'}), 404

        # Import librosa for mel-spectrogram generation
        import librosa
        import numpy as np

        print(f"[Spectrogram] Loading audio file...")
        # Load audio file
        y, sr = librosa.load(str(filepath), sr=None)
        print(f"[Spectrogram] Audio loaded: shape={y.shape}, sr={sr}")

        # Generate mel-spectrogram
        # 80 mel bands (common for Tacotron2), hop_length varies with sample rate
        mel_spec = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_mels=80,
            hop_length=512
        )
        print(f"[Spectrogram] Mel-spec generated: shape={mel_spec.shape}")

        # Convert to dB scale (log scale for better visualization)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize to 0-255 range for visualization
        mel_spec_normalized = np.clip(
            ((mel_spec_db + 80) / 80 * 255),
            0,
            255
        ).astype(np.uint8)

        # Convert to list for JSON serialization
        # Transpose to time x frequency format for frontend
        spectrogram_data = mel_spec_normalized.T.tolist()

        print(f"[Spectrogram] Successfully generated spectrogram: {len(spectrogram_data)} time steps")

        return jsonify({
            'spectrogram': spectrogram_data,
            'n_mels': 80,
            'shape': {
                'time_steps': len(spectrogram_data),
                'frequency_bins': 80
            }
        }), 200

    except Exception as e:
        print(f"[Spectrogram] ERROR: {str(e)}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': f'Failed to generate spectrogram: {str(e)}'}), 500

@bp.route('/waveform/<audio_filename>', methods=['GET'])
def get_waveform(audio_filename):
    """
    Serve audio waveform as numeric array for real-time FFT visualization
    Frontend fetches this and computes FFT using Web Audio API
    """
    try:
        filepath = OUTPUT_FOLDER / audio_filename
        if not filepath.exists():
            return jsonify({'error': 'Audio file not found'}), 404

        import soundfile as sf
        import numpy as np

        # Load audio file
        # soundfile returns (data, sample_rate)
        y, sr = sf.read(str(filepath))

        # If stereo, convert to mono by taking first channel or averaging
        if len(y.shape) > 1:
            y = np.mean(y, axis=1)

        # Ensure float32 for compatibility
        y = np.asarray(y, dtype=np.float32)

        # Downsample if very long to reduce JSON payload
        # Typical waveform for 60s at 22050Hz = 1.3M samples
        # For FFT we can use 8000 Hz safely (captures up to 4 kHz)
        target_sr = 8000
        if sr > target_sr:
            # Calculate downsample factor
            resample_ratio = target_sr / sr
            new_length = int(len(y) * resample_ratio)
            # Simple linear interpolation for downsampling
            indices = np.linspace(0, len(y) - 1, new_length)
            y = np.interp(indices, np.arange(len(y)), y)
            sr = target_sr

        # Convert to list for JSON serialization
        waveform_data = y.tolist()

        return jsonify({
            'waveform': waveform_data,
            'sample_rate': sr,
            'duration': len(y) / sr,
            'samples': len(y)
        }), 200

    except ImportError as ie:
        err_msg = f'Soundfile library not available: {str(ie)}'
        return jsonify({'error': err_msg}), 500
    except Exception as e:
        print(f"Error serving waveform: {e}")
        import traceback
        traceback.print_exc()
        err_msg = f'Failed to generate waveform: {str(e)}'
        return jsonify({'error': err_msg}), 500
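The routes above can be exercised end to end with a small client script. The sketch below is illustrative rather than part of the commit: it assumes the backend is reachable at http://localhost:7860 (the port exposed in the Dockerfile) and that a local `sample.wav` exists; adjust both to your setup.

```python
# Hedged usage sketch for the /api routes above (not part of this commit).
# Assumes the backend is running at http://localhost:7860 and sample.wav exists.
import requests

BASE = "http://localhost:7860/api"

# 1. Enroll a voice: multipart form with an 'audio' file and a 'voice_name' field.
with open("sample.wav", "rb") as f:
    r = requests.post(f"{BASE}/enroll",
                      files={"audio": ("sample.wav", f, "audio/wav")},
                      data={"voice_name": "My Voice"})
r.raise_for_status()
voice_id = r.json()["voice_id"]

# 2. Synthesize speech with that voice: JSON body with 'text' and 'voice_id'.
r = requests.post(f"{BASE}/synthesize",
                  json={"text": "Hello from my cloned voice!", "voice_id": voice_id},
                  timeout=300)  # synthesis can take tens of seconds on CPU
r.raise_for_status()
audio_url = r.json()["audio_url"]  # e.g. /api/audio/synthesis_xxxxxxxx.wav

# 3. Download the generated WAV served by /api/audio/<filename>.
audio = requests.get(f"http://localhost:7860{audio_url}")
with open("cloned.wav", "wb") as out:
    out.write(audio.content)
print("Saved cloned.wav")
```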
backend/app/vocoder/audio.py
ADDED
@@ -0,0 +1,108 @@
import math
import numpy as np
import librosa
from . import hparams as hp
from scipy.signal import lfilter
import soundfile as sf


def label_2_float(x, bits) :
    return 2 * x / (2**bits - 1.) - 1.


def float_2_label(x, bits) :
    assert abs(x).max() <= 1.0
    x = (x + 1.) * (2**bits - 1) / 2
    return x.clip(0, 2**bits - 1)


def load_wav(path) :
    return librosa.load(str(path), sr=hp.sample_rate)[0]


def save_wav(x, path) :
    sf.write(path, x.astype(np.float32), hp.sample_rate)


def split_signal(x) :
    unsigned = x + 2**15
    coarse = unsigned // 256
    fine = unsigned % 256
    return coarse, fine


def combine_signal(coarse, fine) :
    return coarse * 256 + fine - 2**15


def encode_16bits(x) :
    return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16)


mel_basis = None


def linear_to_mel(spectrogram):
    global mel_basis
    if mel_basis is None:
        mel_basis = build_mel_basis()
    return np.dot(mel_basis, spectrogram)


def build_mel_basis():
    return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)


def normalize(S):
    return np.clip((S - hp.min_level_db) / -hp.min_level_db, 0, 1)


def denormalize(S):
    return (np.clip(S, 0, 1) * -hp.min_level_db) + hp.min_level_db


def amp_to_db(x):
    return 20 * np.log10(np.maximum(1e-5, x))


def db_to_amp(x):
    return np.power(10.0, x * 0.05)


def spectrogram(y):
    D = stft(y)
    S = amp_to_db(np.abs(D)) - hp.ref_level_db
    return normalize(S)


def melspectrogram(y):
    D = stft(y)
    S = amp_to_db(linear_to_mel(np.abs(D)))
    return normalize(S)


def stft(y):
    return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length)


def pre_emphasis(x):
    return lfilter([1, -hp.preemphasis], [1], x)


def de_emphasis(x):
    return lfilter([1], [1, -hp.preemphasis], x)


def encode_mu_law(x, mu) :
    mu = mu - 1
    fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
    return np.floor((fx + 1) / 2 * mu + 0.5)


def decode_mu_law(y, mu, from_labels=True) :
    if from_labels:
        y = label_2_float(y, math.log2(mu))
    mu = mu - 1
    x = np.sign(y) / mu * ((1 + mu) ** np.abs(y) - 1)
    return x
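The `encode_mu_law` / `decode_mu_law` pair at the bottom of this file implements the companding behind the vocoder's "Raw 9-bit with mu-law" mode. A quick round-trip check is shown below; it is a sketch that assumes the package imports resolve from the project root (the module pulls in its sibling `hparams` on import).

```python
# Hedged round-trip sketch for encode_mu_law / decode_mu_law above.
# Assumes it runs from the project root so the package import resolves.
import numpy as np
from backend.app.vocoder.audio import encode_mu_law, decode_mu_law

bits = 9            # the "Raw 9-bit with mu-law" mode mentioned in the README
mu = 2 ** bits      # 512 quantization levels

x = np.linspace(-1.0, 1.0, 11)        # toy waveform samples in [-1, 1]
labels = encode_mu_law(x, mu)         # float labels in [0, mu - 1]
x_hat = decode_mu_law(labels, mu)     # back to [-1, 1]

# Maximum round-trip quantization error; small, and largest for loud samples.
print(np.max(np.abs(x - x_hat)))
```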
backend/app/vocoder/display.py
ADDED
@@ -0,0 +1,127 @@
import time
import numpy as np
import sys


def progbar(i, n, size=16):
    done = (i * size) // n
    bar = ''
    for i in range(size):
        bar += '█' if i <= done else '░'
    return bar


def stream(message):
    try:
        sys.stdout.write("\r{%s}" % message)
    except:
        # Remove non-ASCII characters from message
        message = ''.join(i for i in message if ord(i) < 128)
        sys.stdout.write("\r{%s}" % message)


def simple_table(item_tuples):

    border_pattern = '+---------------------------------------'
    whitespace = ' '

    headings, cells, = [], []

    for item in item_tuples:

        heading, cell = str(item[0]), str(item[1])

        pad_head = True if len(heading) < len(cell) else False

        pad = abs(len(heading) - len(cell))
        pad = whitespace[:pad]

        pad_left = pad[:len(pad) // 2]
        pad_right = pad[len(pad) // 2:]

        if pad_head:
            heading = pad_left + heading + pad_right
        else:
            cell = pad_left + cell + pad_right

        headings += [heading]
        cells += [cell]

    border, head, body = '', '', ''

    for i in range(len(item_tuples)):

        temp_head = f'| {headings[i]} '
        temp_body = f'| {cells[i]} '

        border += border_pattern[:len(temp_head)]
        head += temp_head
        body += temp_body

        if i == len(item_tuples) - 1:
            head += '|'
            body += '|'
            border += '+'

    print(border)
    print(head)
    print(border)
    print(body)
    print(border)
    print(' ')


def time_since(started):
    elapsed = time.time() - started
    m = int(elapsed // 60)
    s = int(elapsed % 60)
    if m >= 60:
        h = int(m // 60)
        m = m % 60
        return f'{h}h {m}m {s}s'
    else:
        return f'{m}m {s}s'


def save_attention(attn, path):
    import matplotlib.pyplot as plt

    fig = plt.figure(figsize=(12, 6))
    plt.imshow(attn.T, interpolation='nearest', aspect='auto')
    fig.savefig(f'{path}.png', bbox_inches='tight')
    plt.close(fig)


def save_spectrogram(M, path, length=None):
    import matplotlib.pyplot as plt

    M = np.flip(M, axis=0)
    if length: M = M[:, :length]
    fig = plt.figure(figsize=(12, 6))
    plt.imshow(M, interpolation='nearest', aspect='auto')
    fig.savefig(f'{path}.png', bbox_inches='tight')
    plt.close(fig)


def plot(array):
    import matplotlib.pyplot as plt

    fig = plt.figure(figsize=(30, 5))
    ax = fig.add_subplot(111)
    ax.xaxis.label.set_color('grey')
    ax.yaxis.label.set_color('grey')
    ax.xaxis.label.set_fontsize(23)
    ax.yaxis.label.set_fontsize(23)
    ax.tick_params(axis='x', colors='grey', labelsize=23)
    ax.tick_params(axis='y', colors='grey', labelsize=23)
    plt.plot(array)


def plot_spec(M):
    import matplotlib.pyplot as plt

    M = np.flip(M, axis=0)
    plt.figure(figsize=(18, 4))
    plt.imshow(M, interpolation='nearest', aspect='auto')
    plt.show()
backend/app/vocoder/distribution.py
ADDED
@@ -0,0 +1,132 @@
import numpy as np
import torch
import torch.nn.functional as F


def log_sum_exp(x):
    """ numerically stable log_sum_exp implementation that prevents overflow """
    # TF ordering
    axis = len(x.size()) - 1
    m, _ = torch.max(x, dim=axis)
    m2, _ = torch.max(x, dim=axis, keepdim=True)
    return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis))


# It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py
def discretized_mix_logistic_loss(y_hat, y, num_classes=65536,
                                  log_scale_min=None, reduce=True):
    if log_scale_min is None:
        log_scale_min = float(np.log(1e-14))
    y_hat = y_hat.permute(0, 2, 1)
    assert y_hat.dim() == 3
    assert y_hat.size(1) % 3 == 0
    nr_mix = y_hat.size(1) // 3

    # (B x T x C)
    y_hat = y_hat.transpose(1, 2)

    # unpack parameters. (B, T, num_mixtures) x 3
    logit_probs = y_hat[:, :, :nr_mix]
    means = y_hat[:, :, nr_mix:2 * nr_mix]
    log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min)

    # B x T x 1 -> B x T x num_mixtures
    y = y.expand_as(means)

    centered_y = y - means
    inv_stdv = torch.exp(-log_scales)
    plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1))
    cdf_plus = torch.sigmoid(plus_in)
    min_in = inv_stdv * (centered_y - 1. / (num_classes - 1))
    cdf_min = torch.sigmoid(min_in)

    # log probability for edge case of 0 (before scaling)
    # equivalent: torch.log(F.sigmoid(plus_in))
    log_cdf_plus = plus_in - F.softplus(plus_in)

    # log probability for edge case of 255 (before scaling)
    # equivalent: (1 - F.sigmoid(min_in)).log()
    log_one_minus_cdf_min = -F.softplus(min_in)

    # probability for all other cases
    cdf_delta = cdf_plus - cdf_min

    mid_in = inv_stdv * centered_y
    # log probability in the center of the bin, to be used in extreme cases
    # (not actually used in our code)
    log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in)

    # tf equivalent
    """
    log_probs = tf.where(x < -0.999, log_cdf_plus,
                         tf.where(x > 0.999, log_one_minus_cdf_min,
                                  tf.where(cdf_delta > 1e-5,
                                           tf.log(tf.maximum(cdf_delta, 1e-12)),
                                           log_pdf_mid - np.log(127.5))))
    """
    # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value
    # for num_classes=65536 case? 1e-7? not sure..
    inner_inner_cond = (cdf_delta > 1e-5).float()

    inner_inner_out = inner_inner_cond * \
                      torch.log(torch.clamp(cdf_delta, min=1e-12)) + \
                      (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2))
    inner_cond = (y > 0.999).float()
    inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out
    cond = (y < -0.999).float()
    log_probs = cond * log_cdf_plus + (1. - cond) * inner_out

    log_probs = log_probs + F.log_softmax(logit_probs, -1)

    if reduce:
        return -torch.mean(log_sum_exp(log_probs))
    else:
        return -log_sum_exp(log_probs).unsqueeze(-1)


def sample_from_discretized_mix_logistic(y, log_scale_min=None):
    """
    Sample from discretized mixture of logistic distributions
    Args:
        y (Tensor): B x C x T
        log_scale_min (float): Log scale minimum value
    Returns:
        Tensor: sample in range of [-1, 1].
    """
    if log_scale_min is None:
        log_scale_min = float(np.log(1e-14))
    assert y.size(1) % 3 == 0
    nr_mix = y.size(1) // 3

    # B x T x C
    y = y.transpose(1, 2)
    logit_probs = y[:, :, :nr_mix]

    # sample mixture indicator from softmax
    temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
    temp = logit_probs.data - torch.log(- torch.log(temp))
    _, argmax = temp.max(dim=-1)

    # (B, T) -> (B, T, nr_mix)
    one_hot = to_one_hot(argmax, nr_mix)
    # select logistic parameters
    means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1)
    log_scales = torch.clamp(torch.sum(
        y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min)
    # sample from logistic & clip to interval
    # we don't actually round to the nearest 8bit value when sampling
    u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5)
    x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u))

    x = torch.clamp(torch.clamp(x, min=-1.), max=1.)

    return x


def to_one_hot(tensor, n, fill_with=1.):
    # we perform one hot encoding with respect to the last axis
    one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_()
    if tensor.is_cuda:
        one_hot = one_hot.cuda()
    one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with)
    return one_hot
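A shape-contract sketch for the sampling helper above: the input follows the B x C x T convention from the docstring, with C = 3 * nr_mix channels (mixture logits, means, log-scales), and the output is one sample per timestep clipped to [-1, 1]. The import path assumes `backend/` is on `sys.path`.

```python
import torch
from app.vocoder.distribution import sample_from_discretized_mix_logistic

nr_mix = 10                               # 30 channels = 3 parameter groups x 10 mixtures
y = torch.randn(2, 3 * nr_mix, 100)       # (batch, channels, timesteps)
x = sample_from_discretized_mix_logistic(y)

print(x.shape)                                          # torch.Size([2, 100])
print(float(x.min()) >= -1.0, float(x.max()) <= 1.0)    # True True
```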
backend/app/vocoder/hparams.py
ADDED
@@ -0,0 +1,44 @@
from synthesizer.hparams import hparams as _syn_hp


# Audio settings------------------------------------------------------------------------
# Match the values of the synthesizer
sample_rate = _syn_hp.sample_rate
n_fft = _syn_hp.n_fft
num_mels = _syn_hp.num_mels
hop_length = _syn_hp.hop_size
win_length = _syn_hp.win_size
fmin = _syn_hp.fmin
min_level_db = _syn_hp.min_level_db
ref_level_db = _syn_hp.ref_level_db
mel_max_abs_value = _syn_hp.max_abs_value
preemphasis = _syn_hp.preemphasis
apply_preemphasis = _syn_hp.preemphasize

bits = 9                            # bit depth of signal
mu_law = True                       # Recommended to suppress noise if using raw bits in hp.voc_mode
                                    # below


# WAVERNN / VOCODER --------------------------------------------------------------------------------
voc_mode = 'RAW'                    # either 'RAW' (softmax on raw bits) or 'MOL' (sample from
                                    # mixture of logistics)
voc_upsample_factors = (5, 5, 8)    # NB - this needs to correctly factorise hop_length
voc_rnn_dims = 512
voc_fc_dims = 512
voc_compute_dims = 128
voc_res_out_dims = 128
voc_res_blocks = 10

# Training
voc_batch_size = 100
voc_lr = 1e-4
voc_gen_at_checkpoint = 5           # number of samples to generate at each checkpoint
voc_pad = 2                         # this will pad the input so that the resnet can 'see' wider
                                    # than input length
voc_seq_len = hop_length * 5        # must be a multiple of hop_length

# Generating / Synthesizing
voc_gen_batched = True              # very fast (realtime+) single utterance batched generation
voc_target = 8000                   # target number of samples to be generated in each batch entry
voc_overlap = 400                   # number of samples for crossfading between batches
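The "NB" comment above is easy to verify at import time. A small sketch of that check follows; it assumes the synthesizer hparams give a hop size of 200 samples, which is what (5, 5, 8) factorises to, so treat that value as an assumption of the sketch.

```python
from functools import reduce
from app.vocoder import hparams as hp

upsample_product = reduce(lambda a, b: a * b, hp.voc_upsample_factors)  # 5 * 5 * 8
assert upsample_product == hp.hop_length, (upsample_product, hp.hop_length)

# voc_seq_len is defined as hop_length * 5, so this holds by construction.
assert hp.voc_seq_len % hp.hop_length == 0
```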
backend/app/vocoder/inference.py
ADDED
@@ -0,0 +1,83 @@
from .models.fatchord_version import WaveRNN
from . import hparams as hp
import torch


_model = None   # type: WaveRNN


def load_model(weights_fpath, verbose=True):
    global _model, _device

    if verbose:
        print("Building Wave-RNN")
    _model = WaveRNN(
        rnn_dims=hp.voc_rnn_dims,
        fc_dims=hp.voc_fc_dims,
        bits=hp.bits,
        pad=hp.voc_pad,
        upsample_factors=hp.voc_upsample_factors,
        feat_dims=hp.num_mels,
        compute_dims=hp.voc_compute_dims,
        res_out_dims=hp.voc_res_out_dims,
        res_blocks=hp.voc_res_blocks,
        hop_length=hp.hop_length,
        sample_rate=hp.sample_rate,
        mode=hp.voc_mode
    )

    if torch.cuda.is_available():
        _model = _model.cuda()
        _device = torch.device('cuda')
    else:
        _device = torch.device('cpu')

    if verbose:
        print("Loading model weights at %s" % weights_fpath)
    checkpoint = torch.load(weights_fpath, _device)
    _model.load_state_dict(checkpoint['model_state'])
    _model.eval()


def is_loaded():
    return _model is not None


def infer_waveform(mel, normalize=True, batched=True, target=8000, overlap=800,
                   progress_callback=None):
    """
    Infers the waveform of a mel spectrogram output by the synthesizer (the format must match
    that of the synthesizer!)

    :param normalize:
    :param batched:
    :param target:
    :param overlap:
    :return:
    """
    import sys
    if _model is None:
        raise Exception("Please load Wave-RNN in memory before using it")

    print(f"[Vocoder] Input mel-spectrogram shape: {mel.shape}")
    print(f"[Vocoder] Normalize: {normalize}, Batched: {batched}, Target: {target}, Overlap: {overlap}")
    print(f"[Vocoder] Device: {_device}, Model on: {next(_model.parameters()).device}")

    try:
        if normalize:
            mel = mel / hp.mel_max_abs_value
        mel = torch.from_numpy(mel[None, ...])
        print(f"[Vocoder] Mel tensor shape after processing: {mel.shape}, dtype: {mel.dtype}")

        print("[Vocoder] Starting waveform generation (this may take a while on CPU)...")
        sys.stdout.flush()

        wav = _model.generate(mel, batched, target, overlap, hp.mu_law, progress_callback)

        print(f"[Vocoder] Waveform generated successfully, shape: {wav.shape}")
        return wav
    except Exception as e:
        print(f"[Vocoder] ✗ Error during vocoding: {e}")
        import traceback
        traceback.print_exc()
        sys.stdout.flush()
        raise
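A minimal sketch of how this module is driven, run from `backend/` after `download_models.py` has fetched the weights (the checkpoint path matches that layout). The random mel only exercises the shapes, so the resulting audio is noise.

```python
import numpy as np
from app.vocoder import inference as vocoder
from app.vocoder import hparams as hp

vocoder.load_model("models/default/vocoder.pt")
assert vocoder.is_loaded()

# (num_mels, timesteps) float array in the synthesizer's output range
mel = (np.random.rand(hp.num_mels, 120) * hp.mel_max_abs_value).astype(np.float32)
wav = vocoder.infer_waveform(mel, batched=True, target=hp.voc_target, overlap=hp.voc_overlap)
print(wav.shape)   # roughly (120 - 1) * hp.hop_length samples
```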
backend/app/vocoder/models/fatchord_version.py
ADDED
@@ -0,0 +1,434 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from ..distribution import sample_from_discretized_mix_logistic
from ..display import *
from ..audio import *


class ResBlock(nn.Module):
    def __init__(self, dims):
        super().__init__()
        self.conv1 = nn.Conv1d(dims, dims, kernel_size=1, bias=False)
        self.conv2 = nn.Conv1d(dims, dims, kernel_size=1, bias=False)
        self.batch_norm1 = nn.BatchNorm1d(dims)
        self.batch_norm2 = nn.BatchNorm1d(dims)

    def forward(self, x):
        residual = x
        x = self.conv1(x)
        x = self.batch_norm1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.batch_norm2(x)
        return x + residual


class MelResNet(nn.Module):
    def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad):
        super().__init__()
        k_size = pad * 2 + 1
        self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False)
        self.batch_norm = nn.BatchNorm1d(compute_dims)
        self.layers = nn.ModuleList()
        for i in range(res_blocks):
            self.layers.append(ResBlock(compute_dims))
        self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1)

    def forward(self, x):
        x = self.conv_in(x)
        x = self.batch_norm(x)
        x = F.relu(x)
        for f in self.layers: x = f(x)
        x = self.conv_out(x)
        return x


class Stretch2d(nn.Module):
    def __init__(self, x_scale, y_scale):
        super().__init__()
        self.x_scale = x_scale
        self.y_scale = y_scale

    def forward(self, x):
        b, c, h, w = x.size()
        x = x.unsqueeze(-1).unsqueeze(3)
        x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale)
        return x.view(b, c, h * self.y_scale, w * self.x_scale)


class UpsampleNetwork(nn.Module):
    def __init__(self, feat_dims, upsample_scales, compute_dims,
                 res_blocks, res_out_dims, pad):
        super().__init__()
        total_scale = np.cumprod(upsample_scales)[-1]
        self.indent = pad * total_scale
        self.resnet = MelResNet(res_blocks, feat_dims, compute_dims, res_out_dims, pad)
        self.resnet_stretch = Stretch2d(total_scale, 1)
        self.up_layers = nn.ModuleList()
        for scale in upsample_scales:
            k_size = (1, scale * 2 + 1)
            padding = (0, scale)
            stretch = Stretch2d(scale, 1)
            conv = nn.Conv2d(1, 1, kernel_size=k_size, padding=padding, bias=False)
            conv.weight.data.fill_(1. / k_size[1])
            self.up_layers.append(stretch)
            self.up_layers.append(conv)

    def forward(self, m):
        aux = self.resnet(m).unsqueeze(1)
        aux = self.resnet_stretch(aux)
        aux = aux.squeeze(1)
        m = m.unsqueeze(1)
        for f in self.up_layers: m = f(m)
        m = m.squeeze(1)[:, :, self.indent:-self.indent]
        return m.transpose(1, 2), aux.transpose(1, 2)


class WaveRNN(nn.Module):
    def __init__(self, rnn_dims, fc_dims, bits, pad, upsample_factors,
                 feat_dims, compute_dims, res_out_dims, res_blocks,
                 hop_length, sample_rate, mode='RAW'):
        super().__init__()
        self.mode = mode
        self.pad = pad
        if self.mode == 'RAW':
            self.n_classes = 2 ** bits
        elif self.mode == 'MOL':
            self.n_classes = 30
        else:
            raise RuntimeError("Unknown model mode value - ", self.mode)

        self.rnn_dims = rnn_dims
        self.aux_dims = res_out_dims // 4
        self.hop_length = hop_length
        self.sample_rate = sample_rate

        self.upsample = UpsampleNetwork(feat_dims, upsample_factors, compute_dims, res_blocks, res_out_dims, pad)
        self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims)
        self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True)
        self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True)
        self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims)
        self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims)
        self.fc3 = nn.Linear(fc_dims, self.n_classes)

        self.step = nn.Parameter(torch.zeros(1).long(), requires_grad=False)
        self.num_params()

    def forward(self, x, mels):
        self.step += 1
        bsize = x.size(0)
        if torch.cuda.is_available():
            h1 = torch.zeros(1, bsize, self.rnn_dims).cuda()
            h2 = torch.zeros(1, bsize, self.rnn_dims).cuda()
        else:
            h1 = torch.zeros(1, bsize, self.rnn_dims).cpu()
            h2 = torch.zeros(1, bsize, self.rnn_dims).cpu()
        mels, aux = self.upsample(mels)

        aux_idx = [self.aux_dims * i for i in range(5)]
        a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
        a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
        a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
        a4 = aux[:, :, aux_idx[3]:aux_idx[4]]

        x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2)
        x = self.I(x)
        res = x
        x, _ = self.rnn1(x, h1)

        x = x + res
        res = x
        x = torch.cat([x, a2], dim=2)
        x, _ = self.rnn2(x, h2)

        x = x + res
        x = torch.cat([x, a3], dim=2)
        x = F.relu(self.fc1(x))

        x = torch.cat([x, a4], dim=2)
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def generate(self, mels, batched, target, overlap, mu_law, progress_callback=None):
        mu_law = mu_law if self.mode == 'RAW' else False
        progress_callback = progress_callback or self.gen_display

        self.eval()
        output = []
        start = time.time()
        rnn1 = self.get_gru_cell(self.rnn1)
        rnn2 = self.get_gru_cell(self.rnn2)

        with torch.no_grad():
            if torch.cuda.is_available():
                mels = mels.cuda()
            else:
                mels = mels.cpu()
            wave_len = (mels.size(-1) - 1) * self.hop_length
            mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side='both')
            mels, aux = self.upsample(mels.transpose(1, 2))

            if batched:
                mels = self.fold_with_overlap(mels, target, overlap)
                aux = self.fold_with_overlap(aux, target, overlap)

            b_size, seq_len, _ = mels.size()

            if torch.cuda.is_available():
                h1 = torch.zeros(b_size, self.rnn_dims).cuda()
                h2 = torch.zeros(b_size, self.rnn_dims).cuda()
                x = torch.zeros(b_size, 1).cuda()
            else:
                h1 = torch.zeros(b_size, self.rnn_dims).cpu()
                h2 = torch.zeros(b_size, self.rnn_dims).cpu()
                x = torch.zeros(b_size, 1).cpu()

            d = self.aux_dims
            aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]

            for i in range(seq_len):

                m_t = mels[:, i, :]

                a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)

                x = torch.cat([x, m_t, a1_t], dim=1)
                x = self.I(x)
                h1 = rnn1(x, h1)

                x = x + h1
                inp = torch.cat([x, a2_t], dim=1)
                h2 = rnn2(inp, h2)

                x = x + h2
                x = torch.cat([x, a3_t], dim=1)
                x = F.relu(self.fc1(x))

                x = torch.cat([x, a4_t], dim=1)
                x = F.relu(self.fc2(x))

                logits = self.fc3(x)

                if self.mode == 'MOL':
                    sample = sample_from_discretized_mix_logistic(logits.unsqueeze(0).transpose(1, 2))
                    output.append(sample.view(-1))
                    if torch.cuda.is_available():
                        # x = torch.FloatTensor([[sample]]).cuda()
                        x = sample.transpose(0, 1).cuda()
                    else:
                        x = sample.transpose(0, 1)

                elif self.mode == 'RAW':
                    posterior = F.softmax(logits, dim=1)
                    distrib = torch.distributions.Categorical(posterior)

                    sample = 2 * distrib.sample().float() / (self.n_classes - 1.) - 1.
                    output.append(sample)
                    x = sample.unsqueeze(-1)
                else:
                    raise RuntimeError("Unknown model mode value - ", self.mode)

                if i % 100 == 0:
                    gen_rate = (i + 1) / (time.time() - start) * b_size / 1000
                    progress_callback(i, seq_len, b_size, gen_rate)

        output = torch.stack(output).transpose(0, 1)
        output = output.cpu().numpy()
        output = output.astype(np.float64)

        if batched:
            output = self.xfade_and_unfold(output, target, overlap)
        else:
            output = output[0]

        if mu_law:
            output = decode_mu_law(output, self.n_classes, False)
        if hp.apply_preemphasis:
            output = de_emphasis(output)

        # Fade-out at the end to avoid signal cutting out suddenly
        fade_out = np.linspace(1, 0, 20 * self.hop_length)
        output = output[:wave_len]
        output[-20 * self.hop_length:] *= fade_out

        self.train()

        return output

    def gen_display(self, i, seq_len, b_size, gen_rate):
        pbar = progbar(i, seq_len)
        msg = f'| {pbar} {i*b_size}/{seq_len*b_size} | Batch Size: {b_size} | Gen Rate: {gen_rate:.1f}kHz | '
        stream(msg)

    def get_gru_cell(self, gru):
        gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size)
        gru_cell.weight_hh.data = gru.weight_hh_l0.data
        gru_cell.weight_ih.data = gru.weight_ih_l0.data
        gru_cell.bias_hh.data = gru.bias_hh_l0.data
        gru_cell.bias_ih.data = gru.bias_ih_l0.data
        return gru_cell

    def pad_tensor(self, x, pad, side='both'):
        # NB - this is just a quick method i need right now
        # i.e., it won't generalise to other shapes/dims
        b, t, c = x.size()
        total = t + 2 * pad if side == 'both' else t + pad
        if torch.cuda.is_available():
            padded = torch.zeros(b, total, c).cuda()
        else:
            padded = torch.zeros(b, total, c).cpu()
        if side == 'before' or side == 'both':
            padded[:, pad:pad + t, :] = x
        elif side == 'after':
            padded[:, :t, :] = x
        return padded

    def fold_with_overlap(self, x, target, overlap):

        ''' Fold the tensor with overlap for quick batched inference.
            Overlap will be used for crossfading in xfade_and_unfold()

        Args:
            x (tensor)    : Upsampled conditioning features.
                            shape=(1, timesteps, features)
            target (int)  : Target timesteps for each index of batch
            overlap (int) : Timesteps for both xfade and rnn warmup

        Return:
            (tensor) : shape=(num_folds, target + 2 * overlap, features)

        Details:
            x = [[h1, h2, ... hn]]

            Where each h is a vector of conditioning features

            Eg: target=2, overlap=1 with x.size(1)=10

            folded = [[h1, h2, h3, h4],
                      [h4, h5, h6, h7],
                      [h7, h8, h9, h10]]
        '''

        _, total_len, features = x.size()

        # Calculate variables needed
        num_folds = (total_len - overlap) // (target + overlap)
        extended_len = num_folds * (overlap + target) + overlap
        remaining = total_len - extended_len

        # Pad if some time steps poking out
        if remaining != 0:
            num_folds += 1
            padding = target + 2 * overlap - remaining
            x = self.pad_tensor(x, padding, side='after')

        if torch.cuda.is_available():
            folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda()
        else:
            folded = torch.zeros(num_folds, target + 2 * overlap, features).cpu()

        # Get the values for the folded tensor
        for i in range(num_folds):
            start = i * (target + overlap)
            end = start + target + 2 * overlap
            folded[i] = x[:, start:end, :]

        return folded

    def xfade_and_unfold(self, y, target, overlap):

        ''' Applies a crossfade and unfolds into a 1d array.

        Args:
            y (ndarry)    : Batched sequences of audio samples
                            shape=(num_folds, target + 2 * overlap)
                            dtype=np.float64
            overlap (int) : Timesteps for both xfade and rnn warmup

        Return:
            (ndarry) : audio samples in a 1d array
                       shape=(total_len)
                       dtype=np.float64

        Details:
            y = [[seq1],
                 [seq2],
                 [seq3]]

            Apply a gain envelope at both ends of the sequences

            y = [[seq1_in, seq1_target, seq1_out],
                 [seq2_in, seq2_target, seq2_out],
                 [seq3_in, seq3_target, seq3_out]]

            Stagger and add up the groups of samples:

            [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]

        '''

        num_folds, length = y.shape
        target = length - 2 * overlap
        total_len = num_folds * (target + overlap) + overlap

        # Need some silence for the rnn warmup
        silence_len = overlap // 2
        fade_len = overlap - silence_len
        silence = np.zeros((silence_len), dtype=np.float64)

        # Equal power crossfade
        t = np.linspace(-1, 1, fade_len, dtype=np.float64)
        fade_in = np.sqrt(0.5 * (1 + t))
        fade_out = np.sqrt(0.5 * (1 - t))

        # Concat the silence to the fades
        fade_in = np.concatenate([silence, fade_in])
        fade_out = np.concatenate([fade_out, silence])

        # Apply the gain to the overlap samples
        y[:, :overlap] *= fade_in
        y[:, -overlap:] *= fade_out

        unfolded = np.zeros((total_len), dtype=np.float64)

        # Loop to add up all the samples
        for i in range(num_folds):
            start = i * (target + overlap)
            end = start + target + 2 * overlap
            unfolded[start:end] += y[i]

        return unfolded

    def get_step(self):
        return self.step.data.item()

    def checkpoint(self, model_dir, optimizer):
        k_steps = self.get_step() // 1000
        self.save(model_dir.joinpath("checkpoint_%dk_steps.pt" % k_steps), optimizer)

    def log(self, path, msg):
        with open(path, 'a') as f:
            print(msg, file=f)

    def load(self, path, optimizer):
        checkpoint = torch.load(path)
        if "optimizer_state" in checkpoint:
            self.load_state_dict(checkpoint["model_state"])
            optimizer.load_state_dict(checkpoint["optimizer_state"])
        else:
            # Backwards compatibility
            self.load_state_dict(checkpoint)

    def save(self, path, optimizer):
        torch.save({
            "model_state": self.state_dict(),
            "optimizer_state": optimizer.state_dict(),
        }, path)

    def num_params(self, print_out=True):
        parameters = filter(lambda p: p.requires_grad, self.parameters())
        parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
        if print_out:
            print('Trainable Parameters: %.3fM' % parameters)
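The batched-generation bookkeeping in `fold_with_overlap` / `xfade_and_unfold` is the least obvious part of the model, so here is a small sketch of the folding contract. Hyperparameter values come from `app.vocoder.hparams`, and the tensor is random conditioning features rather than a real mel, so this only demonstrates shapes.

```python
import torch
from app.vocoder import hparams as hp
from app.vocoder.models.fatchord_version import WaveRNN

model = WaveRNN(rnn_dims=hp.voc_rnn_dims, fc_dims=hp.voc_fc_dims, bits=hp.bits,
                pad=hp.voc_pad, upsample_factors=hp.voc_upsample_factors,
                feat_dims=hp.num_mels, compute_dims=hp.voc_compute_dims,
                res_out_dims=hp.voc_res_out_dims, res_blocks=hp.voc_res_blocks,
                hop_length=hp.hop_length, sample_rate=hp.sample_rate, mode=hp.voc_mode)

x = torch.rand(1, 2000, hp.num_mels)                 # (1, timesteps, features)
folded = model.fold_with_overlap(x, target=800, overlap=100)
print(folded.shape)                                  # (num_folds, target + 2 * overlap, features)
```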
backend/app/voice_cloning.py
ADDED
@@ -0,0 +1,108 @@
"""Core voice cloning logic shared by the API routes."""

from __future__ import annotations

import shutil
import gc
import torch
from pathlib import Path
from typing import Dict, Tuple

import numpy as np
import soundfile as sf
from huggingface_hub import hf_hub_download

from encoder import inference as encoder_infer
from synthesizer import inference as synthesizer_infer
from synthesizer.hparams import hparams as syn_hp
from app.vocoder import inference as vocoder_infer


MODEL_SPECS: Dict[str, Tuple[str, str]] = {
    "encoder.pt": ("AJ50/voice-clone-encoder", "encoder.pt"),
    "synthesizer.pt": ("AJ50/voice-clone-synthesizer", "synthesizer.pt"),
    "vocoder.pt": ("AJ50/voice-clone-vocoder", "vocoder.pt"),
}


def ensure_default_models(models_dir: Path) -> None:
    """Download the required pretrained weights if they are missing."""

    target_dir = models_dir / "default"
    target_dir.mkdir(parents=True, exist_ok=True)

    for filename, (repo_id, repo_filename) in MODEL_SPECS.items():
        destination = target_dir / filename
        if destination.exists():
            continue

        print(f"[Models] Downloading {filename} from {repo_id}...")
        downloaded_path = Path(
            hf_hub_download(repo_id=repo_id, filename=repo_filename)
        )
        shutil.copy2(downloaded_path, destination)
        print(f"[Models] Saved to {destination}")


def synthesize(voice_path: Path, text: str, models_dir: Path, out_path: Path) -> Path:
    """Run end-to-end voice cloning and return the generated audio path."""

    ensure_default_models(models_dir)

    enc_path = models_dir / "default" / "encoder.pt"
    syn_path = models_dir / "default" / "synthesizer.pt"
    voc_path = models_dir / "default" / "vocoder.pt"

    for model_path in (enc_path, syn_path, voc_path):
        if not model_path.exists():
            raise RuntimeError(f"Model file missing: {model_path}")

    print("[VoiceCloning] Loading encoder...")
    encoder_infer.load_model(enc_path)
    print("[VoiceCloning] Loading synthesizer...")
    synthesizer = synthesizer_infer.Synthesizer(syn_path)
    print("[VoiceCloning] Loading vocoder...")
    vocoder_infer.load_model(voc_path)

    if not voice_path.exists():
        raise RuntimeError(f"Reference voice file not found: {voice_path}")

    print("[VoiceCloning] Preprocessing reference audio...")
    wav = encoder_infer.preprocess_wav(voice_path)
    embed = encoder_infer.embed_utterance(wav)

    print("[VoiceCloning] Generating mel-spectrogram...")
    mels = synthesizer.synthesize_spectrograms([text], [embed])
    mel = mels[0]

    print("[VoiceCloning] Vocoding waveform...")
    try:
        waveform = synthesizer.griffin_lim(mel).astype(np.float32)
    except Exception:
        waveform = vocoder_infer.infer_waveform(
            mel, normalize=True, batched=False, target=8000, overlap=800
        ).astype(np.float32)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    sf.write(out_path.as_posix(), waveform, syn_hp.sample_rate)
    print(f"[VoiceCloning] Audio saved to {out_path}")

    # Memory optimization for Render free tier
    print("[VoiceCloning] Cleaning up models to free memory...")
    try:
        # Clear model caches
        if hasattr(encoder_infer, '_model'):
            encoder_infer._model = None
        if hasattr(synthesizer_infer, '_model'):
            synthesizer_infer._model = None
        if hasattr(vocoder_infer, '_model'):
            vocoder_infer._model = None

        # Force garbage collection
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    except Exception as e:
        print(f"[VoiceCloning] Warning during cleanup: {e}")

    return out_path
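A usage sketch for the module above, run from `backend/`. The reference clip is one of the samples shipped in `enrolled_voices/`, the output location is hypothetical, and the first call downloads the three pretrained checkpoints, which takes a while.

```python
from pathlib import Path
from app.voice_cloning import synthesize

out = synthesize(
    voice_path=Path("enrolled_voices/voice_26bfa1ef.mp3"),
    text="Hello, this is a quick voice cloning test.",
    models_dir=Path("models"),
    out_path=Path("outputs/demo.wav"),   # hypothetical output path for this sketch
)
print(out)
```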
backend/download_models.py
ADDED
@@ -0,0 +1,54 @@
"""
Download models from HuggingFace on startup
Run this once or on container startup for Render
"""

from pathlib import Path
from huggingface_hub import hf_hub_download
import shutil
import sys

MODEL_SPECS = {
    "encoder.pt": ("AJ50/voice-clone-encoder", "encoder.pt"),
    "synthesizer.pt": ("AJ50/voice-clone-synthesizer", "synthesizer.pt"),
    "vocoder.pt": ("AJ50/voice-clone-vocoder", "vocoder.pt"),
}


def download_models(models_dir: Path) -> bool:
    """Download required models from HuggingFace if missing; return True on success."""

    target_dir = models_dir / "default"
    target_dir.mkdir(parents=True, exist_ok=True)

    print(f"[Models] Target directory: {target_dir}")

    for filename, (repo_id, repo_filename) in MODEL_SPECS.items():
        destination = target_dir / filename

        # Skip if already exists
        if destination.exists():
            size_mb = destination.stat().st_size / (1024 * 1024)
            print(f"✓ {filename} already exists ({size_mb:.1f} MB)")
            continue

        print(f"[Models] Downloading {filename} from {repo_id}...")
        try:
            downloaded_path = Path(
                hf_hub_download(repo_id=repo_id, filename=repo_filename)
            )
            shutil.copy2(downloaded_path, destination)
            size_mb = destination.stat().st_size / (1024 * 1024)
            print(f"✓ Saved {filename} ({size_mb:.1f} MB) to {destination}")
        except Exception as e:
            print(f"✗ Failed to download {filename}: {e}")
            return False

    print("[Models] All models downloaded successfully!")
    return True


if __name__ == "__main__":
    backend_dir = Path(__file__).parent
    models_dir = backend_dir / "models"

    success = download_models(models_dir)
    sys.exit(0 if success else 1)
backend/encoder/__init__.py
ADDED
File without changes
backend/encoder/audio.py
ADDED
@@ -0,0 +1,117 @@
from scipy.ndimage.morphology import binary_dilation
from encoder.params_data import *
from pathlib import Path
from typing import Optional, Union
from warnings import warn
import numpy as np
import librosa
import struct

try:
    import webrtcvad
except:
    warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
    webrtcvad = None

int16_max = (2 ** 15) - 1


def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
                   source_sr: Optional[int] = None,
                   normalize: Optional[bool] = True,
                   trim_silence: Optional[bool] = True):
    """
    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.

    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
    just .wav), either the waveform as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
    preprocessing. After preprocessing, the waveform's sampling rate will match the data
    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
    this argument will be ignored.
    """
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
        wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
    else:
        wav = fpath_or_wav

    # Resample the wav if needed
    if source_sr is not None and source_sr != sampling_rate:
        wav = librosa.resample(y=wav, orig_sr=source_sr, target_sr=sampling_rate)

    # Apply the preprocessing: normalize volume and shorten long silences
    if normalize:
        wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
    if webrtcvad and trim_silence:
        wav = trim_long_silences(wav)

    return wav


def wav_to_mel_spectrogram(wav):
    """
    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
    Note: this is not a log-mel spectrogram.
    """
    frames = librosa.feature.melspectrogram(
        y=wav,
        sr=sampling_rate,
        n_fft=int(sampling_rate * mel_window_length / 1000),
        hop_length=int(sampling_rate * mel_window_step / 1000),
        n_mels=mel_n_channels
    )
    return frames.astype(np.float32).T


def trim_long_silences(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in params.py.

    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)

    return wav[audio_mask == True]


def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
        return wav
    return wav * (10 ** (dBFS_change / 20))
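A small sketch of the preprocessing pipeline above. Random noise stands in for speech here, so VAD trimming is disabled; with a real recording you would leave `trim_silence=True`.

```python
import numpy as np
from encoder import audio
from encoder.params_data import sampling_rate

wav = (np.random.rand(3 * sampling_rate).astype(np.float32) - 0.5) * 0.1   # 3 s stand-in signal
wav = audio.preprocess_wav(wav, source_sr=sampling_rate, trim_silence=False)
frames = audio.wav_to_mel_spectrogram(wav)
print(frames.shape)   # (n_frames, mel_n_channels), per the params_data settings
```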
backend/encoder/inference.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from encoder.params_data import *
|
| 2 |
+
from encoder.model import SpeakerEncoder
|
| 3 |
+
from encoder.audio import preprocess_wav # We want to expose this function from here
|
| 4 |
+
from matplotlib import cm
|
| 5 |
+
from encoder import audio
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch
|
| 9 |
+
|
| 10 |
+
_model = None # type: SpeakerEncoder
|
| 11 |
+
_device = None # type: torch.device
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def load_model(weights_fpath: Path, device=None):
|
| 15 |
+
"""
|
| 16 |
+
Loads the model in memory. If this function is not explicitely called, it will be run on the
|
| 17 |
+
first call to embed_frames() with the default weights file.
|
| 18 |
+
|
| 19 |
+
:param weights_fpath: the path to saved model weights.
|
| 20 |
+
:param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
|
| 21 |
+
model will be loaded and will run on this device. Outputs will however always be on the cpu.
|
| 22 |
+
If None, will default to your GPU if it"s available, otherwise your CPU.
|
| 23 |
+
"""
|
| 24 |
+
# TODO: I think the slow loading of the encoder might have something to do with the device it
|
| 25 |
+
# was saved on. Worth investigating.
|
| 26 |
+
global _model, _device
|
| 27 |
+
if device is None:
|
| 28 |
+
_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 29 |
+
elif isinstance(device, str):
|
| 30 |
+
_device = torch.device(device)
|
| 31 |
+
_model = SpeakerEncoder(_device, torch.device("cpu"))
|
| 32 |
+
checkpoint = torch.load(weights_fpath, _device)
|
| 33 |
+
_model.load_state_dict(checkpoint["model_state"])
|
| 34 |
+
_model.eval()
|
| 35 |
+
print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def is_loaded():
|
| 39 |
+
return _model is not None
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def embed_frames_batch(frames_batch):
|
| 43 |
+
"""
|
| 44 |
+
Computes embeddings for a batch of mel spectrogram.
|
| 45 |
+
|
| 46 |
+
:param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape
|
| 47 |
+
(batch_size, n_frames, n_channels)
|
| 48 |
+
:return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
|
| 49 |
+
"""
|
| 50 |
+
if _model is None:
|
| 51 |
+
raise Exception("Model was not loaded. Call load_model() before inference.")
|
| 52 |
+
|
| 53 |
+
frames = torch.from_numpy(frames_batch).to(_device)
|
| 54 |
+
embed = _model.forward(frames).detach().cpu().numpy()
|
| 55 |
+
return embed
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
|
| 59 |
+
min_pad_coverage=0.75, overlap=0.5):
|
| 60 |
+
"""
|
| 61 |
+
    Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
    partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
    spectrogram slices are returned, so as to make each partial utterance waveform correspond to
    its spectrogram. This function assumes that the mel spectrogram parameters used are those
    defined in params_data.py.

    The returned ranges may be indexing further than the length of the waveform. It is
    recommended that you pad the waveform with zeros up to wave_slices[-1].stop.

    :param n_samples: the number of samples in the waveform
    :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
    utterance
    :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
    enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
    then the last partial utterance will be considered, as if we padded the audio. Otherwise,
    it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
    utterance, this parameter is ignored so that the function always returns at least 1 slice.
    :param overlap: by how much the partial utterances should overlap. If set to 0, the partial
    utterances are entirely disjoint.
    :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
    respectively the waveform and the mel spectrogram with these slices to obtain the partial
    utterances.
    """
    assert 0 <= overlap < 1
    assert 0 < min_pad_coverage <= 1

    samples_per_frame = int((sampling_rate * mel_window_step / 1000))
    n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
    frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)

    # Compute the slices
    wav_slices, mel_slices = [], []
    steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
    for i in range(0, steps, frame_step):
        mel_range = np.array([i, i + partial_utterance_n_frames])
        wav_range = mel_range * samples_per_frame
        mel_slices.append(slice(*mel_range))
        wav_slices.append(slice(*wav_range))

    # Evaluate whether extra padding is warranted or not
    last_wav_range = wav_slices[-1]
    coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
    if coverage < min_pad_coverage and len(mel_slices) > 1:
        mel_slices = mel_slices[:-1]
        wav_slices = wav_slices[:-1]

    return wav_slices, mel_slices


def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
    """
    Computes an embedding for a single utterance.

    # TODO: handle multiple wavs to benefit from batching on GPU
    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
    :param using_partials: if True, then the utterance is split in partial utterances of
    <partial_utterance_n_frames> frames and the utterance embedding is computed from their
    normalized average. If False, the utterance is instead computed from feeding the entire
    spectrogram to the network.
    :param return_partials: if True, the partial embeddings will also be returned along with the
    wav slices that correspond to the partial embeddings.
    :param kwargs: additional arguments to compute_partial_slices()
    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
    <return_partials> is True, the partial utterances as a numpy array of float32 of shape
    (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
    returned. If <using_partials> is simultaneously set to False, both these values will be None
    instead.
    """
    # Process the entire utterance if not using partials
    if not using_partials:
        frames = audio.wav_to_mel_spectrogram(wav)
        embed = embed_frames_batch(frames[None, ...])[0]
        if return_partials:
            return embed, None, None
        return embed

    # Compute where to split the utterance into partials and pad if necessary
    wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
    max_wave_length = wave_slices[-1].stop
    if max_wave_length >= len(wav):
        wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")

    # Split the utterance into partials
    frames = audio.wav_to_mel_spectrogram(wav)
    frames_batch = np.array([frames[s] for s in mel_slices])
    partial_embeds = embed_frames_batch(frames_batch)

    # Compute the utterance embedding from the partial embeddings
    raw_embed = np.mean(partial_embeds, axis=0)
    embed = raw_embed / np.linalg.norm(raw_embed, 2)

    if return_partials:
        return embed, partial_embeds, wave_slices
    return embed


def embed_speaker(wavs, **kwargs):
    raise NotImplementedError()


def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
    import matplotlib.pyplot as plt
    if ax is None:
        ax = plt.gca()

    if shape is None:
        height = int(np.sqrt(len(embed)))
        shape = (height, -1)
    embed = embed.reshape(shape)

    cmap = cm.get_cmap()
    mappable = ax.imshow(embed, cmap=cmap)
    cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
    sm = cm.ScalarMappable(cmap=cmap)
    sm.set_clim(*color_range)

    ax.set_xticks([]), ax.set_yticks([])
    ax.set_title(title)
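
A minimal usage sketch for embed_utterance, assuming the encoder package also exposes load_model() and a preprocess_wav() helper (they are defined elsewhere in this backend, not in the lines above), and with a placeholder checkpoint and audio path:

# Usage sketch (illustrative, not part of the committed files).
import numpy as np
from pathlib import Path
from encoder import audio, inference

inference.load_model(Path("models/default/encoder.pt"))       # assumed checkpoint location
wav = audio.preprocess_wav(Path("sample.wav"))                # placeholder reference recording
embed = inference.embed_utterance(wav, using_partials=True)   # averaged over partial utterances
print(embed.shape, np.linalg.norm(embed))                     # (256,) and ~1.0 (unit L2 norm)
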

backend/encoder/model.py
ADDED
@@ -0,0 +1,135 @@
from encoder.params_model import *
from encoder.params_data import *
from scipy.interpolate import interp1d
from sklearn.metrics import roc_curve
from torch.nn.utils import clip_grad_norm_
from scipy.optimize import brentq
from torch import nn
import numpy as np
import torch


class SpeakerEncoder(nn.Module):
    def __init__(self, device, loss_device):
        super().__init__()
        self.loss_device = loss_device

        # Network definition
        self.lstm = nn.LSTM(input_size=mel_n_channels,
                            hidden_size=model_hidden_size,
                            num_layers=model_num_layers,
                            batch_first=True).to(device)
        self.linear = nn.Linear(in_features=model_hidden_size,
                                out_features=model_embedding_size).to(device)
        self.relu = torch.nn.ReLU().to(device)

        # Cosine similarity scaling (with fixed initial parameter values)
        self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
        self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)

        # Loss
        self.loss_fn = nn.CrossEntropyLoss().to(loss_device)

    def do_gradient_ops(self):
        # Gradient scale
        self.similarity_weight.grad *= 0.01
        self.similarity_bias.grad *= 0.01

        # Gradient clipping
        clip_grad_norm_(self.parameters(), 3, norm_type=2)

    def forward(self, utterances, hidden_init=None):
        """
        Computes the embeddings of a batch of utterance spectrograms.

        :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
        (batch_size, n_frames, n_channels)
        :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
        batch_size, hidden_size). Will default to a tensor of zeros if None.
        :return: the embeddings as a tensor of shape (batch_size, embedding_size)
        """
        # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
        # and the final cell state.
        out, (hidden, cell) = self.lstm(utterances, hidden_init)

        # We take only the hidden state of the last layer
        embeds_raw = self.relu(self.linear(hidden[-1]))

        # L2-normalize it
        embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)

        return embeds

    def similarity_matrix(self, embeds):
        """
        Computes the similarity matrix according to section 2.1 of GE2E.

        :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
        utterances_per_speaker, embedding_size)
        :return: the similarity matrix as a tensor of shape (speakers_per_batch,
        utterances_per_speaker, speakers_per_batch)
        """
        speakers_per_batch, utterances_per_speaker = embeds.shape[:2]

        # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
        centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
        centroids_incl = centroids_incl.clone() / (torch.norm(centroids_incl, dim=2, keepdim=True) + 1e-5)

        # Exclusive centroids (1 per utterance)
        centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
        centroids_excl /= (utterances_per_speaker - 1)
        centroids_excl = centroids_excl.clone() / (torch.norm(centroids_excl, dim=2, keepdim=True) + 1e-5)

        # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
        # product of these vectors (which is just an element-wise multiplication reduced by a sum).
        # We vectorize the computation for efficiency.
        sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
                                 speakers_per_batch).to(self.loss_device)
        mask_matrix = 1 - np.eye(speakers_per_batch, dtype=int)  # plain int: np.int was removed in recent NumPy
        for j in range(speakers_per_batch):
            mask = np.where(mask_matrix[j])[0]
            sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
            sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)

        ## Even more vectorized version (slower maybe because of transpose)
        # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
        #                           ).to(self.loss_device)
        # eye = np.eye(speakers_per_batch, dtype=int)
        # mask = np.where(1 - eye)
        # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
        # mask = np.where(eye)
        # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
        # sim_matrix2 = sim_matrix2.transpose(1, 2)

        sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
        return sim_matrix

    def loss(self, embeds):
        """
        Computes the softmax loss according to section 2.1 of GE2E.

        :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
        utterances_per_speaker, embedding_size)
        :return: the loss and the EER for this batch of embeddings.
        """
        speakers_per_batch, utterances_per_speaker = embeds.shape[:2]

        # Loss
        sim_matrix = self.similarity_matrix(embeds)
        sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
                                         speakers_per_batch))
        ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
        target = torch.from_numpy(ground_truth).long().to(self.loss_device)
        loss = self.loss_fn(sim_matrix, target)

        # EER (not backpropagated)
        with torch.no_grad():
            inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=int)[0]
            labels = np.array([inv_argmax(i) for i in ground_truth])
            preds = sim_matrix.detach().cpu().numpy()

            # Snippet from https://yangcha.github.io/EER-ROC/
            fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
            eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)

        return loss, eer
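
A quick shape sanity check for the GE2E pieces above, run on CPU with a small random batch (4 speakers x 5 utterances, much smaller than the training defaults in params_model.py; the inputs are purely illustrative):

# Illustrative sketch: exercise SpeakerEncoder.forward() and .loss() with random data.
import torch
from encoder.model import SpeakerEncoder
from encoder.params_data import mel_n_channels, partials_n_frames

device = torch.device("cpu")
model = SpeakerEncoder(device, loss_device=device)

utterances = torch.rand(4 * 5, partials_n_frames, mel_n_channels)  # (batch, n_frames, n_channels)
embeds = model(utterances)                                         # (20, 256), rows L2-normalized
embeds = embeds.view(4, 5, -1)                                     # regroup per speaker for GE2E
loss, eer = model.loss(embeds)
print(float(loss), eer)                                            # loss near ln(4) for random input
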

backend/encoder/params_data.py
ADDED
@@ -0,0 +1,29 @@

## Mel-filterbank
mel_window_length = 25  # In milliseconds
mel_window_step = 10    # In milliseconds
mel_n_channels = 40


## Audio
sampling_rate = 16000
# Number of spectrogram frames in a partial utterance
partials_n_frames = 160     # 1600 ms
# Number of spectrogram frames at inference
inference_n_frames = 80     #  800 ms


## Voice Activation Detection
# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
# This sets the granularity of the VAD. Should not need to be changed.
vad_window_length = 30  # In milliseconds
# Number of frames to average together when performing the moving average smoothing.
# The larger this value, the larger the VAD variations must be to not get smoothed out.
vad_moving_average_width = 8
# Maximum number of consecutive silent frames a segment can have.
vad_max_silence_length = 6


## Audio volume normalization
audio_norm_target_dBFS = -30

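
compute_partial_slices in encoder/inference.py assumes exactly these values; a quick arithmetic check of what they imply (illustrative only):

# Illustrative arithmetic: how the mel parameters translate to samples and seconds.
from encoder.params_data import sampling_rate, mel_window_step, partials_n_frames

samples_per_frame = int(sampling_rate * mel_window_step / 1000)    # 16000 * 10 / 1000 = 160
partial_samples = partials_n_frames * samples_per_frame            # 160 frames -> 25600 samples
print(partial_samples / sampling_rate)                             # 1.6 s, matching the comment above
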

backend/encoder/params_model.py
ADDED
@@ -0,0 +1,11 @@

## Model parameters
model_hidden_size = 256
model_embedding_size = 256
model_num_layers = 3


## Training parameters
learning_rate_init = 1e-4
speakers_per_batch = 64
utterances_per_speaker = 10

backend/enrolled_voices/voice_26bfa1ef.mp3
ADDED
Binary file (20.6 kB)

backend/enrolled_voices/voice_72beeda9.mp3
ADDED
Binary file (20.6 kB)

backend/enrolled_voices/voices.json
ADDED
@@ -0,0 +1,100 @@
[
  {
    "id": "voice_705f524b",
    "name": "Pragyan",
    "filename": "voice_705f524b.wav",
    "filepath": "enrolled_voices\\voice_705f524b.wav",
    "createdAt": "2025-11-05T11:15:58.834934"
  },
  {
    "id": "voice_5b7e198d",
    "name": "Pragyan",
    "filename": "voice_5b7e198d.wav",
    "filepath": "enrolled_voices\\voice_5b7e198d.wav",
    "createdAt": "2025-11-05T11:23:18.943413"
  },
  {
    "id": "voice_e0a7c06e",
    "name": "Pragyan",
    "filename": "voice_e0a7c06e.mp3",
    "filepath": "enrolled_voices\\voice_e0a7c06e.mp3",
    "createdAt": "2025-11-05T11:31:33.094765"
  },
  {
    "id": "voice_7d278c5f",
    "name": "mY",
    "filename": "voice_7d278c5f.mp3",
    "filepath": "enrolled_voices\\voice_7d278c5f.mp3",
    "createdAt": "2025-11-05T11:49:35.933861"
  },
  {
    "id": "voice_44c22d65",
    "name": "My1",
    "filename": "voice_44c22d65.mp3",
    "filepath": "enrolled_voices\\voice_44c22d65.mp3",
    "createdAt": "2025-11-05T11:49:52.844973"
  },
  {
    "id": "voice_eb54f62d",
    "name": "MY2",
    "filename": "voice_eb54f62d.mp3",
    "filepath": "enrolled_voices\\voice_eb54f62d.mp3",
    "createdAt": "2025-11-05T11:50:13.886497"
  },
  {
    "id": "voice_ecb824ec",
    "name": "Monu",
    "filename": "voice_ecb824ec.wav",
    "filepath": "enrolled_voices\\voice_ecb824ec.wav",
    "createdAt": "2025-11-06T10:28:22.279407"
  },
  {
    "id": "voice_0adf8594",
    "name": "Pragyan1",
    "filename": "voice_0adf8594.wav",
    "filepath": "enrolled_voices\\voice_0adf8594.wav",
    "createdAt": "2025-11-06T14:22:06.737234"
  },
  {
    "id": "voice_fd577924",
    "name": "MY3",
    "filename": "voice_fd577924.wav",
    "filepath": "enrolled_voices\\voice_fd577924.wav",
    "createdAt": "2025-11-20T15:15:40.488404"
  },
  {
    "id": "voice_a51275b7",
    "name": "Testing Voice",
    "filename": "voice_a51275b7.wav",
    "filepath": "enrolled_voices\\voice_a51275b7.wav",
    "createdAt": "2025-11-20T15:23:43.665441"
  },
  {
    "id": "voice_ea85f251",
    "name": "test",
    "filename": "voice_ea85f251.wav",
    "filepath": "enrolled_voices\\voice_ea85f251.wav",
    "createdAt": "2025-11-25T09:47:22.148753"
  },
  {
    "id": "voice_a4e34f00",
    "name": "Class",
    "filename": "voice_a4e34f00.wav",
    "filepath": "enrolled_voices\\voice_a4e34f00.wav",
    "createdAt": "2025-11-25T10:32:08.525704"
  },
  {
    "id": "voice_26bfa1ef",
    "name": "Saksham voice",
    "filename": "voice_26bfa1ef.mp3",
    "filepath": "E:\\Sem 5\\mini proejct main\\pragyan branch\\backend\\enrolled_voices\\voice_26bfa1ef.mp3",
    "createdAt": "2025-11-28T11:08:59.773738"
  },
  {
    "id": "voice_72beeda9",
    "name": "Saksham voice",
    "filename": "voice_72beeda9.mp3",
    "filepath": "E:\\Sem 5\\mini proejct main\\pragyan branch\\backend\\enrolled_voices\\voice_72beeda9.mp3",
    "createdAt": "2025-11-28T11:16:33.409663"
  }
]
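
Worth noting about this registry: the stored filepath values use Windows backslash separators, and the last two entries are absolute paths that will not resolve on a non-Windows host. A small, hypothetical reader that falls back to the filename field, purely as a sketch (the VOICES_DIR location and the helper name are assumptions, not part of this commit):

# Illustrative reader for voices.json with a portable path fallback.
import json
from pathlib import Path

VOICES_DIR = Path("backend/enrolled_voices")   # assumed location of this folder

def load_enrolled_voices(voices_dir=VOICES_DIR):
    entries = json.loads((voices_dir / "voices.json").read_text())
    resolved = []
    for entry in entries:
        candidate = Path(entry["filepath"])
        if not candidate.exists():             # e.g. an "E:\\..." path recorded on another machine
            candidate = voices_dir / entry["filename"]
        resolved.append({**entry, "filepath": str(candidate)})
    return resolved

for voice in load_enrolled_voices():
    print(voice["id"], voice["name"], voice["filepath"])
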

backend/requirements.txt
ADDED
@@ -0,0 +1,14 @@
flask==2.3.3
flask-cors==4.0.0
gunicorn==21.2.0
torch>=2.5.0
librosa>=0.10.0
soundfile>=0.12.0
numpy>=1.21.0
huggingface_hub>=0.19.0
matplotlib>=3.5.0
webrtcvad==2.0.10
scipy>=1.6.0
scikit-learn>=1.1.0
unidecode>=1.2.0
inflect>=6.0.0

backend/runtime.txt
ADDED
@@ -0,0 +1 @@
python-3.10.0

backend/synthesizer/__init__.py
ADDED
@@ -0,0 +1 @@
#

backend/synthesizer/audio.py
ADDED
@@ -0,0 +1,211 @@
import librosa
import librosa.filters
import numpy as np
from scipy import signal
from scipy.io import wavfile
import soundfile as sf


def load_wav(path, sr):
    return librosa.core.load(path, sr=sr)[0]

def save_wav(wav, path, sr):
    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    #proposed by @dsmiller
    wavfile.write(path, sr, wav.astype(np.int16))

def save_wavenet_wav(wav, path, sr):
    sf.write(path, wav.astype(np.float32), sr)

def preemphasis(wav, k, preemphasize=True):
    if preemphasize:
        return signal.lfilter([1, -k], [1], wav)
    return wav

def inv_preemphasis(wav, k, inv_preemphasize=True):
    if inv_preemphasize:
        return signal.lfilter([1], [1, -k], wav)
    return wav

#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
def start_and_end_indices(quantized, silence_threshold=2):
    for start in range(quantized.size):
        if abs(quantized[start] - 127) > silence_threshold:
            break
    for end in range(quantized.size - 1, 1, -1):
        if abs(quantized[end] - 127) > silence_threshold:
            break

    assert abs(quantized[start] - 127) > silence_threshold
    assert abs(quantized[end] - 127) > silence_threshold

    return start, end

def get_hop_size(hparams):
    hop_size = hparams.hop_size
    if hop_size is None:
        assert hparams.frame_shift_ms is not None
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size

def linearspectrogram(wav, hparams):
    D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
    S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db

    if hparams.signal_normalization:
        return _normalize(S, hparams)
    return S

def melspectrogram(wav, hparams):
    D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
    S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db

    if hparams.signal_normalization:
        return _normalize(S, hparams)
    return S

def inv_linear_spectrogram(linear_spectrogram, hparams):
    """Converts linear spectrogram to waveform using librosa"""
    if hparams.signal_normalization:
        D = _denormalize(linear_spectrogram, hparams)
    else:
        D = linear_spectrogram

    S = _db_to_amp(D + hparams.ref_level_db)  #Convert back to linear

    if hparams.use_lws:
        processor = _lws_processor(hparams)
        D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
        y = processor.istft(D).astype(np.float32)
        return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
    else:
        return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)

def inv_mel_spectrogram(mel_spectrogram, hparams):
    """Converts mel spectrogram to waveform using librosa"""
    if hparams.signal_normalization:
        D = _denormalize(mel_spectrogram, hparams)
    else:
        D = mel_spectrogram

    S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams)  # Convert back to linear

    if hparams.use_lws:
        processor = _lws_processor(hparams)
        D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
        y = processor.istft(D).astype(np.float32)
        return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
    else:
        return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)

def _lws_processor(hparams):
    import lws
    return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech")

def _griffin_lim(S, hparams):
    """librosa implementation of Griffin-Lim
    Based on https://github.com/librosa/librosa/issues/434
    """
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    S_complex = np.abs(S).astype(complex)  # plain complex: np.complex was removed in recent NumPy
    y = _istft(S_complex * angles, hparams)
    for i in range(hparams.griffin_lim_iters):
        angles = np.exp(1j * np.angle(_stft(y, hparams)))
        y = _istft(S_complex * angles, hparams)
    return y

def _stft(y, hparams):
    if hparams.use_lws:
        return _lws_processor(hparams).stft(y).T
    else:
        return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size)

def _istft(y, hparams):
    return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)

##########################################################
#Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
def num_frames(length, fsize, fshift):
    """Compute number of time frames of spectrogram
    """
    pad = (fsize - fshift)
    if length % fshift == 0:
        M = (length + pad * 2 - fsize) // fshift + 1
    else:
        M = (length + pad * 2 - fsize) // fshift + 2
    return M


def pad_lr(x, fsize, fshift):
    """Compute left and right padding
    """
    M = num_frames(len(x), fsize, fshift)
    pad = (fsize - fshift)
    T = len(x) + 2 * pad
    r = (M - 1) * fshift + fsize - T
    return pad, pad + r
##########################################################
#Librosa correct padding
def librosa_pad_lr(x, fsize, fshift):
    return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0]

# Conversions
_mel_basis = None
_inv_mel_basis = None

def _linear_to_mel(spectogram, hparams):
    global _mel_basis
    if _mel_basis is None:
        _mel_basis = _build_mel_basis(hparams)
    return np.dot(_mel_basis, spectogram)

def _mel_to_linear(mel_spectrogram, hparams):
    global _inv_mel_basis
    if _inv_mel_basis is None:
        _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
    return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))

def _build_mel_basis(hparams):
    assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(
        sr=hparams.sample_rate,
        n_fft=hparams.n_fft,
        n_mels=hparams.num_mels,
        fmin=hparams.fmin,
        fmax=hparams.fmax
    )

def _amp_to_db(x, hparams):
    min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
    return 20 * np.log10(np.maximum(min_level, x))

def _db_to_amp(x):
    return np.power(10.0, (x) * 0.05)

def _normalize(S, hparams):
    if hparams.allow_clipping_in_normalization:
        if hparams.symmetric_mels:
            return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,
                           -hparams.max_abs_value, hparams.max_abs_value)
        else:
            return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value)

    assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
    if hparams.symmetric_mels:
        return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value
    else:
        return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db))

def _denormalize(D, hparams):
    if hparams.allow_clipping_in_normalization:
        if hparams.symmetric_mels:
            return (((np.clip(D, -hparams.max_abs_value,
                              hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
                    + hparams.min_level_db)
        else:
            return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)

    if hparams.symmetric_mels:
        return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db)
    else:
        return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
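
A roundtrip sketch showing how melspectrogram and inv_mel_spectrogram fit together with the shared hparams (illustrative only; the input path is a placeholder, and Griffin-Lim here is just a quick preview, not the vocoder used elsewhere in this backend):

# Illustrative mel roundtrip: waveform -> normalized mel -> Griffin-Lim reconstruction.
from synthesizer import audio
from synthesizer.hparams import hparams

wav = audio.load_wav("sample.wav", sr=hparams.sample_rate)   # placeholder input file
mel = audio.melspectrogram(wav, hparams)                     # (num_mels, n_frames), clipped to [-4, 4]
print(mel.shape, mel.min(), mel.max())

recon = audio.inv_mel_spectrogram(mel, hparams)              # Griffin-Lim path since use_lws = False
audio.save_wav(recon, "griffin_lim_preview.wav", sr=hparams.sample_rate)
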

backend/synthesizer/hparams.py
ADDED
@@ -0,0 +1,92 @@
import ast
import pprint

class HParams(object):
    def __init__(self, **kwargs): self.__dict__.update(kwargs)
    def __setitem__(self, key, value): setattr(self, key, value)
    def __getitem__(self, key): return getattr(self, key)
    def __repr__(self): return pprint.pformat(self.__dict__)

    def parse(self, string):
        # Overrides hparams from a comma-separated string of name=value pairs
        if len(string) > 0:
            overrides = [s.split("=") for s in string.split(",")]
            keys, values = zip(*overrides)
            keys = list(map(str.strip, keys))
            values = list(map(str.strip, values))
            for k in keys:
                self.__dict__[k] = ast.literal_eval(values[keys.index(k)])
        return self

hparams = HParams(
    ### Signal Processing (used in both synthesizer and vocoder)
    sample_rate = 16000,
    n_fft = 800,
    num_mels = 80,
    hop_size = 200,                 # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)
    win_size = 800,                 # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)
    fmin = 55,
    min_level_db = -100,
    ref_level_db = 20,
    max_abs_value = 4.,             # Gradient explodes if too big, premature convergence if too small.
    preemphasis = 0.97,             # Filter coefficient to use if preemphasize is True
    preemphasize = True,

    ### Tacotron Text-to-Speech (TTS)
    tts_embed_dims = 512,           # Embedding dimension for the graphemes/phoneme inputs
    tts_encoder_dims = 256,
    tts_decoder_dims = 128,
    tts_postnet_dims = 512,
    tts_encoder_K = 5,
    tts_lstm_dims = 1024,
    tts_postnet_K = 5,
    tts_num_highways = 4,
    tts_dropout = 0.5,
    tts_cleaner_names = ["english_cleaners"],
    tts_stop_threshold = -3.4,      # Value below which audio generation ends.
                                    # For example, for a range of [-4, 4], this
                                    # will terminate the sequence at the first
                                    # frame that has all values < -3.4

    ### Tacotron Training
    tts_schedule = [(2, 1e-3, 20_000, 12),   # Progressive training schedule
                    (2, 5e-4, 40_000, 12),   # (r, lr, step, batch_size)
                    (2, 2e-4, 80_000, 12),   #
                    (2, 1e-4, 160_000, 12),  # r = reduction factor (# of mel frames
                    (2, 3e-5, 320_000, 12),  #     synthesized for each decoder iteration)
                    (2, 1e-5, 640_000, 12)], # lr = learning rate

    tts_clip_grad_norm = 1.0,       # clips the gradient norm to prevent explosion - set to None if not needed
    tts_eval_interval = 500,        # Number of steps between model evaluation (sample generation)
                                    # Set to -1 to generate after completing epoch, or 0 to disable

    tts_eval_num_samples = 1,       # Makes this number of samples

    ### Data Preprocessing
    max_mel_frames = 900,
    rescale = True,
    rescaling_max = 0.9,
    synthesis_batch_size = 16,      # For vocoder preprocessing and inference.

    ### Mel Visualization and Griffin-Lim
    signal_normalization = True,
    power = 1.5,
    griffin_lim_iters = 60,

    ### Audio processing options
    fmax = 7600,                    # Should not exceed (sample_rate // 2)
    allow_clipping_in_normalization = True,  # Used when signal_normalization = True
    clip_mels_length = True,        # If true, discards samples exceeding max_mel_frames
    use_lws = False,                # "Fast spectrogram phase recovery using local weighted sums"
    symmetric_mels = True,          # Sets mel range to [-max_abs_value, max_abs_value] if True,
                                    # and [0, max_abs_value] if False
    trim_silence = True,            # Use with sample_rate of 16000 for best results

    ### SV2TTS
    speaker_embedding_size = 256,           # Dimension for the speaker embedding
    silence_min_duration_split = 0.4,       # Duration in seconds of a silence for an utterance to be split
    utterance_min_duration = 1.6,           # Duration in seconds below which utterances are discarded
    )

def hparams_debug_string():
    return str(hparams)
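
HParams.parse accepts a comma-separated string of name=value overrides, each value evaluated with ast.literal_eval. A short illustrative use:

# Illustrative override of two hyperparameters at runtime.
from synthesizer.hparams import hparams

hparams.parse("griffin_lim_iters=30,rescaling_max=0.8")
print(hparams.griffin_lim_iters, hparams.rescaling_max)   # 30 0.8
print(hparams["sample_rate"])                             # dict-style access also works: 16000
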

backend/synthesizer/inference.py
ADDED
@@ -0,0 +1,165 @@
import torch
from synthesizer import audio
from synthesizer.hparams import hparams
from synthesizer.models.tacotron import Tacotron
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import text_to_sequence
from app.vocoder.display import simple_table
from pathlib import Path
from typing import Union, List
import numpy as np
import librosa


class Synthesizer:
    sample_rate = hparams.sample_rate
    hparams = hparams

    def __init__(self, model_fpath: Path, verbose=True):
        """
        The model isn't instantiated and loaded in memory until needed or until load() is called.

        :param model_fpath: path to the trained model file
        :param verbose: if False, prints less information when using the model
        """
        self.model_fpath = model_fpath
        self.verbose = verbose

        # Check for GPU
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        if self.verbose:
            print("Synthesizer using device:", self.device)

        # Tacotron model will be instantiated later on first use.
        self._model = None

    def is_loaded(self):
        """
        Whether the model is loaded in memory.
        """
        return self._model is not None

    def load(self):
        """
        Instantiates and loads the model given the weights file that was passed in the constructor.
        """
        self._model = Tacotron(embed_dims=hparams.tts_embed_dims,
                               num_chars=len(symbols),
                               encoder_dims=hparams.tts_encoder_dims,
                               decoder_dims=hparams.tts_decoder_dims,
                               n_mels=hparams.num_mels,
                               fft_bins=hparams.num_mels,
                               postnet_dims=hparams.tts_postnet_dims,
                               encoder_K=hparams.tts_encoder_K,
                               lstm_dims=hparams.tts_lstm_dims,
                               postnet_K=hparams.tts_postnet_K,
                               num_highways=hparams.tts_num_highways,
                               dropout=hparams.tts_dropout,
                               stop_threshold=hparams.tts_stop_threshold,
                               speaker_embedding_size=hparams.speaker_embedding_size).to(self.device)

        self._model.load(self.model_fpath)
        self._model.eval()

        if self.verbose:
            print("Loaded synthesizer \"%s\" trained to step %d" % (self.model_fpath.name, self._model.state_dict()["step"]))

    def synthesize_spectrograms(self, texts: List[str],
                                embeddings: Union[np.ndarray, List[np.ndarray]],
                                return_alignments=False):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, a matrix representing the alignments between the
        characters and each decoder output step will be returned for each spectrogram
        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
        sequence length of spectrogram i, and possibly the alignments.
        """
        # Load the model on the first request.
        if not self.is_loaded():
            self.load()

        # Preprocess text inputs
        inputs = [text_to_sequence(text.strip(), hparams.tts_cleaner_names) for text in texts]
        if not isinstance(embeddings, list):
            embeddings = [embeddings]

        # Batch inputs
        batched_inputs = [inputs[i:i+hparams.synthesis_batch_size]
                          for i in range(0, len(inputs), hparams.synthesis_batch_size)]
        batched_embeds = [embeddings[i:i+hparams.synthesis_batch_size]
                          for i in range(0, len(embeddings), hparams.synthesis_batch_size)]

        specs = []
        for i, batch in enumerate(batched_inputs, 1):
            if self.verbose:
                print(f"\n| Generating {i}/{len(batched_inputs)}")

            # Pad texts so they are all the same length
            text_lens = [len(text) for text in batch]
            max_text_len = max(text_lens)
            chars = [pad1d(text, max_text_len) for text in batch]
            chars = np.stack(chars)

            # Stack speaker embeddings into 2D array for batch processing
            speaker_embeds = np.stack(batched_embeds[i-1])

            # Convert to tensor
            chars = torch.tensor(chars).long().to(self.device)
            speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)

            # Inference
            _, mels, alignments = self._model.generate(chars, speaker_embeddings)
            mels = mels.detach().cpu().numpy()
            for m in mels:
                # Trim silence from end of each spectrogram
                while np.max(m[:, -1]) < hparams.tts_stop_threshold:
                    m = m[:, :-1]
                specs.append(m)

        if self.verbose:
            print("\n\nDone.\n")
        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def load_preprocess_wav(fpath):
        """
        Loads and preprocesses an audio file under the same conditions the audio files were used to
        train the synthesizer.
        """
        wav = librosa.load(str(fpath), sr=hparams.sample_rate)[0]  # sr must be a keyword with librosa >= 0.10
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max
        return wav

    @staticmethod
    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
        """
        Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
        were fed to the synthesizer when training.
        """
        if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
            wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
        else:
            wav = fpath_or_wav

        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
        return mel_spectrogram

    @staticmethod
    def griffin_lim(mel):
        """
        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
        with the same parameters present in hparams.py.
        """
        return audio.inv_mel_spectrogram(mel, hparams)


def pad1d(x, max_len, pad_value=0):
    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)
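
A minimal usage sketch for the Synthesizer class, assuming a trained checkpoint is available at the placeholder path below; the random embedding stands in for a real encoder output and is for illustration only:

# Illustrative: text + speaker embedding -> mel spectrogram -> quick Griffin-Lim preview.
import numpy as np
from pathlib import Path
from synthesizer.inference import Synthesizer

synth = Synthesizer(Path("models/default/synthesizer.pt"))    # assumed checkpoint location
embed = np.random.rand(256).astype(np.float32)                # stand-in for an encoder embedding
embed /= np.linalg.norm(embed)

specs = synth.synthesize_spectrograms(["Hello from the voice cloning backend."], [embed])
wav = Synthesizer.griffin_lim(specs[0])
print(specs[0].shape, wav.shape)                              # (80, M) mel and the preview waveform
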

backend/synthesizer/models/tacotron.py
ADDED
@@ -0,0 +1,542 @@
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
from typing import Union


class HighwayNetwork(nn.Module):
    def __init__(self, size):
        super().__init__()
        self.W1 = nn.Linear(size, size)
        self.W2 = nn.Linear(size, size)
        self.W1.bias.data.fill_(0.)

    def forward(self, x):
        x1 = self.W1(x)
        x2 = self.W2(x)
        g = torch.sigmoid(x2)
        y = g * F.relu(x1) + (1. - g) * x
        return y


class Encoder(nn.Module):
    def __init__(self, embed_dims, num_chars, encoder_dims, K, num_highways, dropout):
        super().__init__()
        prenet_dims = (encoder_dims, encoder_dims)
        cbhg_channels = encoder_dims
        self.embedding = nn.Embedding(num_chars, embed_dims)
        self.pre_net = PreNet(embed_dims, fc1_dims=prenet_dims[0], fc2_dims=prenet_dims[1],
                              dropout=dropout)
        self.cbhg = CBHG(K=K, in_channels=cbhg_channels, channels=cbhg_channels,
                         proj_channels=[cbhg_channels, cbhg_channels],
                         num_highways=num_highways)

    def forward(self, x, speaker_embedding=None):
        x = self.embedding(x)
        x = self.pre_net(x)
        x.transpose_(1, 2)
        x = self.cbhg(x)
        if speaker_embedding is not None:
            x = self.add_speaker_embedding(x, speaker_embedding)
        return x

    def add_speaker_embedding(self, x, speaker_embedding):
        # SV2TTS
        # The input x is the encoder output and is a 3D tensor with size (batch_size, num_chars, tts_embed_dims)
        # When training, speaker_embedding is also a 2D tensor with size (batch_size, speaker_embedding_size)
        # (for inference, speaker_embedding is a 1D tensor with size (speaker_embedding_size))
        # This concats the speaker embedding for each char in the encoder output

        # Save the dimensions as human-readable names
        batch_size = x.size()[0]
        num_chars = x.size()[1]

        if speaker_embedding.dim() == 1:
            idx = 0
        else:
            idx = 1

        # Start by making a copy of each speaker embedding to match the input text length
        # The output of this has size (batch_size, num_chars * tts_embed_dims)
        speaker_embedding_size = speaker_embedding.size()[idx]
        e = speaker_embedding.repeat_interleave(num_chars, dim=idx)

        # Reshape it and transpose
        e = e.reshape(batch_size, speaker_embedding_size, num_chars)
        e = e.transpose(1, 2)

        # Concatenate the tiled speaker embedding with the encoder output
        x = torch.cat((x, e), 2)
        return x


class BatchNormConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel, relu=True):
        super().__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel, stride=1, padding=kernel // 2, bias=False)
        self.bnorm = nn.BatchNorm1d(out_channels)
        self.relu = relu

    def forward(self, x):
        x = self.conv(x)
        x = F.relu(x) if self.relu is True else x
        return self.bnorm(x)


class CBHG(nn.Module):
    def __init__(self, K, in_channels, channels, proj_channels, num_highways):
        super().__init__()

        # List of all rnns to call `flatten_parameters()` on
        self._to_flatten = []

        self.bank_kernels = [i for i in range(1, K + 1)]
        self.conv1d_bank = nn.ModuleList()
        for k in self.bank_kernels:
            conv = BatchNormConv(in_channels, channels, k)
            self.conv1d_bank.append(conv)

        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)

        self.conv_project1 = BatchNormConv(len(self.bank_kernels) * channels, proj_channels[0], 3)
        self.conv_project2 = BatchNormConv(proj_channels[0], proj_channels[1], 3, relu=False)

        # Fix the highway input if necessary
        if proj_channels[-1] != channels:
            self.highway_mismatch = True
            self.pre_highway = nn.Linear(proj_channels[-1], channels, bias=False)
        else:
            self.highway_mismatch = False

        self.highways = nn.ModuleList()
        for i in range(num_highways):
            hn = HighwayNetwork(channels)
            self.highways.append(hn)

        self.rnn = nn.GRU(channels, channels // 2, batch_first=True, bidirectional=True)
        self._to_flatten.append(self.rnn)

        # Avoid fragmentation of RNN parameters and associated warning
        self._flatten_parameters()

    def forward(self, x):
        # Although we `_flatten_parameters()` on init, when using DataParallel
        # the model gets replicated, making it no longer guaranteed that the
        # weights are contiguous in GPU memory. Hence, we must call it again
        self._flatten_parameters()

        # Save these for later
        residual = x
        seq_len = x.size(-1)
        conv_bank = []

        # Convolution Bank
        for conv in self.conv1d_bank:
            c = conv(x)  # Convolution
            conv_bank.append(c[:, :, :seq_len])

        # Stack along the channel axis
        conv_bank = torch.cat(conv_bank, dim=1)

        # dump the last padding to fit residual
        x = self.maxpool(conv_bank)[:, :, :seq_len]

        # Conv1d projections
        x = self.conv_project1(x)
        x = self.conv_project2(x)

        # Residual Connect
        x = x + residual

        # Through the highways
        x = x.transpose(1, 2)
        if self.highway_mismatch is True:
            x = self.pre_highway(x)
        for h in self.highways: x = h(x)

        # And then the RNN
        x, _ = self.rnn(x)
        return x

    def _flatten_parameters(self):
        """Calls `flatten_parameters` on all the rnns used by the WaveRNN. Used
        to improve efficiency and avoid PyTorch yelling at us."""
        [m.flatten_parameters() for m in self._to_flatten]


class PreNet(nn.Module):
    def __init__(self, in_dims, fc1_dims=256, fc2_dims=128, dropout=0.5):
        super().__init__()
        self.fc1 = nn.Linear(in_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.p = dropout

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = F.dropout(x, self.p, training=True)
        x = self.fc2(x)
        x = F.relu(x)
        x = F.dropout(x, self.p, training=True)
        return x


class Attention(nn.Module):
    def __init__(self, attn_dims):
        super().__init__()
        self.W = nn.Linear(attn_dims, attn_dims, bias=False)
        self.v = nn.Linear(attn_dims, 1, bias=False)

    def forward(self, encoder_seq_proj, query, t):

        # print(encoder_seq_proj.shape)
        # Transform the query vector
        query_proj = self.W(query).unsqueeze(1)

        # Compute the scores
        u = self.v(torch.tanh(encoder_seq_proj + query_proj))
        scores = F.softmax(u, dim=1)

        return scores.transpose(1, 2)


class LSA(nn.Module):
    def __init__(self, attn_dim, kernel_size=31, filters=32):
        super().__init__()
        self.conv = nn.Conv1d(1, filters, padding=(kernel_size - 1) // 2, kernel_size=kernel_size, bias=True)
        self.L = nn.Linear(filters, attn_dim, bias=False)
        self.W = nn.Linear(attn_dim, attn_dim, bias=True)  # Include the attention bias in this term
        self.v = nn.Linear(attn_dim, 1, bias=False)
        self.cumulative = None
        self.attention = None

    def init_attention(self, encoder_seq_proj):
        device = next(self.parameters()).device  # use same device as parameters
        b, t, c = encoder_seq_proj.size()
        self.cumulative = torch.zeros(b, t, device=device)
        self.attention = torch.zeros(b, t, device=device)

    def forward(self, encoder_seq_proj, query, t, chars):

        if t == 0: self.init_attention(encoder_seq_proj)

        processed_query = self.W(query).unsqueeze(1)

        location = self.cumulative.unsqueeze(1)
        processed_loc = self.L(self.conv(location).transpose(1, 2))

        u = self.v(torch.tanh(processed_query + encoder_seq_proj + processed_loc))
        u = u.squeeze(-1)

        # Mask zero padding chars
        u = u * (chars != 0).float()

        # Smooth Attention
        # scores = torch.sigmoid(u) / torch.sigmoid(u).sum(dim=1, keepdim=True)
        scores = F.softmax(u, dim=1)
        self.attention = scores
        self.cumulative = self.cumulative + self.attention

        return scores.unsqueeze(-1).transpose(1, 2)


class Decoder(nn.Module):
    # Class variable because its value doesn't change between classes
    # yet ought to be scoped by class because it's a property of a Decoder
    max_r = 20
    def __init__(self, n_mels, encoder_dims, decoder_dims, lstm_dims,
                 dropout, speaker_embedding_size):
        super().__init__()
        self.register_buffer("r", torch.tensor(1, dtype=torch.int))
        self.n_mels = n_mels
        prenet_dims = (decoder_dims * 2, decoder_dims * 2)
        self.prenet = PreNet(n_mels, fc1_dims=prenet_dims[0], fc2_dims=prenet_dims[1],
                             dropout=dropout)
        self.attn_net = LSA(decoder_dims)
        self.attn_rnn = nn.GRUCell(encoder_dims + prenet_dims[1] + speaker_embedding_size, decoder_dims)
        self.rnn_input = nn.Linear(encoder_dims + decoder_dims + speaker_embedding_size, lstm_dims)
        self.res_rnn1 = nn.LSTMCell(lstm_dims, lstm_dims)
        self.res_rnn2 = nn.LSTMCell(lstm_dims, lstm_dims)
        self.mel_proj = nn.Linear(lstm_dims, n_mels * self.max_r, bias=False)
        self.stop_proj = nn.Linear(encoder_dims + speaker_embedding_size + lstm_dims, 1)

    def zoneout(self, prev, current, p=0.1):
        device = next(self.parameters()).device  # Use same device as parameters
        mask = torch.zeros(prev.size(), device=device).bernoulli_(p)
        return prev * mask + current * (1 - mask)

    def forward(self, encoder_seq, encoder_seq_proj, prenet_in,
                hidden_states, cell_states, context_vec, t, chars):

        # Need this for reshaping mels
        batch_size = encoder_seq.size(0)

        # Unpack the hidden and cell states
        attn_hidden, rnn1_hidden, rnn2_hidden = hidden_states
        rnn1_cell, rnn2_cell = cell_states

        # PreNet for the Attention RNN
        prenet_out = self.prenet(prenet_in)

        # Compute the Attention RNN hidden state
        attn_rnn_in = torch.cat([context_vec, prenet_out], dim=-1)
        attn_hidden = self.attn_rnn(attn_rnn_in.squeeze(1), attn_hidden)

        # Compute the attention scores
        scores = self.attn_net(encoder_seq_proj, attn_hidden, t, chars)

        # Dot product to create the context vector
        context_vec = scores @ encoder_seq
        context_vec = context_vec.squeeze(1)

        # Concat Attention RNN output w. Context Vector & project
        x = torch.cat([context_vec, attn_hidden], dim=1)
        x = self.rnn_input(x)

        # Compute first Residual RNN
        rnn1_hidden_next, rnn1_cell = self.res_rnn1(x, (rnn1_hidden, rnn1_cell))
        if self.training:
            rnn1_hidden = self.zoneout(rnn1_hidden, rnn1_hidden_next)
        else:
            rnn1_hidden = rnn1_hidden_next
        x = x + rnn1_hidden

        # Compute second Residual RNN
        rnn2_hidden_next, rnn2_cell = self.res_rnn2(x, (rnn2_hidden, rnn2_cell))
        if self.training:
            rnn2_hidden = self.zoneout(rnn2_hidden, rnn2_hidden_next)
        else:
            rnn2_hidden = rnn2_hidden_next
        x = x + rnn2_hidden

        # Project Mels
        mels = self.mel_proj(x)
        mels = mels.view(batch_size, self.n_mels, self.max_r)[:, :, :self.r]
        hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
        cell_states = (rnn1_cell, rnn2_cell)

        # Stop token prediction
        s = torch.cat((x, context_vec), dim=1)
        s = self.stop_proj(s)
        stop_tokens = torch.sigmoid(s)

        return mels, scores, hidden_states, cell_states, context_vec, stop_tokens


class Tacotron(nn.Module):
    def __init__(self, embed_dims, num_chars, encoder_dims, decoder_dims, n_mels,
                 fft_bins, postnet_dims, encoder_K, lstm_dims, postnet_K, num_highways,
                 dropout, stop_threshold, speaker_embedding_size):
        super().__init__()
        self.n_mels = n_mels
        self.lstm_dims = lstm_dims
        self.encoder_dims = encoder_dims
        self.decoder_dims = decoder_dims
        self.speaker_embedding_size = speaker_embedding_size
        self.encoder = Encoder(embed_dims, num_chars, encoder_dims,
                               encoder_K, num_highways, dropout)
        self.encoder_proj = nn.Linear(encoder_dims + speaker_embedding_size, decoder_dims, bias=False)
        self.decoder = Decoder(n_mels, encoder_dims, decoder_dims, lstm_dims,
                               dropout, speaker_embedding_size)
        self.postnet = CBHG(postnet_K, n_mels, postnet_dims,
                            [postnet_dims, fft_bins], num_highways)
        self.post_proj = nn.Linear(postnet_dims, fft_bins, bias=False)

        self.init_model()
        self.num_params()

        self.register_buffer("step", torch.zeros(1, dtype=torch.long))
        self.register_buffer("stop_threshold", torch.tensor(stop_threshold, dtype=torch.float32))

    @property
    def r(self):
        return self.decoder.r.item()

    @r.setter
    def r(self, value):
        self.decoder.r = self.decoder.r.new_tensor(value, requires_grad=False)

    def forward(self, x, m, speaker_embedding):
        device = next(self.parameters()).device  # use same device as parameters

        self.step += 1
        batch_size, _, steps = m.size()

        # Initialise all hidden states and pack into tuple
        attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
        rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
        rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
        hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)

        # Initialise all lstm cell states and pack into tuple
|
| 374 |
+
rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
| 375 |
+
rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
| 376 |
+
cell_states = (rnn1_cell, rnn2_cell)
|
| 377 |
+
|
| 378 |
+
# <GO> Frame for start of decoder loop
|
| 379 |
+
go_frame = torch.zeros(batch_size, self.n_mels, device=device)
|
| 380 |
+
|
| 381 |
+
# Need an initial context vector
|
| 382 |
+
context_vec = torch.zeros(batch_size, self.encoder_dims + self.speaker_embedding_size, device=device)
|
| 383 |
+
|
| 384 |
+
# SV2TTS: Run the encoder with the speaker embedding
|
| 385 |
+
# The projection avoids unnecessary matmuls in the decoder loop
|
| 386 |
+
encoder_seq = self.encoder(x, speaker_embedding)
|
| 387 |
+
encoder_seq_proj = self.encoder_proj(encoder_seq)
|
| 388 |
+
|
| 389 |
+
# Need a couple of lists for outputs
|
| 390 |
+
mel_outputs, attn_scores, stop_outputs = [], [], []
|
| 391 |
+
|
| 392 |
+
# Run the decoder loop
|
| 393 |
+
for t in range(0, steps, self.r):
|
| 394 |
+
prenet_in = m[:, :, t - 1] if t > 0 else go_frame
|
| 395 |
+
mel_frames, scores, hidden_states, cell_states, context_vec, stop_tokens = \
|
| 396 |
+
self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
|
| 397 |
+
hidden_states, cell_states, context_vec, t, x)
|
| 398 |
+
mel_outputs.append(mel_frames)
|
| 399 |
+
attn_scores.append(scores)
|
| 400 |
+
stop_outputs.extend([stop_tokens] * self.r)
|
| 401 |
+
|
| 402 |
+
# Concat the mel outputs into sequence
|
| 403 |
+
mel_outputs = torch.cat(mel_outputs, dim=2)
|
| 404 |
+
|
| 405 |
+
# Post-Process for Linear Spectrograms
|
| 406 |
+
postnet_out = self.postnet(mel_outputs)
|
| 407 |
+
linear = self.post_proj(postnet_out)
|
| 408 |
+
linear = linear.transpose(1, 2)
|
| 409 |
+
|
| 410 |
+
# For easy visualisation
|
| 411 |
+
attn_scores = torch.cat(attn_scores, 1)
|
| 412 |
+
# attn_scores = attn_scores.cpu().data.numpy()
|
| 413 |
+
stop_outputs = torch.cat(stop_outputs, 1)
|
| 414 |
+
|
| 415 |
+
return mel_outputs, linear, attn_scores, stop_outputs
|
| 416 |
+
|
| 417 |
+
def generate(self, x, speaker_embedding=None, steps=2000):
|
| 418 |
+
import sys
|
| 419 |
+
|
| 420 |
+
self.eval()
|
| 421 |
+
device = next(self.parameters()).device # use same device as parameters
|
| 422 |
+
|
| 423 |
+
batch_size, _ = x.size()
|
| 424 |
+
|
| 425 |
+
# Need to initialise all hidden states and pack into tuple for tidyness
|
| 426 |
+
attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
|
| 427 |
+
rnn1_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
|
| 428 |
+
rnn2_hidden = torch.zeros(batch_size, self.lstm_dims, device=device)
|
| 429 |
+
hidden_states = (attn_hidden, rnn1_hidden, rnn2_hidden)
|
| 430 |
+
|
| 431 |
+
# Need to initialise all lstm cell states and pack into tuple for tidyness
|
| 432 |
+
rnn1_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
| 433 |
+
rnn2_cell = torch.zeros(batch_size, self.lstm_dims, device=device)
|
| 434 |
+
cell_states = (rnn1_cell, rnn2_cell)
|
| 435 |
+
|
| 436 |
+
# Need a <GO> Frame for start of decoder loop
|
| 437 |
+
go_frame = torch.zeros(batch_size, self.n_mels, device=device)
|
| 438 |
+
|
| 439 |
+
# Need an initial context vector
|
| 440 |
+
context_vec = torch.zeros(batch_size, self.encoder_dims + self.speaker_embedding_size, device=device)
|
| 441 |
+
|
| 442 |
+
# SV2TTS: Run the encoder with the speaker embedding
|
| 443 |
+
# The projection avoids unnecessary matmuls in the decoder loop
|
| 444 |
+
print(" [Tacotron] Running encoder...", end='', flush=True)
|
| 445 |
+
sys.stdout.flush()
|
| 446 |
+
encoder_seq = self.encoder(x, speaker_embedding)
|
| 447 |
+
encoder_seq_proj = self.encoder_proj(encoder_seq)
|
| 448 |
+
print(" OK")
|
| 449 |
+
sys.stdout.flush()
|
| 450 |
+
|
| 451 |
+
# Need a couple of lists for outputs
|
| 452 |
+
mel_outputs, attn_scores, stop_outputs = [], [], []
|
| 453 |
+
|
| 454 |
+
# Run the decoder loop
|
| 455 |
+
print(f" [Tacotron] Decoder loop: 0/{steps} steps", end='')
|
| 456 |
+
sys.stdout.flush()
|
| 457 |
+
for t in range(0, steps, self.r):
|
| 458 |
+
prenet_in = mel_outputs[-1][:, :, -1] if t > 0 else go_frame
|
| 459 |
+
mel_frames, scores, hidden_states, cell_states, context_vec, stop_tokens = \
|
| 460 |
+
self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
|
| 461 |
+
hidden_states, cell_states, context_vec, t, x)
|
| 462 |
+
mel_outputs.append(mel_frames)
|
| 463 |
+
attn_scores.append(scores)
|
| 464 |
+
stop_outputs.extend([stop_tokens] * self.r)
|
| 465 |
+
|
| 466 |
+
# Progress every 100 steps
|
| 467 |
+
if t % 100 == 0:
|
| 468 |
+
print(f"\r [Tacotron] Decoder loop: {t}/{steps} steps", end='')
|
| 469 |
+
sys.stdout.flush()
|
| 470 |
+
|
| 471 |
+
# Stop the loop when all stop tokens in batch exceed threshold
|
| 472 |
+
if (stop_tokens > 0.5).all() and t > 10:
|
| 473 |
+
print(f"\r [Tacotron] Decoder loop: {t}/{steps} steps (stopped early)")
|
| 474 |
+
sys.stdout.flush()
|
| 475 |
+
break
|
| 476 |
+
|
| 477 |
+
print(f"\r [Tacotron] Decoder loop: {len(mel_outputs) * self.r}/{steps} steps (complete)")
|
| 478 |
+
sys.stdout.flush()
|
| 479 |
+
|
| 480 |
+
# Concat the mel outputs into sequence
|
| 481 |
+
print(" [Tacotron] Concatenating and post-processing...", end='', flush=True)
|
| 482 |
+
sys.stdout.flush()
|
| 483 |
+
mel_outputs = torch.cat(mel_outputs, dim=2)
|
| 484 |
+
|
| 485 |
+
# Post-Process for Linear Spectrograms
|
| 486 |
+
postnet_out = self.postnet(mel_outputs)
|
| 487 |
+
linear = self.post_proj(postnet_out)
|
| 488 |
+
|
| 489 |
+
linear = linear.transpose(1, 2)
|
| 490 |
+
|
| 491 |
+
# For easy visualisation
|
| 492 |
+
attn_scores = torch.cat(attn_scores, 1)
|
| 493 |
+
stop_outputs = torch.cat(stop_outputs, 1)
|
| 494 |
+
|
| 495 |
+
print(" OK")
|
| 496 |
+
sys.stdout.flush()
|
| 497 |
+
self.train()
|
| 498 |
+
|
| 499 |
+
return mel_outputs, linear, attn_scores
|
| 500 |
+
|
| 501 |
+
def init_model(self):
|
| 502 |
+
for p in self.parameters():
|
| 503 |
+
if p.dim() > 1: nn.init.xavier_uniform_(p)
|
| 504 |
+
|
| 505 |
+
def get_step(self):
|
| 506 |
+
return self.step.data.item()
|
| 507 |
+
|
| 508 |
+
def reset_step(self):
|
| 509 |
+
# assignment to parameters or buffers is overloaded, updates internal dict entry
|
| 510 |
+
self.step = self.step.data.new_tensor(1)
|
| 511 |
+
|
| 512 |
+
def log(self, path, msg):
|
| 513 |
+
with open(path, "a") as f:
|
| 514 |
+
print(msg, file=f)
|
| 515 |
+
|
| 516 |
+
def load(self, path, optimizer=None):
|
| 517 |
+
# Use device of model params as location for loaded state
|
| 518 |
+
device = next(self.parameters()).device
|
| 519 |
+
checkpoint = torch.load(str(path), map_location=device)
|
| 520 |
+
self.load_state_dict(checkpoint["model_state"])
|
| 521 |
+
|
| 522 |
+
if "optimizer_state" in checkpoint and optimizer is not None:
|
| 523 |
+
optimizer.load_state_dict(checkpoint["optimizer_state"])
|
| 524 |
+
|
| 525 |
+
def save(self, path, optimizer=None):
|
| 526 |
+
if optimizer is not None:
|
| 527 |
+
torch.save({
|
| 528 |
+
"model_state": self.state_dict(),
|
| 529 |
+
"optimizer_state": optimizer.state_dict(),
|
| 530 |
+
}, str(path))
|
| 531 |
+
else:
|
| 532 |
+
torch.save({
|
| 533 |
+
"model_state": self.state_dict(),
|
| 534 |
+
}, str(path))
|
| 535 |
+
|
| 536 |
+
|
| 537 |
+
def num_params(self, print_out=True):
|
| 538 |
+
parameters = filter(lambda p: p.requires_grad, self.parameters())
|
| 539 |
+
parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
|
| 540 |
+
if print_out:
|
| 541 |
+
print("Trainable Parameters: %.3fM" % parameters)
|
| 542 |
+
return parameters
|
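Editor's note: as a quick orientation for readers of this diff, the sketch below shows one way the `Tacotron` class above can be exercised end to end. It is not part of the commit; the hyperparameter values and the checkpoint filename are illustrative placeholders (the real values live in `backend/synthesizer/hparams.py`, and the backend normally drives the model through `synthesizer/inference.py`). It also assumes the `backend/` directory is on `sys.path`.

```python
# Hedged sketch, not part of the commit. All hyperparameter values and the
# checkpoint filename below are placeholders for illustration only.
import torch

from synthesizer.models.tacotron import Tacotron
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import text_to_sequence

model = Tacotron(embed_dims=512, num_chars=len(symbols), encoder_dims=256,
                 decoder_dims=128, n_mels=80, fft_bins=401, postnet_dims=512,
                 encoder_K=5, lstm_dims=1024, postnet_K=5, num_highways=4,
                 dropout=0.5, stop_threshold=-3.4, speaker_embedding_size=256)
model.load("synthesizer.pt")  # hypothetical checkpoint path; restores weights and the r buffer

# One utterance and one (dummy) 256-dim speaker embedding
chars = torch.tensor([text_to_sequence("Hello world.", ["english_cleaners"])])
speaker_embedding = torch.nn.functional.normalize(torch.rand(1, 256), dim=1)

with torch.no_grad():
    mels, linear, attn = model.generate(chars, speaker_embedding)
print(mels.shape)  # (1, n_mels, T) mel frames, ready for the vocoder
```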
backend/synthesizer/utils/__init__.py
ADDED
@@ -0,0 +1,45 @@
import torch


_output_ref = None
_replicas_ref = None

def data_parallel_workaround(model, *input):
    global _output_ref
    global _replicas_ref
    device_ids = list(range(torch.cuda.device_count()))
    output_device = device_ids[0]
    replicas = torch.nn.parallel.replicate(model, device_ids)
    # input.shape = (num_args, batch, ...)
    inputs = torch.nn.parallel.scatter(input, device_ids)
    # inputs.shape = (num_gpus, num_args, batch/num_gpus, ...)
    replicas = replicas[:len(inputs)]
    outputs = torch.nn.parallel.parallel_apply(replicas, inputs)
    y_hat = torch.nn.parallel.gather(outputs, output_device)
    _output_ref = outputs
    _replicas_ref = replicas
    return y_hat


class ValueWindow():
    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        self._values = self._values[-(self._window_size - 1):] + [x]

    @property
    def sum(self):
        return sum(self._values)

    @property
    def count(self):
        return len(self._values)

    @property
    def average(self):
        return self.sum / max(1, self.count)

    def reset(self):
        self._values = []
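Editor's note: `ValueWindow` is a small rolling-window helper apparently retained from the original training code, where it is typically used for smoothed loss and timing reports. A minimal usage example, not part of the commit, assuming `backend/` is on `sys.path`:

```python
from synthesizer.utils import ValueWindow

loss_window = ValueWindow(window_size=3)
for loss in [4.0, 3.0, 2.0, 1.0]:
    loss_window.append(loss)

print(loss_window.count)    # 3   -> only the last three values are kept
print(loss_window.average)  # 2.0 == (3.0 + 2.0 + 1.0) / 3
```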
backend/synthesizer/utils/cleaners.py
ADDED
@@ -0,0 +1,88 @@
"""
Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
     the symbols in symbols.py to match your data).
"""
import re
from unidecode import unidecode
from .numbers import normalize_numbers


# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [
    ("mrs", "misess"),
    ("mr", "mister"),
    ("dr", "doctor"),
    ("st", "saint"),
    ("co", "company"),
    ("jr", "junior"),
    ("maj", "major"),
    ("gen", "general"),
    ("drs", "doctors"),
    ("rev", "reverend"),
    ("lt", "lieutenant"),
    ("hon", "honorable"),
    ("sgt", "sergeant"),
    ("capt", "captain"),
    ("esq", "esquire"),
    ("ltd", "limited"),
    ("col", "colonel"),
    ("ft", "fort"),
]]


def expand_abbreviations(text):
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def expand_numbers(text):
    return normalize_numbers(text)


def lowercase(text):
    """lowercase input tokens."""
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, " ", text)


def convert_to_ascii(text):
    return unidecode(text)


def basic_cleaners(text):
    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def transliteration_cleaners(text):
    """Pipeline for non-English text that transliterates to ASCII."""
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def english_cleaners(text):
    """Pipeline for English text, including number and abbreviation expansion."""
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = collapse_whitespace(text)
    return text
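Editor's note: for readers skimming the diff, here is what the English pipeline above produces on a short string. Not part of the commit; the output shown in the comment follows from the functions defined in this file (abbreviation, currency, and ordinal expansion plus whitespace collapsing).

```python
from synthesizer.utils.cleaners import english_cleaners

print(english_cleaners("Dr. Smith paid $15.50 on   March 3rd."))
# -> "doctor smith paid fifteen dollars, fifty cents on march third."
```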
backend/synthesizer/utils/numbers.py
ADDED
@@ -0,0 +1,69 @@
import re
import inflect


_inflect = inflect.engine()
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
_number_re = re.compile(r"[0-9]+")


def _remove_commas(m):
    return m.group(1).replace(",", "")


def _expand_decimal_point(m):
    return m.group(1).replace(".", " point ")


def _expand_dollars(m):
    match = m.group(1)
    parts = match.split(".")
    if len(parts) > 2:
        return match + " dollars"  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        return "%s %s" % (dollars, dollar_unit)
    elif cents:
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s" % (cents, cent_unit)
    else:
        return "zero dollars"


def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))


def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return "two thousand"
        elif num > 2000 and num < 2010:
            return "two thousand " + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + " hundred"
        else:
            return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
    else:
        return _inflect.number_to_words(num, andword="")


def normalize_numbers(text):
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_pounds_re, r"\1 pounds", text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text
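Editor's note: the expansion rules above apply in a fixed order (comma groups, pounds, dollars, decimals, ordinals, plain numbers), and year-like values between 1000 and 3000 get special treatment. A small illustration, not part of the commit; the expected output in the comment is what these regexes and `inflect` calls should yield:

```python
from synthesizer.utils.numbers import normalize_numbers

print(normalize_numbers("I owe £1,200 and $3.50 for the 2nd time in 1999."))
# Expected:
# "I owe twelve hundred pounds and three dollars, fifty cents for the second time in nineteen ninety-nine."
```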
backend/synthesizer/utils/symbols.py
ADDED
@@ -0,0 +1,17 @@
"""
Defines the set of symbols used in text input to the model.

The default is a set of ASCII characters that works well for English or text that has been run
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
"""
# from . import cmudict

_pad = "_"
_eos = "~"
_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'\"(),-.:;? "

# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
#_arpabet = ["@" + s for s in cmudict.valid_symbols]

# Export all symbols:
symbols = [_pad, _eos] + list(_characters)  #+ _arpabet
backend/synthesizer/utils/text.py
ADDED
@@ -0,0 +1,75 @@
from .symbols import symbols
from . import cleaners
import re


# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}

# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")


def text_to_sequence(text, cleaner_names):
    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through

    Returns:
      List of integers corresponding to the symbols in the text
    """
    sequence = []

    # Check for curly braces and treat their contents as ARPAbet:
    while len(text):
        m = _curly_re.match(text)
        if not m:
            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
            break
        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)

    # Append EOS token
    sequence.append(_symbol_to_id["~"])
    return sequence


def sequence_to_text(sequence):
    """Converts a sequence of IDs back to a string"""
    result = ""
    for symbol_id in sequence:
        if symbol_id in _id_to_symbol:
            s = _id_to_symbol[symbol_id]
            # Enclose ARPAbet back in curly braces:
            if len(s) > 1 and s[0] == "@":
                s = "{%s}" % s[1:]
            result += s
    return result.replace("}{", " ")


def _clean_text(text, cleaner_names):
    for name in cleaner_names:
        cleaner = getattr(cleaners, name)
        if not cleaner:
            raise Exception("Unknown cleaner: %s" % name)
        text = cleaner(text)
    return text


def _symbols_to_sequence(symbols):
    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]


def _arpabet_to_sequence(text):
    return _symbols_to_sequence(["@" + s for s in text.split()])


def _should_keep_symbol(s):
    return s in _symbol_to_id and s not in ("_", "~")
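Editor's note: a short round trip through the helpers above, not part of the commit. Because `_arpabet` is commented out in `symbols.py`, the @-prefixed ARPAbet symbols are not in the symbol set and are silently dropped by `_should_keep_symbol`:

```python
from synthesizer.utils.text import text_to_sequence, sequence_to_text

seq = text_to_sequence("Turn left on {HH AW1 S S T AH0 N} Street.", ["english_cleaners"])
print(seq[-1])                # 1 -> ID of the appended "~" EOS symbol
print(sequence_to_text(seq))  # "turn left on  street.~" -- the curly-brace ARPAbet
                              # block is dropped with the default symbol set
```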
backend/wsgi.py
ADDED
@@ -0,0 +1,15 @@
"""Gunicorn entry point for the voice cloning backend."""

import sys
from pathlib import Path

# Ensure backend directory is in the path for imports
backend_dir = Path(__file__).parent
if str(backend_dir) not in sys.path:
    sys.path.insert(0, str(backend_dir))

from app import app


if __name__ == "__main__":
    app.run()
frontend/.env.development
ADDED
@@ -0,0 +1,4 @@
# Local development
VITE_API_URL=http://localhost:5000
FLASK_ENV=development
DEBUG=true
frontend/.env.production
ADDED
@@ -0,0 +1,2 @@
# Production deployment
VITE_API_URL=https://voice-cloning-personalized-speech.onrender.com
frontend/.gitignore
ADDED
@@ -0,0 +1,99 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*

# Dependencies
/node_modules
/.pnp
.pnp.js

# Testing
/coverage

# Next.js
/.next/
/out/

# Production
/build
/dist
/dist-ssr

# Local env files
.env*.local
.env

# Debug logs
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*

# Editor directories and files
.idea
.vscode/*
!.vscode/extensions.json
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?

# System Files
.DS_Store
Thumbs.db

# Cache
.cache/
.temp/
.tmp/

# Misc
.vercel
.next
.vercel_build_output

# Local Netlify folder
.netlify

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env*.local
.env

# parcel-bundler cache (https://parceljs.org/)
.parcel-cache

# Next.js build output
.next
out

# Vercel
.vercel

# TypeScript
*.tsbuildinfo
next-env.d.ts

# Optional stylelint cache
.stylelintcache
frontend/README.md
ADDED
@@ -0,0 +1,111 @@
# Voice Cloning – Personalized Speech Synthesis (Frontend)

> Note: On first load, please wait 2–3 minutes. The app initializes several 3D elements which can take time to fetch and compile in the browser, including:
> - Spline-powered scenes and backgrounds
> - Interactive Orb (Three.js) with real-time interaction
> - Particle Field and Floating Elements
> - Speaker/Microphone 3D scenes and visualizers

This repository contains the fully custom-built frontend for a Voice Cloning and Personalized Speech Synthesis application.

- Modern, responsive UI with smooth 3D visuals and an accessible design system.

---

## Overview

The frontend provides:

- A clean interface to enroll voice samples and synthesize speech.
- Real-time audio recording, waveform visualization, and playback controls.
- Rich 3D/animated visuals to enhance the user experience (Spline and Three.js).
- A component-driven architecture for maintainability and reusability.

---

## Features

- Audio
  - Audio recorder and waveform visualization
  - Error boundaries and robust UI states

- 3D & Visuals
  - Spline background scenes
  - Interactive Orb, Particle Field, Floating Elements
  - Speaker/Microphone scenes and animated transitions

- UI/UX
  - shadcn/ui components with Tailwind CSS
  - Responsive, accessible design
  - Theming and utility-first styling

---

## Tech Stack

- Vite (bundler & dev server)
- React (UI) + TypeScript
- Tailwind CSS + PostCSS
- shadcn/ui component library
- Three.js & Spline (3D scenes and interactions)
- ESLint (code quality) and modern TS configs

---

## Getting Started

Prerequisites:

- Node.js and npm installed (we recommend installing via nvm)

Install and run:

```bash
npm install
npm run dev
```

Open the local URL printed in the terminal. First load may take 2–3 minutes due to 3D assets.

---

## Available Scripts

- `npm run dev` – Start the development server
- `npm run build` – Build for production into `dist/`
- `npm run preview` – Preview the production build locally

---

## Project Structure (high level)

- `src/`
  - `components/`
    - `audio/` – Recorder, waveform, audio UI
    - `three/` – Interactive Orb, Particle Field, Speaker/Mic scenes, Spline background
    - `ui/` – shadcn/ui component wrappers and utilities
  - `pages/` – App pages and routing
  - `lib/` – Utility functions

- `public/` – Static assets (icons, placeholders, robots.txt)
- `tailwind.config.ts`, `postcss.config.js` – Styling configuration
- `eslint.config.js` – Linting configuration

---

## Deployment

Build a production bundle:

```bash
npm run build
npm run preview
```

Deploy the contents of `dist/` to your hosting of choice (e.g., Netlify, Vercel, GitHub Pages, or a static server).

---

## License

Copyright © the project owner. All rights reserved.
frontend/components.json
ADDED
@@ -0,0 +1,20 @@
{
  "$schema": "https://ui.shadcn.com/schema.json",
  "style": "default",
  "rsc": false,
  "tsx": true,
  "tailwind": {
    "config": "tailwind.config.ts",
    "css": "src/index.css",
    "baseColor": "slate",
    "cssVariables": true,
    "prefix": ""
  },
  "aliases": {
    "components": "@/components",
    "utils": "@/lib/utils",
    "ui": "@/components/ui",
    "lib": "@/lib",
    "hooks": "@/hooks"
  }
}
frontend/eslint.config.js
ADDED
@@ -0,0 +1,29 @@
import js from "@eslint/js";
import globals from "globals";
import reactHooks from "eslint-plugin-react-hooks";
import reactRefresh from "eslint-plugin-react-refresh";
import tseslint from "typescript-eslint";

export default tseslint.config(
  { ignores: ["dist"] },
  {
    extends: [js.configs.recommended, ...tseslint.configs.recommended],
    files: ["**/*.{ts,tsx}"],
    languageOptions: {
      ecmaVersion: 2020,
      globals: globals.browser,
    },
    plugins: {
      "react-hooks": reactHooks,
      "react-refresh": reactRefresh,
    },
    rules: {
      ...reactHooks.configs.recommended.rules,
      "react-refresh/only-export-components": [
        "warn",
        { allowConstantExport: true },
      ],
      "@typescript-eslint/no-unused-vars": "off",
    },
  }
);
frontend/index.html
ADDED
@@ -0,0 +1,24 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Dhwanii Voice Cloning AI</title>
    <meta name="description" content="Voice cloning and speech synthesis demo" />
    <meta name="author" content="Dhwanii Voice Cloning AI" />

    <meta property="og:title" content="Dhwanii Voice Cloning AI" />
    <meta property="og:description" content="Voice cloning and speech synthesis demo" />
    <meta property="og:type" content="website" />
    <meta property="og:image" content="/favicon.ico" />

    <meta name="twitter:card" content="summary" />
    <meta name="twitter:site" content="@Arjitsharma00074" />
    <meta name="twitter:image" content="/favicon.ico" />
  </head>

  <body>
    <div id="root"></div>
    <script type="module" src="/src/main.tsx"></script>
  </body>
</html>
frontend/package-lock.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
frontend/package.json
ADDED
@@ -0,0 +1,88 @@
{
  "name": "vite_react_shadcn_ts",
  "private": true,
  "version": "0.0.0",
  "type": "module",
  "scripts": {
    "dev": "vite",
    "build": "vite build",
    "build:dev": "vite build --mode development",
    "lint": "eslint .",
    "preview": "vite preview"
  },
  "dependencies": {
    "@hookform/resolvers": "^3.10.0",
    "@radix-ui/react-accordion": "^1.2.11",
    "@radix-ui/react-alert-dialog": "^1.1.14",
    "@radix-ui/react-aspect-ratio": "^1.1.7",
    "@radix-ui/react-avatar": "^1.1.10",
    "@radix-ui/react-checkbox": "^1.3.2",
    "@radix-ui/react-collapsible": "^1.1.11",
    "@radix-ui/react-context-menu": "^2.2.15",
    "@radix-ui/react-dialog": "^1.1.14",
    "@radix-ui/react-dropdown-menu": "^2.1.15",
    "@radix-ui/react-hover-card": "^1.1.14",
    "@radix-ui/react-label": "^2.1.7",
    "@radix-ui/react-menubar": "^1.1.15",
    "@radix-ui/react-navigation-menu": "^1.2.13",
    "@radix-ui/react-popover": "^1.1.14",
    "@radix-ui/react-progress": "^1.1.7",
    "@radix-ui/react-radio-group": "^1.3.7",
    "@radix-ui/react-scroll-area": "^1.2.9",
    "@radix-ui/react-select": "^2.2.5",
    "@radix-ui/react-separator": "^1.1.7",
    "@radix-ui/react-slider": "^1.3.5",
    "@radix-ui/react-slot": "^1.2.3",
    "@radix-ui/react-switch": "^1.2.5",
    "@radix-ui/react-tabs": "^1.1.12",
    "@radix-ui/react-toast": "^1.2.14",
    "@radix-ui/react-toggle": "^1.1.9",
    "@radix-ui/react-toggle-group": "^1.1.10",
    "@radix-ui/react-tooltip": "^1.2.7",
    "@react-three/drei": "^9.122.0",
    "@react-three/fiber": "^8.18.0",
    "@react-three/postprocessing": "^2.19.1",
    "@splinetool/react-spline": "^4.1.0",
    "@splinetool/runtime": "^1.10.55",
    "@tanstack/react-query": "^5.83.0",
    "class-variance-authority": "^0.7.1",
    "clsx": "^2.1.1",
    "cmdk": "^1.1.1",
    "date-fns": "^3.6.0",
    "embla-carousel-react": "^8.6.0",
    "input-otp": "^1.4.2",
    "lucide-react": "^0.462.0",
    "next-themes": "^0.3.0",
    "react": "^18.3.1",
    "react-day-picker": "^8.10.1",
    "react-dom": "^18.3.1",
    "react-hook-form": "^7.61.1",
    "react-resizable-panels": "^2.1.9",
    "react-router-dom": "^6.30.1",
    "recharts": "^2.15.4",
    "sonner": "^1.7.4",
    "tailwind-merge": "^2.6.0",
    "tailwindcss-animate": "^1.0.7",
    "three": "^0.169.0",
    "vaul": "^0.9.9",
    "zod": "^3.25.76"
  },
  "devDependencies": {
    "@eslint/js": "^9.32.0",
    "@tailwindcss/typography": "^0.5.16",
    "@types/node": "^22.16.5",
    "@types/react": "^18.3.23",
    "@types/react-dom": "^18.3.7",
    "@vitejs/plugin-react-swc": "^3.11.0",
    "autoprefixer": "^10.4.21",
    "eslint": "^9.32.0",
    "eslint-plugin-react-hooks": "^5.2.0",
    "eslint-plugin-react-refresh": "^0.4.20",
    "globals": "^15.15.0",
    "postcss": "^8.5.6",
    "tailwindcss": "^3.4.17",
    "typescript": "^5.8.3",
    "typescript-eslint": "^8.38.0",
    "vite": "^7.1.4"
  }
}
frontend/postcss.config.js
ADDED
@@ -0,0 +1,6 @@
export default {
  plugins: {
    tailwindcss: {},
    autoprefixer: {},
  },
}
frontend/public/placeholder.svg
ADDED
|
|
frontend/public/robots.txt
ADDED
@@ -0,0 +1,14 @@
User-agent: Googlebot
Allow: /

User-agent: Bingbot
Allow: /

User-agent: Twitterbot
Allow: /

User-agent: facebookexternalhit
Allow: /

User-agent: *
Allow: /