Voice_Clonning / app.py
Reahan
Added Project files
8970226
import os
import time
from flask import Flask, request, jsonify, render_template_string, send_from_directory, url_for
from werkzeug.utils import secure_filename
import threading, uuid, subprocess, shutil
# Reuse existing clone function
from clone_voice import clone_voice as do_clone, warm_model, is_model_loaded
app = Flask(__name__)
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
UPLOAD_DIR = os.path.join(BASE_DIR, "uploads")
OUTPUT_DIR = os.path.join(BASE_DIR, "outputs")
os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Limit upload size to 50MB
app.config["MAX_CONTENT_LENGTH"] = 50 * 1024 * 1024
ALLOWED_EXTENSIONS = {"wav", "mp3", "m4a", "flac", "ogg", "opus", "webm"}
def allowed_file(filename: str) -> bool:
return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
# Audio conversion helpers
_CONVERT_TO_WAV_EXTS = {"webm", "mp4", "m4a"}
def _ffmpeg_path() -> str | None:
return shutil.which("ffmpeg")
def _should_convert_to_wav(path: str) -> bool:
ext = os.path.splitext(path)[1].lower().lstrip(".")
return ext in _CONVERT_TO_WAV_EXTS
def _convert_to_wav(input_path: str) -> str:
ffmpeg = _ffmpeg_path()
if not ffmpeg:
raise RuntimeError("ffmpeg not found on PATH. Install ffmpeg or upload WAV/OGG/OPUS/MP3/M4A.")
output_path = input_path + ".wav"
cmd = [ffmpeg, "-y", "-i", input_path, "-ac", "1", "-ar", "22050", "-vn", output_path]
proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
if proc.returncode != 0:
tail = (proc.stderr or "").splitlines()[-10:]
raise RuntimeError("Audio conversion failed. " + "\n".join(tail))
return output_path
INDEX_HTML = r'''
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>XTTS Voice Cloning Demo</title>
<style>
:root {
--bg1: #0f172a;
--bg2: #111827;
--card-bg: rgba(255, 255, 255, 0.08);
--card-border: rgba(255, 255, 255, 0.15);
--text: #e5e7eb;
--muted: #94a3b8;
--primary: #8b5cf6;
--primary-600: #7c3aed;
--accent: #22d3ee;
--success: #10b981;
--danger: #ef4444;
}
* { box-sizing: border-box; }
html, body { height: 100%; }
body {
margin: 0;
font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans, Helvetica Neue, Arial, "Apple Color Emoji", "Segoe UI Emoji";
color: var(--text);
background: radial-gradient(1200px 800px at 10% 0%, #1f2937, transparent 50%),
radial-gradient(1000px 700px at 90% 0%, #0ea5e9, transparent 50%),
linear-gradient(160deg, var(--bg1), var(--bg2));
overflow-y: auto;
-webkit-font-smoothing: antialiased;
-moz-osx-font-smoothing: grayscale;
}
.container {
min-height: 100%;
display: flex;
align-items: center;
justify-content: center;
padding: 40px 20px;
}
.card {
width: 100%;
max-width: 980px;
background: var(--card-bg);
border: 1px solid var(--card-border);
border-radius: 20px;
backdrop-filter: blur(12px);
-webkit-backdrop-filter: blur(12px);
box-shadow: 0 10px 30px rgba(0,0,0,0.35), inset 0 1px 0 rgba(255,255,255,0.08);
overflow: hidden;
}
.header {
padding: 28px 28px 0 28px;
display: flex;
align-items: center;
justify-content: space-between;
gap: 12px;
}
.title {
display: flex;
align-items: center;
gap: 12px;
}
.badge {
display: inline-block;
font-size: 12px;
letter-spacing: 0.08em;
text-transform: uppercase;
color: white;
background: linear-gradient(135deg, var(--primary), var(--accent));
padding: 6px 10px;
border-radius: 999px;
border: 1px solid rgba(255,255,255,0.25);
}
h1 {
margin: 0;
font-size: 24px;
font-weight: 700;
}
.body {
display: grid;
grid-template-columns: 1.1fr 0.9fr;
gap: 24px;
padding: 24px 28px 28px 28px;
}
@media (max-width: 900px) {
.body { grid-template-columns: 1fr; }
}
.panel {
background: rgba(255,255,255,0.04);
border: 1px solid rgba(255,255,255,0.12);
border-radius: 16px;
padding: 18px;
}
p { color: var(--muted); margin: 0 0 12px 0; line-height: 1.6; }
ul { color: var(--muted); margin: 0 0 12px 20px; }
li { margin: 6px 0; }
label { display: block; margin: 12px 0 8px 0; color: #cbd5e1; font-size: 14px; }
textarea, select, input[type="file"], input[type="text"] {
width: 100%;
padding: 12px 14px;
border-radius: 10px;
border: 1px solid rgba(255,255,255,0.12);
background: rgba(0,0,0,0.25);
color: var(--text);
outline: none;
}
/* Improve dropdown visibility */
select {
background: #0b1220;
color: #f1f5f9;
border-color: rgba(255,255,255,0.2);
}
/* Ensure dropdown options are readable in dark mode (supported browsers) */
select option {
background-color: #0b1220;
color: #f1f5f9;
}
textarea { min-height: 120px; resize: vertical; }
.row { display: grid; grid-template-columns: 1fr 1fr; gap: 12px; }
@media (max-width: 600px) { .row { grid-template-columns: 1fr; } }
.btn {
display: inline-flex;
align-items: center;
gap: 10px;
padding: 12px 16px;
border: 0;
border-radius: 12px;
color: white;
font-weight: 600;
background: linear-gradient(135deg, var(--primary), var(--primary-600));
box-shadow: 0 8px 20px rgba(139, 92, 246, 0.35);
cursor: pointer;
transition: transform .06s ease, filter .2s ease, box-shadow .2s ease;
}
.btn:disabled { filter: grayscale(0.3) brightness(0.8); cursor: not-allowed; }
.btn:not(:disabled):hover { transform: translateY(-1px); filter: brightness(1.05); }
.muted { color: var(--muted); font-size: 13px; }
.divider { height: 1px; background: rgba(255,255,255,0.1); margin: 18px 0; }
.result {
margin-top: 12px;
padding: 12px;
border-radius: 12px;
border: 1px solid rgba(255,255,255,0.12);
background: rgba(0,0,0,0.2);
}
.error { color: #fecaca; background: rgba(239, 68, 68, 0.12); border: 1px solid rgba(239, 68, 68, 0.25); padding: 10px 12px; border-radius: 10px; }
/* Loader overlay */
.overlay {
position: fixed;
inset: 0;
display: none;
align-items: center;
justify-content: center;
background: rgba(2, 6, 23, 0.55);
backdrop-filter: blur(4px);
z-index: 50;
}
.overlay.active { display: flex; }
.spinner {
width: 64px;
height: 64px;
border: 6px solid rgba(255,255,255,0.15);
border-top-color: var(--accent);
border-radius: 50%;
animation: spin 0.9s linear infinite;
box-shadow: 0 0 0 1px rgba(255,255,255,0.08) inset;
}
@keyframes spin { to { transform: rotate(360deg); } }
footer { padding: 0 28px 20px 28px; color: var(--muted); font-size: 12px; text-align: right; }
a { color: #93c5fd; text-decoration: none; }
a:hover { text-decoration: underline; }
/* Modals and progress styling */
.modal-overlay {
position: fixed;
inset: 0;
display: none;
align-items: center;
justify-content: center;
background: rgba(2, 6, 23, 0.6);
backdrop-filter: blur(6px);
z-index: 60;
}
.modal-overlay.active { display: flex; }
.modal {
width: min(560px, 92vw);
background: var(--card-bg);
border: 1px solid var(--card-border);
border-radius: 16px;
box-shadow: 0 10px 30px rgba(0,0,0,0.45), inset 0 1px 0 rgba(255,255,255,0.06);
overflow: hidden;
}
.modal-header {
padding: 16px 18px;
display: flex;
align-items: center;
gap: 10px;
border-bottom: 1px solid rgba(255,255,255,0.08);
}
.modal-title { font-size: 16px; font-weight: 700; }
.modal-body { padding: 16px 18px; color: var(--muted); }
.modal-actions { padding: 14px 18px 18px; display: flex; gap: 10px; justify-content: flex-end; }
.btn.secondary { background: rgba(255,255,255,0.08); box-shadow: none; }
.btn.secondary:hover { filter: brightness(1.1); }
.steps { display: flex; flex-direction: column; gap: 10px; margin-top: 6px; }
.step { display: flex; align-items: center; gap: 12px; padding: 10px 12px; border: 1px solid rgba(255,255,255,0.08); border-radius: 12px; background: rgba(255,255,255,0.03); }
.step .dot { width: 12px; height: 12px; border-radius: 50%; background: rgba(255,255,255,0.2); box-shadow: 0 0 0 2px rgba(255,255,255,0.08) inset; }
.step.active .dot { background: var(--accent); animation: pulse 1s ease-in-out infinite; }
.step.done .dot { background: var(--success); box-shadow: none; }
.step.error { border-color: rgba(239,68,68,0.45); background: rgba(239,68,68,0.1); }
.step .label { color: var(--text); font-weight: 600; }
.step .sub { color: var(--muted); font-size: 13px; }
@keyframes pulse { 0% { transform: scale(1); } 50% { transform: scale(1.25); } 100% { transform: scale(1); } }
.progress-bar { height: 6px; background: rgba(255,255,255,0.08); border-radius: 999px; overflow: hidden; margin-top: 12px; }
.progress-bar > div { height: 100%; width: 20%; background: linear-gradient(90deg, var(--primary), var(--accent)); animation: progressAnim 1.2s linear infinite; }
@keyframes progressAnim { from { transform: translateX(-100%);} to { transform: translateX(400%);} }
.alert { color: #fde68a; background: rgba(245, 158, 11, 0.12); border: 1px solid rgba(245, 158, 11, 0.3); padding: 10px 12px; border-radius: 10px; }
</style>
</head>
<body>
<div class="container">
<div class="card">
<div class="header">
<div class="title">
<span class="badge">XTTS v2</span>
<h1>Voice Cloning Demo</h1>
</div>
<div>
<a class="btn secondary" href="/record">Try your own voice</a>
</div>
</div>
<div class="body">
<section class="panel">
<p><strong>Cross‑lingual voice cloning</strong> powered by the Coqui TTS XTTS v2 model. Provide a few seconds of a reference voice, choose a language, and synthesize any text in that cloned voice.</p>
<div class="divider"></div>
<ul>
<li>Upload a short reference clip (WAV/MP3/M4A/FLAC/OGG/OPUS)</li>
<li>Select target language</li>
<li>Type the text you want the cloned voice to speak</li>
</ul>
<p class="muted">Note: First run may take longer while the model downloads and loads. A loading indicator will be shown.</p>
</section>
<section class="panel">
<form id="cloneForm">
<label for="reference">Reference audio</label>
<input id="reference" name="reference" type="file" accept=".wav,.mp3,.m4a,.flac,.ogg,.opus,.webm" required />
<div class="muted">Use a clean clip with minimal background noise for best results.</div>
<label for="language">Language</label>
<select id="language" name="language" required>
<option value="en" selected>English (en)</option>
<option value="it">Italian (it)</option>
<option value="es">Spanish (es)</option>
<option value="fr">French (fr)</option>
<option value="de">German (de)</option>
<option value="pt">Portuguese (pt)</option>
<option value="hi">Hindi (hi)</option>
<option value="ar">Arabic (ar)</option>
<option value="zh">Chinese (zh)</option>
<option value="ja">Japanese (ja)</option>
<option value="ko">Korean (ko)</option>
</select>
<label for="text">Text to synthesize</label>
<textarea id="text" name="text" placeholder="Type the sentence to synthesize in the cloned voice..." required>Hi! This is a web demo using XTTS v2 to clone a voice and speak this sentence.</textarea>
<div style="margin-top:14px; display:flex; align-items:center; gap:12px;">
<button id="submitBtn" class="btn" type="submit">Clone Voice</button>
<span class="muted">The output will appear below.</span>
</div>
<div id="message" style="margin-top:12px;"></div>
<div id="result" class="result" style="display:none;">
<strong>Result</strong>
<audio id="audioPlayer" style="margin-top:8px; width:100%;" controls></audio>
</div>
</form>
</section>
</div>
<footer>
Powered by <a href="https://github.com/coqui-ai/TTS" target="_blank" rel="noopener">Coqui TTS</a> • XTTS v2
</footer>
</div>
</div>
<div id="confirmOverlay" class="modal-overlay" role="dialog" aria-modal="true" aria-labelledby="confirmTitle">
<div class="modal">
<div class="modal-header">
<div class="modal-title" id="confirmTitle">Before you start</div>
</div>
<div class="modal-body">
<div class="alert">This demo runs the XTTS model locally. The first request may take a little longer while the model loads. Subsequent runs will be faster. Thanks for your patience.</div>
<p style="margin-top:10px;">Your reference audio stays on this machine. The generated audio will appear when processing completes.</p>
</div>
<div class="modal-actions">
<button id="confirmCancel" class="btn secondary" type="button">Cancel</button>
<button id="confirmOk" class="btn" type="button">Proceed</button>
</div>
</div>
</div>
<div id="progressOverlay" class="modal-overlay" role="dialog" aria-modal="true" aria-labelledby="progressTitle">
<div class="modal" style="max-width:680px;">
<div class="modal-header">
<div class="modal-title" id="progressTitle">Cloning in progress</div>
</div>
<div class="modal-body">
<div class="steps" id="steps">
<div class="step" data-step="0"><div class="dot"></div><div><div class="label">Preparing</div><div class="sub">Validating inputs</div></div></div>
<div class="step" data-step="1"><div class="dot"></div><div><div class="label">Uploading reference</div><div class="sub">Sending audio to server</div></div></div>
<div class="step" data-step="2"><div class="dot"></div><div><div class="label">Waiting for server</div><div class="sub">Request queued</div></div></div>
<div class="step" data-step="3"><div class="dot"></div><div><div class="label">Loading model</div><div class="sub">First run can be slow</div></div></div>
<div class="step" data-step="4"><div class="dot"></div><div><div class="label">Generating audio</div><div class="sub">Synthesizing speech</div></div></div>
<div class="step" data-step="5"><div class="dot"></div><div><div class="label">Finalizing</div><div class="sub">Preparing playback</div></div></div>
</div>
<div class="progress-bar"><div></div></div>
<div id="progressError" class="error" style="display:none; margin-top:12px;"></div>
</div>
<div class="modal-actions">
<button id="progressClose" class="btn secondary" type="button" style="display:none;">Close</button>
</div>
</div>
</div>
<script>
const form = document.getElementById('cloneForm');
const submitBtn = document.getElementById('submitBtn');
const message = document.getElementById('message');
const resultBox = document.getElementById('result');
const audioPlayer = document.getElementById('audioPlayer');
const confirmOverlay = document.getElementById('confirmOverlay');
const confirmOk = document.getElementById('confirmOk');
const confirmCancel = document.getElementById('confirmCancel');
const progressOverlay = document.getElementById('progressOverlay');
const progressClose = document.getElementById('progressClose');
const stepsRoot = document.getElementById('steps');
const progressError = document.getElementById('progressError');
// Single polling loop guards
let pollHandle = null;
let pollJobId = null;
let pollController = null;
function stopPolling() {
if (pollHandle) { clearTimeout(pollHandle); pollHandle = null; }
if (pollController) { try { pollController.abort(); } catch (_) {} pollController = null; }
pollJobId = null;
}
function openConfirm(onProceed) {
confirmOverlay.classList.add('active');
const cleanup = () => {
confirmOverlay.classList.remove('active');
confirmOk.onclick = null;
confirmCancel.onclick = null;
};
confirmOk.onclick = () => { cleanup(); onProceed(); };
confirmCancel.onclick = cleanup;
}
function setStepState(index, state) { // state: pending|active|done|error
const el = stepsRoot.querySelector(`.step[data-step="${index}"]`);
if (!el) return;
el.classList.remove('active','done','error');
if (state === 'active') el.classList.add('active');
if (state === 'done') el.classList.add('done');
if (state === 'error') el.classList.add('error');
}
function setStepSub(index, text) {
const el = stepsRoot.querySelector(`.step[data-step="${index}"] .sub`);
if (el && text) el.textContent = text;
}
function resetSteps() {
stepsRoot.querySelectorAll('.step').forEach(s => {
s.classList.remove('active','done','error');
});
progressError.style.display = 'none';
progressClose.style.display = 'none';
}
function openProgress() {
resetSteps();
progressOverlay.classList.add('active');
submitBtn.disabled = true;
}
function closeProgress() {
progressOverlay.classList.remove('active');
submitBtn.disabled = false;
stopPolling();
}
function showError(msg) {
message.innerHTML = `<div class="error">${msg}</div>`;
}
function schedulePoll(jobId) {
// Ensure only one polling loop per job
if (pollJobId !== jobId) return;
pollController = new AbortController();
fetch(`/api/clone_status/${jobId}`, { signal: pollController.signal })
.then(res => res.json().then(json => ({ ok: res.ok, json })))
.then(({ ok, json }) => {
if (!ok || !json.success) throw new Error(json.error || 'Failed to get status');
const steps = json.steps || [];
steps.forEach((st, i) => { setStepState(i, st.status); setStepSub(i, st.sub); });
if (json.status === 'done') {
if (json.audio_url) { audioPlayer.src = json.audio_url; audioPlayer.load(); }
progressClose.style.display = 'inline-flex';
setTimeout(() => {
closeProgress();
resultBox.style.display = 'block';
audioPlayer.play().catch(()=>{});
}, 350);
stopPolling();
} else if (json.status === 'error') {
progressError.style.display = 'block';
progressError.textContent = json.error || 'Unexpected error';
progressClose.style.display = 'inline-flex';
progressClose.onclick = closeProgress;
showError(progressError.textContent);
stopPolling();
} else {
// Schedule next poll after current completes
pollHandle = setTimeout(() => schedulePoll(jobId), 1200);
}
})
.catch(e => {
progressError.style.display = 'block';
progressError.textContent = (e && e.message) ? e.message : 'Unexpected error';
progressClose.style.display = 'inline-flex';
progressClose.onclick = closeProgress;
showError(progressError.textContent);
stopPolling();
});
}
async function runClone(data) {
resultBox.style.display = 'none';
openProgress();
stopPolling(); // cancel any previous
try {
// Kick off job
const startRes = await fetch('/api/clone_start', { method: 'POST', body: data });
const startJson = await startRes.json();
if (!startRes.ok || !startJson.success) {
throw new Error(startJson.error || 'Failed to start job');
}
const jobId = startJson.job_id;
pollJobId = jobId;
schedulePoll(jobId); // start immediate poll cycle
} catch (err) {
progressError.style.display = 'block';
progressError.textContent = (err && err.message) ? err.message : 'Unexpected error';
progressClose.style.display = 'inline-flex';
progressClose.onclick = closeProgress;
showError(progressError.textContent);
stopPolling();
}
}
form.addEventListener('submit', async (e) => {
e.preventDefault();
message.textContent = '';
const data = new FormData(form);
if (!data.get('text') || !data.get('reference')) {
showError('Please provide both text and a reference audio file.');
return;
}
openConfirm(() => runClone(data));
});
</script>
</body>
</html>
'''
RECORD_HTML = r'''
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Record Your Voice • XTTS Demo</title>
<style>
:root {
--bg1: #0f172a;
--bg2: #111827;
--card-bg: rgba(255, 255, 255, 0.08);
--card-border: rgba(255, 255, 255, 0.15);
--text: #e5e7eb;
--muted: #94a3b8;
--primary: #8b5cf6;
--primary-600: #7c3aed;
--accent: #22d3ee;
--success: #10b981;
--danger: #ef4444;
}
* { box-sizing: border-box; }
html, body { height: 100%; }
body {
margin: 0;
font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans, Helvetica Neue, Arial, "Apple Color Emoji", "Segoe UI Emoji";
color: var(--text);
background: radial-gradient(1200px 800px at 10% 0%, #1f2937, transparent 50%),
radial-gradient(1000px 700px at 90% 0%, #0ea5e9, transparent 50%),
linear-gradient(160deg, var(--bg1), var(--bg2));
overflow-y: auto;
}
.container { min-height: 100%; display: flex; align-items: center; justify-content: center; padding: 40px 20px; }
.card { width: 100%; max-width: 980px; background: var(--card-bg); border: 1px solid var(--card-border); border-radius: 20px; backdrop-filter: blur(12px); -webkit-backdrop-filter: blur(12px); box-shadow: 0 10px 30px rgba(0,0,0,0.35), inset 0 1px 0 rgba(255,255,255,0.08); overflow: hidden; }
.header { padding: 28px 28px 0 28px; display: flex; align-items: center; justify-content: space-between; gap: 12px; }
.title { display: flex; align-items: center; gap: 12px; }
.badge { display: inline-block; font-size: 12px; letter-spacing: 0.08em; text-transform: uppercase; color: white; background: linear-gradient(135deg, var(--primary), var(--accent)); padding: 6px 10px; border-radius: 999px; border: 1px solid rgba(255,255,255,0.25); }
h1 { margin: 0; font-size: 24px; font-weight: 700; }
.body { display: grid; grid-template-columns: 1.1fr 0.9fr; gap: 24px; padding: 24px 28px 28px 28px; }
@media (max-width: 900px) { .body { grid-template-columns: 1fr; } }
.panel { background: rgba(255,255,255,0.04); border: 1px solid rgba(255,255,255,0.12); border-radius: 16px; padding: 18px; }
p { color: var(--muted); margin: 0 0 12px 0; line-height: 1.6; }
label { display: block; margin: 12px 0 8px 0; color: #cbd5e1; font-size: 14px; }
select, textarea, input[type="text"] { width: 100%; padding: 12px 14px; border-radius: 10px; border: 1px solid rgba(255,255,255,0.12); background: rgba(0,0,0,0.25); color: var(--text); outline: none; }
select { background: #0b1220; color: #f1f5f9; border-color: rgba(255,255,255,0.2); }
select option { background-color: #0b1220; color: #f1f5f9; }
textarea { min-height: 120px; resize: vertical; }
.btn { display: inline-flex; align-items: center; gap: 10px; padding: 12px 16px; border: 0; border-radius: 12px; color: white; font-weight: 600; background: linear-gradient(135deg, var(--primary), var(--primary-600)); box-shadow: 0 8px 20px rgba(139, 92, 246, 0.35); cursor: pointer; transition: transform .06s ease, filter .2s ease, box-shadow .2s ease; }
.btn.secondary { background: rgba(255,255,255,0.08); box-shadow: none; }
.btn:disabled { filter: grayscale(0.3) brightness(0.8); cursor: not-allowed; }
.muted { color: var(--muted); font-size: 13px; }
.recorder { display:flex; align-items:center; gap:12px; padding:12px; border:1px solid rgba(255,255,255,0.12); border-radius:12px; background: rgba(0,0,0,0.25); }
.dot { width:12px; height:12px; border-radius:50%; background: rgba(239,68,68,0.5); }
.dot.active { background:#ef4444; animation: pulse 1s ease-in-out infinite; }
@keyframes pulse { 0% { transform: scale(1);} 50% { transform: scale(1.25);} 100% { transform: scale(1);} }
.controls { display:flex; gap:10px; flex-wrap:wrap; }
.divider { height: 1px; background: rgba(255,255,255,0.1); margin: 18px 0; }
.result { margin-top: 12px; padding: 12px; border-radius: 12px; border: 1px solid rgba(255,255,255,0.12); background: rgba(0,0,0,0.2); }
.error { color: #fecaca; background: rgba(239, 68, 68, 0.12); border: 1px solid rgba(239, 68, 68, 0.25); padding: 10px 12px; border-radius: 10px; }
.modal-overlay { position: fixed; inset: 0; display: none; align-items: center; justify-content: center; background: rgba(2, 6, 23, 0.6); backdrop-filter: blur(6px); z-index: 60; }
.modal-overlay.active { display: flex; }
.modal { width:min(560px,92vw); background: var(--card-bg); border:1px solid var(--card-border); border-radius:16px; box-shadow: 0 10px 30px rgba(0,0,0,0.45), inset 0 1px 0 rgba(255,255,255,0.06); overflow:hidden; }
.modal-header { padding:16px 18px; display:flex; align-items:center; gap:10px; border-bottom:1px solid rgba(255,255,255,0.08); }
.modal-title { font-size:16px; font-weight:700; }
.modal-body { padding: 16px 18px; color: var(--muted); }
.modal-actions { padding:14px 18px 18px; display:flex; gap:10px; justify-content:flex-end; }
.steps { display:flex; flex-direction:column; gap:10px; margin-top:6px; }
.step { display:flex; align-items:center; gap:12px; padding:10px 12px; border:1px solid rgba(255,255,255,0.08); border-radius:12px; background: rgba(255,255,255,0.03); }
.step .dot { width:12px; height:12px; border-radius:50%; background: rgba(255,255,255,0.2); box-shadow: 0 0 0 2px rgba(255,255,255,0.08) inset; }
.step.active .dot { background: var(--accent); animation: pulse 1s ease-in-out infinite; }
.step.done .dot { background: var(--success); box-shadow:none; }
.progress-bar { height:6px; background: rgba(255,255,255,0.08); border-radius:999px; overflow:hidden; margin-top:12px; }
.progress-bar > div { height:100%; width:20%; background: linear-gradient(90deg, var(--primary), var(--accent)); animation: progressAnim 1.2s linear infinite; }
@keyframes progressAnim { from { transform: translateX(-100%);} to { transform: translateX(400%);} }
.alert { color: #fde68a; background: rgba(245, 158, 11, 0.12); border: 1px solid rgba(245, 158, 11, 0.3); padding: 10px 12px; border-radius: 10px; }
</style>
</head>
<body>
<div class="container">
<div class="card">
<div class="header">
<div class="title">
<span class="badge">XTTS v2</span>
<h1>Record Your Voice</h1>
</div>
<div>
<a class="btn secondary" href="/">Back to Upload</a>
</div>
</div>
<div class="body">
<section class="panel">
<p><strong>Try your own voice</strong> by recording a short, clear clip. Then choose a language and synthesize any text in your cloned voice.</p>
<div class="divider"></div>
<div class="recorder">
<div id="recDot" class="dot"></div>
<div style="flex:1;">
<div style="display:flex; align-items:center; gap:10px;">
<div id="recLabel" style="font-weight:600;">Idle</div>
<div id="recTimer" class="muted">00:00</div>
</div>
<div class="muted" style="margin-top:6px;">Use a quiet environment and speak naturally for 5–10 seconds.</div>
</div>
</div>
<div class="controls" style="margin-top:12px;">
<button id="btnStart" class="btn" type="button">Start recording</button>
<button id="btnStop" class="btn secondary" type="button" disabled>Stop</button>
<button id="btnRetake" class="btn secondary" type="button" disabled>Retake</button>
</div>
<audio id="preview" style="margin-top:10px; width:100%; display:none;" controls></audio>
</section>
<section class="panel">
<form id="recordForm">
<label for="language">Language</label>
<select id="language" name="language" required>
<option value="en" selected>English (en)</option>
<option value="it">Italian (it)</option>
<option value="es">Spanish (es)</option>
<option value="fr">French (fr)</option>
<option value="de">German (de)</option>
<option value="pt">Portuguese (pt)</option>
<option value="hi">Hindi (hi)</option>
<option value="ar">Arabic (ar)</option>
<option value="zh">Chinese (zh)</option>
<option value="ja">Japanese (ja)</option>
<option value="ko">Korean (ko)</option>
</select>
<label for="text">Text to synthesize</label>
<textarea id="text" name="text" placeholder="Type the sentence to synthesize in your cloned voice..." required>Hi! This is my own voice recorded and used to clone for this sentence.</textarea>
<div style="margin-top:14px; display:flex; align-items:center; gap:12px;">
<button id="submitBtn" class="btn" type="submit">Clone Voice</button>
<span class="muted">Recording is required before cloning.</span>
</div>
<div id="message" style="margin-top:12px;"></div>
<div id="result" class="result" style="display:none;">
<strong>Result</strong>
<audio id="audioPlayer" style="margin-top:8px; width:100%;" controls></audio>
</div>
</form>
</section>
</div>
<footer style="padding: 0 28px 20px 28px; color: var(--muted); font-size: 12px; text-align: right;">
Powered by <a href="https://github.com/coqui-ai/TTS" target="_blank" rel="noopener" style="color:#93c5fd;">Coqui TTS</a> • XTTS v2
</footer>
</div>
</div>
<!-- Confirm and Progress Modals -->
<div id="confirmOverlay" class="modal-overlay" role="dialog" aria-modal="true" aria-labelledby="confirmTitle">
<div class="modal">
<div class="modal-header">
<div class="modal-title" id="confirmTitle">Before you start</div>
</div>
<div class="modal-body">
<div class="alert">This demo runs the XTTS model locally. The first request may take a little longer while the model loads. Subsequent runs will be faster. Thanks for your patience.</div>
<p style="margin-top:10px;">Your voice recording stays on this machine. The generated audio will appear when processing completes.</p>
</div>
<div class="modal-actions">
<button id="confirmCancel" class="btn secondary" type="button">Cancel</button>
<button id="confirmOk" class="btn" type="button">Proceed</button>
</div>
</div>
</div>
<div id="progressOverlay" class="modal-overlay" role="dialog" aria-modal="true" aria-labelledby="progressTitle">
<div class="modal" style="max-width:680px;">
<div class="modal-header">
<div class="modal-title" id="progressTitle">Cloning in progress</div>
</div>
<div class="modal-body">
<div class="steps" id="steps">
<div class="step" data-step="0"><div class="dot"></div><div><div class="label">Preparing</div><div class="sub">Validating inputs</div></div></div>
<div class="step" data-step="1"><div class="dot"></div><div><div class="label">Uploading reference</div><div class="sub">Sending audio to server</div></div></div>
<div class="step" data-step="2"><div class="dot"></div><div><div class="label">Waiting for server</div><div class="sub">Request queued</div></div></div>
<div class="step" data-step="3"><div class="dot"></div><div><div class="label">Loading model</div><div class="sub">First run can be slow</div></div></div>
<div class="step" data-step="4"><div class="dot"></div><div><div class="label">Generating audio</div><div class="sub">Synthesizing speech</div></div></div>
<div class="step" data-step="5"><div class="dot"></div><div><div class="label">Finalizing</div><div class="sub">Preparing playback</div></div></div>
</div>
<div class="progress-bar"><div></div></div>
<div id="progressError" class="error" style="display:none; margin-top:12px;"></div>
</div>
<div class="modal-actions">
<button id="progressClose" class="btn secondary" type="button" style="display:none;">Close</button>
</div>
</div>
</div>
<script>
const recDot = document.getElementById('recDot');
const recLabel = document.getElementById('recLabel');
const recTimer = document.getElementById('recTimer');
const btnStart = document.getElementById('btnStart');
const btnStop = document.getElementById('btnStop');
const btnRetake = document.getElementById('btnRetake');
const preview = document.getElementById('preview');
let mediaStream = null;
let mediaRecorder = null;
let chunks = [];
let recordedBlob = null;
let t0 = 0; let timerHandle = null;
function fmt(t){ const m = Math.floor(t/60).toString().padStart(2,'0'); const s = Math.floor(t%60).toString().padStart(2,'0'); return `${m}:${s}`; }
function setTimer(on){
if (on){
t0 = Date.now();
recTimer.textContent = '00:00';
timerHandle = setInterval(()=>{
const dt=(Date.now()-t0)/1000;
recTimer.textContent = fmt(dt);
}, 250);
} else {
if (timerHandle){ clearInterval(timerHandle); timerHandle=null; }
}
}
async function startRecording(){
try {
const candidates = ['audio/ogg;codecs=opus','audio/webm;codecs=opus','audio/mp4;codecs=mp4a.40.2','audio/ogg','audio/webm'];
const mime = (window.MediaRecorder && typeof MediaRecorder.isTypeSupported === 'function') ? candidates.find(t => MediaRecorder.isTypeSupported(t)) : '';
mediaStream = await navigator.mediaDevices.getUserMedia({ audio: { echoCancellation: true, noiseSuppression: true } });
mediaRecorder = mime ? new MediaRecorder(mediaStream, { mimeType: mime }) : new MediaRecorder(mediaStream);
chunks = []; recordedBlob = null;
mediaRecorder.ondataavailable = e => { if (e.data && e.data.size > 0) chunks.push(e.data); };
mediaRecorder.onstop = () => {
recordedBlob = new Blob(chunks, { type: mediaRecorder.mimeType });
preview.src = URL.createObjectURL(recordedBlob);
preview.style.display = 'block';
recLabel.textContent = 'Recorded';
recDot.classList.remove('active');
setTimer(false);
btnRetake.disabled = false;
};
mediaRecorder.start();
recLabel.textContent = 'Recording...';
recDot.classList.add('active');
setTimer(true);
btnStart.disabled = true;
btnStop.disabled = false;
btnRetake.disabled = true;
} catch (e){
alert('Microphone access is required to record. ' + (e && e.message ? e.message : ''));
}
}
function stopRecording(){
if (mediaRecorder && mediaRecorder.state === 'recording'){
mediaRecorder.stop();
}
if (mediaStream){ mediaStream.getTracks().forEach(t => t.stop()); mediaStream = null; }
btnStart.disabled = false; btnStop.disabled = true;
}
function retake(){
recordedBlob = null; chunks = []; preview.src = ''; preview.style.display = 'none';
recLabel.textContent = 'Idle'; recTimer.textContent = '00:00'; recDot.classList.remove('active');
btnRetake.disabled = true;
}
btnStart.onclick = startRecording;
btnStop.onclick = stopRecording;
btnRetake.onclick = retake;
// Confirmation and progress logic (same as upload page)
const form = document.getElementById('recordForm');
const submitBtn = document.getElementById('submitBtn');
const message = document.getElementById('message');
const resultBox = document.getElementById('result');
const audioPlayer = document.getElementById('audioPlayer');
const confirmOverlay = document.getElementById('confirmOverlay');
const confirmOk = document.getElementById('confirmOk');
const confirmCancel = document.getElementById('confirmCancel');
const progressOverlay = document.getElementById('progressOverlay');
const progressClose = document.getElementById('progressClose');
const stepsRoot = document.getElementById('steps');
const progressError = document.getElementById('progressError');
let pollHandle = null; let pollJobId = null; let pollController = null;
function stopPolling(){ if (pollHandle){ clearTimeout(pollHandle); pollHandle=null; } if (pollController){ try{pollController.abort();}catch(_){} pollController=null; } pollJobId=null; }
function openConfirm(onProceed){
confirmOverlay.classList.add('active');
const cleanup=()=>{ confirmOverlay.classList.remove('active'); confirmOk.onclick=null; confirmCancel.onclick=null; };
confirmOk.onclick=()=>{ cleanup(); onProceed(); };
confirmCancel.onclick=cleanup;
}
function setStepState(index, state){ const el=stepsRoot.querySelector(`.step[data-step="${index}"]`); if(!el) return; el.classList.remove('active','done','error'); if(state==='active') el.classList.add('active'); if(state==='done') el.classList.add('done'); if(state==='error') el.classList.add('error'); }
function setStepSub(index, text){ const el=stepsRoot.querySelector(`.step[data-step="${index}"] .sub`); if(el && text) el.textContent=text; }
function resetSteps(){ stepsRoot.querySelectorAll('.step').forEach(s=>s.classList.remove('active','done','error')); progressError.style.display='none'; progressClose.style.display='none'; }
function openProgress(){ resetSteps(); progressOverlay.classList.add('active'); submitBtn.disabled=true; }
function closeProgress(){ progressOverlay.classList.remove('active'); submitBtn.disabled=false; stopPolling(); }
function showError(msg){ message.innerHTML = `<div class="error">${msg}</div>`; }
function schedulePoll(jobId){
if (pollJobId !== jobId) return;
pollController = new AbortController();
fetch(`/api/clone_status/${jobId}`, { signal: pollController.signal })
.then(res => res.json().then(json => ({ ok: res.ok, json })))
.then(({ ok, json }) => {
if (!ok || !json.success) throw new Error(json.error || 'Failed to get status');
const steps = json.steps || [];
steps.forEach((st,i)=>{ setStepState(i, st.status); setStepSub(i, st.sub); });
if (json.status === 'done'){
if (json.audio_url){ audioPlayer.src = json.audio_url; audioPlayer.load(); }
progressClose.style.display = 'inline-flex';
setTimeout(()=>{ closeProgress(); resultBox.style.display='block'; audioPlayer.play().catch(()=>{}); }, 350);
stopPolling();
} else if (json.status === 'error'){
progressError.style.display='block'; progressError.textContent = json.error || 'Unexpected error'; progressClose.style.display='inline-flex'; progressClose.onclick = closeProgress; showError(progressError.textContent); stopPolling();
} else {
pollHandle = setTimeout(()=>schedulePoll(jobId), 1200);
}
})
.catch(e=>{ progressError.style.display='block'; progressError.textContent = (e&&e.message)?e.message:'Unexpected error'; progressClose.style.display='inline-flex'; progressClose.onclick=closeProgress; showError(progressError.textContent); stopPolling(); });
}
async function runClone(){
resultBox.style.display='none';
if (!recordedBlob){ showError('Please record your voice before cloning.'); return; }
openProgress(); stopPolling();
try {
const fd = new FormData();
fd.append('language', document.getElementById('language').value);
fd.append('text', document.getElementById('text').value);
const type = (recordedBlob && recordedBlob.type) || '';
const ext = type.includes('ogg') ? 'ogg' : (type.includes('webm') ? 'webm' : (type.includes('mp4') ? 'm4a' : 'webm'));
fd.append('reference', recordedBlob, `recording.${ext}`);
const startRes = await fetch('/api/clone_start', { method:'POST', body: fd });
const startJson = await startRes.json();
if (!startRes.ok || !startJson.success){ throw new Error(startJson.error || 'Failed to start job'); }
const jobId = startJson.job_id; pollJobId = jobId; schedulePoll(jobId);
} catch (err){ progressError.style.display='block'; progressError.textContent=(err&&err.message)?err.message:'Unexpected error'; progressClose.style.display='inline-flex'; progressClose.onclick=closeProgress; showError(progressError.textContent); stopPolling(); }
}
form.addEventListener('submit', (e)=>{ e.preventDefault(); message.textContent=''; openConfirm(runClone); });
</script>
</body>
</html>
'''
@app.route("/record")
def record():
return render_template_string(RECORD_HTML)
@app.route("/")
def index():
return render_template_string(INDEX_HTML)
@app.route("/outputs/<path:filename>")
def serve_output(filename: str):
return send_from_directory(OUTPUT_DIR, filename, as_attachment=False)
# ---------------- Progress tracking and async job execution ---------------- #
JOBS = {}
JOBS_LOCK = threading.Lock()
STEPS_TEMPLATE = [
{"label": "Preparing", "sub": "Validating inputs", "status": "pending"},
{"label": "Uploading reference", "sub": "Saving audio", "status": "pending"},
{"label": "Waiting for server", "sub": "Queued", "status": "pending"},
{"label": "Loading model", "sub": "First run may be slow", "status": "pending"},
{"label": "Generating audio", "sub": "Synthesizing speech", "status": "pending"},
{"label": "Finalizing", "sub": "Preparing playback", "status": "pending"},
]
def _new_job() -> dict:
return {
"status": "pending",
"steps": [dict(label=s["label"], sub=s["sub"], status="pending") for s in STEPS_TEMPLATE],
"error": None,
"audio_url": None,
"created": time.time(),
}
# Cleanup policy for job registry
JOB_TTL_SECONDS = 3600 # 1 hour
MAX_JOBS = 500
def _cleanup_jobs() -> None:
now = time.time()
with JOBS_LOCK:
# Remove jobs older than TTL
to_delete = [jid for jid, job in JOBS.items() if now - job.get("created", now) > JOB_TTL_SECONDS]
# If too many jobs, remove oldest finished (done/error)
if len(JOBS) > MAX_JOBS:
finished = [jid for jid, job in JOBS.items() if job.get("status") in ("done", "error")]
finished.sort(key=lambda j: JOBS[j].get("created", 0))
overflow = max(0, len(JOBS) - MAX_JOBS)
to_delete.extend(finished[:overflow])
for jid in set(to_delete):
JOBS.pop(jid, None)
def _set_step(job_id: str, idx: int, status: str, sub: str | None = None) -> None:
with JOBS_LOCK:
job = JOBS.get(job_id)
if not job:
return
st = job["steps"][idx]
st["status"] = status
if sub is not None:
st["sub"] = sub
def _set_job_status(job_id: str, status: str) -> None:
with JOBS_LOCK:
job = JOBS.get(job_id)
if job:
job["status"] = status
def _set_job_error(job_id: str, msg: str) -> None:
with JOBS_LOCK:
job = JOBS.get(job_id)
if job:
job["status"] = "error"
job["error"] = msg
def _set_job_audio(job_id: str, audio_url: str) -> None:
with JOBS_LOCK:
job = JOBS.get(job_id)
if job:
job["audio_url"] = audio_url
def _run_job(job_id: str, *, text: str, language: str, device: str | None, input_path: str, output_name: str, output_path: str) -> None:
current_step = -1
try:
_set_job_status(job_id, "running")
# Step 0: Preparing
current_step = 0
_set_step(job_id, 0, "active")
_set_step(job_id, 0, "done")
# Step 1: Uploading reference (already saved by start endpoint)
current_step = 1
_set_step(job_id, 1, "active")
_set_step(job_id, 1, "done")
# Step 2: Waiting for server (queue)
current_step = 2
_set_step(job_id, 2, "active")
_set_step(job_id, 2, "done")
# Step 3: Loading model
current_step = 3
if not is_model_loaded(device):
_set_step(job_id, 3, "active")
warm_model(device)
_set_step(job_id, 3, "done")
else:
_set_step(job_id, 3, "done", sub="Model already in memory")
# Step 4: Generating audio
current_step = 4
_set_step(job_id, 4, "active", sub="Synthesizing speech")
ref_path = input_path
if _should_convert_to_wav(input_path):
if _ffmpeg_path():
_set_step(job_id, 4, "active", sub="Converting reference audio")
ref_path = _convert_to_wav(input_path)
_set_step(job_id, 4, "active", sub="Synthesizing speech")
else:
raise RuntimeError("Reference format not supported by backend. Please install ffmpeg or upload WAV/OGG/OPUS/MP3/M4A.")
do_clone(text=text, speaker_wav=ref_path, language=language, output=output_path, device=device)
_set_step(job_id, 4, "done")
# Step 5: Finalizing
current_step = 5
_set_step(job_id, 5, "active")
# Avoid url_for in background thread (no app context). Use relative path.
audio_url = f"/outputs/{output_name}"
_set_job_audio(job_id, audio_url)
_set_step(job_id, 5, "done")
_set_job_status(job_id, "done")
except Exception as e:
failed_step = current_step if current_step >= 0 else 0
_set_step(job_id, failed_step, "error")
_set_job_error(job_id, str(e))
@app.route("/api/clone_start", methods=["POST"])
def api_clone_start():
_cleanup_jobs()
text = (request.form.get("text") or "").strip()
language = (request.form.get("language") or "en").strip()
device = (request.form.get("device") or None)
file = request.files.get("reference")
if not text:
return jsonify({"success": False, "error": "Text is required."}), 400
if not file or file.filename == "":
return jsonify({"success": False, "error": "Reference audio file is required."}), 400
if not allowed_file(file.filename):
return jsonify({"success": False, "error": "Unsupported file type. Use wav, mp3, m4a, flac, ogg, or opus."}), 400
filename = secure_filename(file.filename)
ts = int(time.time() * 1000)
input_path = os.path.join(UPLOAD_DIR, f"{ts}_{filename}")
output_name = f"clone_{ts}.wav"
output_path = os.path.join(OUTPUT_DIR, output_name)
# Save upload before returning job id
file.save(input_path)
job_id = uuid.uuid4().hex
with JOBS_LOCK:
JOBS[job_id] = _new_job()
threading.Thread(
target=_run_job,
kwargs={
"job_id": job_id,
"text": text,
"language": language,
"device": device,
"input_path": input_path,
"output_name": output_name,
"output_path": output_path,
},
daemon=True,
).start()
return jsonify({"success": True, "job_id": job_id})
@app.route("/api/clone_status/<job_id>", methods=["GET"])
def api_clone_status(job_id: str):
_cleanup_jobs()
with JOBS_LOCK:
job = JOBS.get(job_id)
if not job:
return jsonify({"success": False, "error": "Invalid job id"}), 404
return jsonify({"success": True, "status": job["status"], "steps": job["steps"], "error": job["error"], "audio_url": job["audio_url"]})
@app.route("/api/clone", methods=["POST"])
def api_clone():
text = (request.form.get("text") or "").strip()
language = (request.form.get("language") or "en").strip()
device = (request.form.get("device") or None)
file = request.files.get("reference")
if not text:
return jsonify({"success": False, "error": "Text is required."}), 400
if not file or file.filename == "":
return jsonify({"success": False, "error": "Reference audio file is required."}), 400
if not allowed_file(file.filename):
return jsonify({"success": False, "error": "Unsupported file type. Use wav, mp3, m4a, flac, ogg, or opus."}), 400
filename = secure_filename(file.filename)
ts = int(time.time() * 1000)
input_path = os.path.join(UPLOAD_DIR, f"{ts}_{filename}")
output_name = f"clone_{ts}.wav"
output_path = os.path.join(OUTPUT_DIR, output_name)
file.save(input_path)
# Convert to WAV if necessary (for formats like WEBM/M4A)
ref_path = input_path
if _should_convert_to_wav(input_path):
if _ffmpeg_path():
try:
ref_path = _convert_to_wav(input_path)
except Exception as e:
return jsonify({"success": False, "error": str(e)}), 400
else:
return jsonify({"success": False, "error": "Reference format not supported by backend. Install ffmpeg or upload WAV/OGG/OPUS/MP3/M4A."}), 400
try:
# Perform cloning
do_clone(text=text, speaker_wav=ref_path, language=language, output=output_path, device=device)
except Exception as e:
return jsonify({"success": False, "error": str(e)}), 500
audio_url = url_for("serve_output", filename=output_name)
return jsonify({"success": True, "audio_url": audio_url})
if __name__ == "__main__":
# For local development
app.run(host="127.0.0.1", port=5000, debug=True, use_reloader=False)