#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
# Set UTF-8 encoding for Windows
if sys.platform == 'win32':
import codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())
sys.stderr = codecs.getwriter('utf-8')(sys.stderr.detach())
# Load environment variables from .env file (optimized for HF Spaces)
try:
# Only load .env in local development, skip in production
if not os.environ.get("SPACE_ID") and not os.environ.get("HF_SPACE_ID"):
from dotenv import load_dotenv
load_dotenv()
print("✅ Environment variables loaded from .env file")
else:
print("🏭 Production environment - using system environment variables")
except ImportError:
print("⚠️ python-dotenv not installed. Using system environment variables only.")
except Exception as e:
print(f"⚠️ Error loading .env file: {e}")
# Essential imports for HF Spaces
import numpy as np
import gradio as gr
# Try to import google-generativeai with fallback
try:
import google.generativeai as genai
GENAI_AVAILABLE = True
except ImportError as e:
print(f"⚠️ google-generativeai not available: {e}")
GENAI_AVAILABLE = False
genai = None
try:
from gtts import gTTS, lang
GTTS_AVAILABLE = True
except ImportError as e:
print(f"⚠️ gtts not available: {e}")
GTTS_AVAILABLE = False
import tempfile
# import soundfile as sf # Import locally to avoid startup overhead
# Kokoro not used - removed for performance
import time
import base64
# Try to import optional dependencies
try:
import edge_tts
EDGE_TTS_AVAILABLE = True
except ImportError as e:
print(f"⚠️ edge-tts not available: {e}")
EDGE_TTS_AVAILABLE = False
import asyncio
import io
try:
import PyPDF2
PDF_AVAILABLE = True
except ImportError:
PDF_AVAILABLE = False
try:
import docx
DOCX_AVAILABLE = True
except ImportError:
DOCX_AVAILABLE = False
import shutil
import atexit
import glob
import datetime
# Librosa not used - removed for performance
# === RECORD DATA MANAGEMENT ===
RECORD_DATA_DIR = "record_data"
def create_record_data_directory():
"""Create record_data directory if it doesn't exist"""
if not os.path.exists(RECORD_DATA_DIR):
os.makedirs(RECORD_DATA_DIR)
print(f"✅ Created directory: {RECORD_DATA_DIR}")
return RECORD_DATA_DIR
def cleanup_record_data():
"""Clean up record_data directory when app closes (disabled for production)"""
try:
# Disable cleanup for HF Spaces and production environments
if os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID"):
print(f"🏭 Production environment detected - keeping {RECORD_DATA_DIR} directory")
return
# Only cleanup in local development
if os.path.exists(RECORD_DATA_DIR):
shutil.rmtree(RECORD_DATA_DIR)
print(f"🧹 Cleaned up {RECORD_DATA_DIR} directory")
except Exception as e:
print(f"⚠️ Error cleaning up {RECORD_DATA_DIR}: {e}")
def save_recorded_audio(audio_data, original_filename=None):
"""Save audio data to record_data directory"""
try:
# Create directory if needed
create_record_data_directory()
# Generate filename with timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
if original_filename:
name_part = os.path.splitext(os.path.basename(original_filename))[0]
filename = f"recorded_{name_part}_{timestamp}.wav"
else:
filename = f"recorded_{timestamp}.wav"
filepath = os.path.join(RECORD_DATA_DIR, filename)
# Handle different audio data types
if isinstance(audio_data, str) and os.path.exists(audio_data):
# File path - copy the file
shutil.copy2(audio_data, filepath)
elif isinstance(audio_data, tuple) and len(audio_data) == 2:
# Numpy array format (sample_rate, audio_array)
sample_rate, audio_array = audio_data
import soundfile as sf
sf.write(filepath, audio_array, sample_rate)
print(f"📊 Saved numpy audio: sr={sample_rate}, shape={audio_array.shape}")
else:
# Raw data
with open(filepath, 'wb') as f:
f.write(audio_data)
print(f"✅ Saved recorded audio: {filepath}")
return filepath
except Exception as e:
print(f"❌ Error saving recorded audio: {e}")
import traceback
traceback.print_exc()
return None
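# Illustrative sketch (not executed): with the Gradio Audio component configured as
# type="numpy" (see audio_input below), this function receives a
# (sample_rate, numpy_array) tuple and writes it out via soundfile, e.g.:
#
#   import numpy as np
#   sr = 16000
#   tone = 0.1 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)  # 1 s of 440 Hz
#   save_recorded_audio((sr, tone))  # -> record_data/recorded_<timestamp>.wav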
def get_recorded_files():
"""Get list of recorded audio files"""
try:
if not os.path.exists(RECORD_DATA_DIR):
print(f"📁 Record directory does not exist: {RECORD_DATA_DIR}")
return []
# Get all audio files in record_data
pattern = os.path.join(RECORD_DATA_DIR, "*.wav")
files = glob.glob(pattern)
print(f"🔍 Found {len(files)} files in {RECORD_DATA_DIR}")
# Sort by modification time (newest first)
files.sort(key=os.path.getmtime, reverse=True)
# Return just filenames for display
filenames = [os.path.basename(f) for f in files]
print(f"📂 Returning filenames: {filenames}")
return filenames
except Exception as e:
print(f"❌ Error getting recorded files: {e}")
return []
def get_recorded_file_path(filename):
"""Get full path of recorded file"""
return os.path.join(RECORD_DATA_DIR, filename)
def delete_recorded_file(filename):
"""Delete recorded file from record_data directory"""
try:
if not filename or not filename.strip():
return "❌ Không có file nào được chọn để xóa"
file_path = get_recorded_file_path(filename)
print(f"🗑️ Attempting to delete: {file_path}")
if os.path.exists(file_path):
os.remove(file_path)
print(f"✅ Successfully deleted: {filename}")
return f"✅ Đã xóa file: {filename}"
else:
print(f"❌ File not found: {file_path}")
return f"❌ Không tìm thấy file: {filename}"
except Exception as e:
print(f"❌ Error deleting file: {e}")
return f"❌ Lỗi khi xóa file: {str(e)}"
# Register cleanup function to run when app exits (disabled for stability)
# atexit.register(cleanup_record_data) # Disabled to prevent data loss on deployment
# DOCX support already checked above
# Configure Gemini API - Delayed configuration for faster startup
GEMINI_API_KEY = None
def configure_gemini_api():
"""Configure Gemini API on first use to speed up startup"""
global GEMINI_API_KEY
if not GENAI_AVAILABLE:
print("❌ google-generativeai not available")
return None
if GEMINI_API_KEY is None:
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
if GEMINI_API_KEY:
genai.configure(api_key=GEMINI_API_KEY)
print("✅ Gemini API configured successfully")
else:
print("⚠️ GEMINI_API_KEY or GOOGLE_API_KEY not found in environment variables")
return GEMINI_API_KEY
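# Illustrative .env sketch for local development (these are the variable names the
# lookup above expects; the values below are placeholders):
#
#   GEMINI_API_KEY=your-api-key-here
#   # or, equivalently:
#   GOOGLE_API_KEY=your-api-key-here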
# Language configurations for Audio Translation (simplified)
if GTTS_AVAILABLE:
GTTS_LANGUAGES = lang.tts_langs()
GTTS_LANGUAGES['ja'] = 'Japanese'
else:
GTTS_LANGUAGES = {'en': 'English', 'vi': 'Vietnamese'}
SUPPORTED_LANGUAGES = sorted(list(GTTS_LANGUAGES.values()))
# Voice mapping for Edge TTS - defined once for performance
VOICE_MAP = {
"🇻🇳 HoaiMy - Nữ Việt Chuẩn": "vi-VN-HoaiMyNeural",
"🇻🇳 NamMinh - Nam Việt Chuẩn": "vi-VN-NamMinhNeural",
"🇺🇸 Aria - Nữ Mỹ": "en-US-AriaNeural",
"🇺🇸 Guy - Nam Mỹ": "en-US-GuyNeural",
"🇬🇧 Sonia - Nữ Anh": "en-GB-SoniaNeural",
"🇬🇧 Ryan - Nam Anh": "en-GB-RyanNeural",
"🇩🇪 Katja - Deutsche Frau": "de-DE-KatjaNeural",
"🇩🇪 Conrad - Deutscher Mann": "de-DE-ConradNeural",
"🇫🇷 Denise - Française": "fr-FR-DeniseNeural",
"🇫🇷 Henri - Français": "fr-FR-HenriNeural",
"🇪🇸 Elvira - Española": "es-ES-ElviraNeural",
"🇪🇸 Alvaro - Español": "es-ES-AlvaroNeural",
"🇮🇹 Elsa - Italiana": "it-IT-ElsaNeural",
"🇮🇹 Diego - Italiano": "it-IT-DiegoNeural",
"🇯🇵 Nanami - 日本女性": "ja-JP-NanamiNeural",
"🇯🇵 Keita - 日本男性": "ja-JP-KeitaNeural",
"🇰🇷 SunHi - 한국 여성": "ko-KR-SunHiNeural",
"🇰🇷 BongJin - 한국 남성": "ko-KR-BongJinNeural",
"🇨🇳 Xiaoxiao - 中文女声": "zh-CN-XiaoxiaoNeural",
"🇨🇳 Yunxi - 中文男声": "zh-CN-YunxiNeural",
"🇷🇺 Svetlana - Русская": "ru-RU-SvetlanaNeural",
"🇷🇺 Dmitry - Русский": "ru-RU-DmitryNeural",
"🇵🇹 Francisca - Portuguesa": "pt-BR-FranciscaNeural",
"🇵🇹 Antonio - Português": "pt-BR-AntonioNeural",
"🇸🇦 Zariyah - عربية": "ar-SA-ZariyahNeural",
"🇸🇦 Hamed - عربي": "ar-SA-HamedNeural"
}
# Voice RAG Functions (integrated from hf_Voice_Audio_Translation)
def read_pdf(file_path):
"""Extract text from PDF file"""
try:
if not PDF_AVAILABLE:
return "❌ PyPDF2 not available"
with open(file_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
text = ""
for page in reader.pages:
text += page.extract_text()
return text
except Exception as e:
return f"Error reading PDF: {str(e)}"
def read_docx(file_path):
"""Extract text from Word document"""
try:
if not DOCX_AVAILABLE:
return "❌ python-docx not available"
doc = docx.Document(file_path)
text = ""
for paragraph in doc.paragraphs:
text += paragraph.text + "\n"
return text
except Exception as e:
return f"Error reading DOCX: {str(e)}"
def read_txt(file_path):
"""Extract text from TXT file"""
try:
with open(file_path, 'r', encoding='utf-8') as file:
return file.read()
except Exception as e:
return f"Error reading TXT: {str(e)}"
def extract_text_from_file(file_path):
"""Extract text from various file formats"""
if file_path is None:
return "No file uploaded"
file_extension = os.path.splitext(file_path)[1].lower()
if file_extension == '.pdf':
return read_pdf(file_path)
elif file_extension == '.docx':
return read_docx(file_path)
elif file_extension == '.txt':
return read_txt(file_path)
else:
return f"Unsupported file format: {file_extension}"
def detect_language_from_text(text):
"""Detect language from text content"""
# Vietnamese detection
vietnamese_chars = 'àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ'
if any(char in text.lower() for char in vietnamese_chars):
return "Vietnamese"
# Chinese detection
chinese_chars = '中文汉字學習语言'
if any(char in text for char in chinese_chars):
return "Chinese"
# Japanese detection
japanese_chars = 'ひらがなカタカナ日本語'
if any(char in text for char in japanese_chars):
return "Japanese"
# Korean detection
korean_chars = '한국어문자'
if any(char in text for char in korean_chars):
return "Korean"
# French detection
french_words = ['le', 'la', 'les', 'de', 'et', 'à', 'un', 'une', 'ce', 'qui', 'que']
french_chars = 'àâäéèêëïîôöùûüÿç'
if any(word in text.lower() for word in french_words) or any(char in text.lower() for char in french_chars):
return "French"
# German detection
german_words = ['der', 'die', 'das', 'und', 'ist', 'ich', 'bin', 'haben', 'sein', 'werden']
german_chars = 'äöüß'
if any(word in text.lower() for word in german_words) or any(char in text.lower() for char in german_chars):
return "German"
# Spanish detection
spanish_words = ['el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'es', 'se', 'no', 'te', 'lo']
spanish_chars = 'ñáéíóúü'
if any(word in text.lower() for word in spanish_words) or any(char in text.lower() for char in spanish_chars):
return "Spanish"
# English detection (default)
english_words = ['the', 'and', 'is', 'are', 'have', 'has', 'will', 'would', 'can', 'could']
if any(word in text.lower() for word in english_words):
return "English"
return "English" # Default fallback
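# Illustrative examples (not executed) of the character/keyword heuristic above.
# Keyword checks use plain substring matching, so short keywords such as the
# Spanish 'a'/'y' can match inside longer English words:
#
#   detect_language_from_text("Xin chào các bạn")  # -> "Vietnamese" (diacritics)
#   detect_language_from_text("das ist ein Haus")  # -> "German" ('das' keyword)
#   detect_language_from_text("ready to start")    # -> "Spanish" ('a'/'y' substring match)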
def process_with_gemini(text, question, answer_language="Vietnamese"):
"""Process text and question using Gemini with multi-language support"""
try:
api_key = configure_gemini_api()
if not api_key:
return "❌ Lỗi: Chưa cấu hình GEMINI_API_KEY hoặc GOOGLE_API_KEY trong environment variables"
model = genai.GenerativeModel("gemini-2.0-flash")
# Detect document language
detected_doc_language = detect_language_from_text(text)
prompt = f"""
Based on the following document content, please answer the question in {answer_language}:
Document Content (detected language: {detected_doc_language}):
{text}
Question: {question}
Please provide a comprehensive and accurate answer in {answer_language}.
If the document is in a different language than the question, please still answer in {answer_language}.
Maintain the factual accuracy while adapting cultural context appropriately.
"""
response = model.generate_content(prompt)
return response.text
except Exception as e:
return f"Error processing with Gemini: {str(e)}"
def text_to_speech_rag(text, voice_selection):
"""Convert text to speech using Edge TTS for RAG results"""
try:
if not text or text.startswith("Error"):
return None
# Use global VOICE_MAP for performance
voice_name = VOICE_MAP.get(voice_selection, "vi-VN-HoaiMyNeural")
text_limited = text[:2000] if len(text) > 2000 else text
# Generate speech using Edge TTS
audio_data = asyncio.run(generate_speech(text_limited, voice_name, 0.0))
# Save to temporary file
fd, temp_output_path = tempfile.mkstemp(suffix=".wav", prefix="voice_rag_audio_")
os.close(fd)
# Write raw audio data to temporary file
with open(temp_output_path, 'wb') as f:
f.write(audio_data)
return temp_output_path
except Exception as e:
print(f"TTS Error: {str(e)}")
return None
def voice_rag_pipeline(uploaded_file, question, answer_language="Vietnamese", voice_selection="🇻🇳 HoaiMy - Nữ Việt Chuẩn", text_format="txt"):
"""Complete Voice RAG pipeline with multi-language support and downloadable text"""
if uploaded_file is None:
return "Please upload a document first.", "N/A", None, None
if not question.strip():
return "Please enter a question.", "N/A", None, None
# Extract text from uploaded file
extracted_text = extract_text_from_file(uploaded_file)
if extracted_text.startswith("Error"):
return extracted_text, "Error", None, None
# Detect document language
detected_doc_language = detect_language_from_text(extracted_text)
# Process with Gemini using selected answer language
answer = process_with_gemini(extracted_text, question, answer_language)
# Generate speech using selected voice
audio_file = text_to_speech_rag(answer, voice_selection)
# Create formatted content for download
if text_format.lower() == "md":
# Create beautiful Markdown format
formatted_content = format_voice_rag_response(
question, answer, detected_doc_language, voice_selection
)
text_file_path = create_text_file(formatted_content, "md", "voice_rag_response")
else:
# Create standard text file
text_file_path = create_text_file(answer, text_format, "voice_rag_answer")
return answer, detected_doc_language, audio_file, text_file_path
def detect_language(text):
"""Detect language of input text with improved accuracy"""
if not text.strip():
return "unknown"
text_lower = text.lower()
# Vietnamese detection (more comprehensive)
vietnamese_chars = 'àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ'
vietnamese_words = ['và', 'của', 'là', 'có', 'này', 'được', 'cho', 'từ', 'một', 'những', 'tôi', 'bạn']
vietnamese_score = sum(1 for char in text if char in vietnamese_chars) + sum(2 for word in vietnamese_words if word in text_lower)
# English detection (more comprehensive)
english_words = ['the', 'and', 'is', 'are', 'have', 'has', 'will', 'would', 'can', 'could', 'that', 'this', 'with', 'for', 'you', 'he', 'she', 'it', 'they', 'we']
english_score = sum(1 for word in english_words if word in text_lower)
# German detection
german_words = ['der', 'die', 'das', 'und', 'ist', 'ich', 'bin', 'haben', 'sein', 'werden', 'mit', 'auf', 'für', 'von']
german_chars = 'äöüß'
german_score = sum(1 for word in german_words if word in text_lower) + sum(1 for char in text if char in german_chars)
# French detection
french_words = ['le', 'la', 'les', 'de', 'et', 'à', 'un', 'une', 'ce', 'qui', 'que', 'avec', 'pour', 'dans']
french_chars = 'àâäéèêëïîôöùûüÿç'
french_score = sum(1 for word in french_words if word in text_lower) + sum(0.5 for char in text if char in french_chars)
# Spanish detection
spanish_words = ['el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'es', 'se', 'no', 'te', 'lo', 'con', 'para']
spanish_chars = 'ñáéíóúü'
spanish_score = sum(1 for word in spanish_words if word in text_lower) + sum(0.5 for char in text if char in spanish_chars)
# Score-based detection
scores = {
'Vietnamese': vietnamese_score,
'English': english_score,
'German': german_score,
'French': french_score,
'Spanish': spanish_score
}
# Find the language with highest score
max_score = max(scores.values())
if max_score > 0:
detected = max(scores, key=scores.get)
print(f"🔍 Language detection scores: {scores}")
print(f"🎯 Detected language: {detected} (score: {max_score})")
return detected
# Default fallback
print(f"⚠️ Could not detect language, defaulting to English")
return "English"
async def generate_speech(text, voice_name, rate):
"""Generate speech using Edge TTS"""
communicate = edge_tts.Communicate(text, voice_name, rate=f"{rate:+.0%}")
# Create in-memory buffer
audio_buffer = io.BytesIO()
async for chunk in communicate.stream():
if chunk["type"] == "audio":
audio_buffer.write(chunk["data"])
audio_buffer.seek(0)
return audio_buffer.getvalue()
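# Illustrative sketch (not executed): generate_speech is a coroutine, so the
# synchronous callbacks below drive it with asyncio.run, e.g.:
#
#   audio_bytes = asyncio.run(generate_speech("Xin chào", "vi-VN-HoaiMyNeural", 0.0))
#   with open("hello_vi.mp3", "wb") as f:  # edge-tts streams MP3-encoded audio by default
#       f.write(audio_bytes)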
def create_text_file(content, file_format="txt", filename_prefix="translated_text"):
"""
Create a downloadable text file from content in TXT, DOCX, or MD format
"""
if not content or content.startswith("Lỗi:") or content.startswith("❌"):
return None
try:
if file_format.lower() == "docx" and DOCX_AVAILABLE:
# Create Word document (DOCX_AVAILABLE already guarantees python-docx is importable)
from docx import Document
fd, temp_file_path = tempfile.mkstemp(suffix=".docx", prefix=f"{filename_prefix}_")
os.close(fd)
doc = Document()
doc.add_heading('Nội dung đã dịch', 0)
doc.add_paragraph(content)
doc.save(temp_file_path)
return temp_file_path
elif file_format.lower() == "md":
# Create Markdown file
fd, temp_file_path = tempfile.mkstemp(suffix=".md", prefix=f"{filename_prefix}_")
os.close(fd)
with open(temp_file_path, 'w', encoding='utf-8') as f:
f.write(content)
return temp_file_path
else:
# Create TXT file (default)
fd, temp_file_path = tempfile.mkstemp(suffix=".txt", prefix=f"{filename_prefix}_")
os.close(fd)
with open(temp_file_path, 'w', encoding='utf-8') as f:
f.write(content)
return temp_file_path
except Exception as e:
print(f"❌ Error creating text file: {e}")
return None
def format_voice_rag_response(question, answer, detected_language, voice_selection, timestamp=None):
"""
Format Voice RAG response as beautiful Markdown
"""
if timestamp is None:
timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# Clean and format the answer
formatted_answer = answer.strip()
# Create beautiful Markdown document
markdown_content = f"""# 📚 Voice RAG - Intelligent Document Q&A
---
## 📄 **Session Information**
| **Field** | **Details** |
|-----------|-------------|
| 🕒 **Timestamp** | {timestamp} |
| 🌍 **Document Language** | {detected_language} |
| 🎭 **Voice Selection** | {voice_selection} |
| 🤖 **AI Model** | Google Gemini 2.0 Flash |
---
## ❓ **Question**
> {question}
---
## 💬 **AI Response**
{formatted_answer}
---
---
## 📱 **Generated by**
**🎙️ Voice AI Platform** - Digitized Brains
*Powered by Claude Code & Google Gemini 2.0 Flash*
> 🌐 **Voice RAG Technology** - Combining document intelligence with premium voice synthesis
---
*Generated on {timestamp} | Voice: {voice_selection} | Language: {detected_language}*
"""
return markdown_content
def format_voice_studio_response(text, voice_selection, speed, detected_language="Auto-detected", timestamp=None):
"""
Format Voice Studio response as simple Markdown
"""
if timestamp is None:
timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# Clean and format the text
formatted_text = text.strip()
# Create simple Markdown document
markdown_content = f"""# Voice Studio Result
## Input Text ({detected_language})
{formatted_text}
---
*Generated on {timestamp} | Voice: {voice_selection} | Speed: {speed:.1f}x*
"""
return markdown_content
def format_audio_translation_response(original_text, translated_text, source_language, target_language, voice_selection, timestamp=None):
"""
Format Audio Translation response as simple Markdown
"""
if timestamp is None:
timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# Clean and format the texts
formatted_original = original_text.strip()
formatted_translated = translated_text.strip()
# Create simple Markdown document
markdown_content = f"""# Audio Translation Result
## Original Text ({source_language})
{formatted_original}
## Translated Text ({target_language})
{formatted_translated}
---
*Generated on {timestamp} | {source_language} → {target_language} | Voice: {voice_selection}*
"""
return markdown_content
def create_audio_voice_studio(text, voice_selection, speed, text_format="txt"):
"""Voice Studio functionality with text file generation"""
if not text.strip():
return "❌ Vui lòng nhập văn bản / Please enter text / Bitte Text eingeben", None
try:
# Use global VOICE_MAP for performance (avoiding recreation on each call)
voice_name = VOICE_MAP.get(voice_selection, "vi-VN-HoaiMyNeural")
text_limited = text[:1000] if len(text) > 1000 else text
# Convert speed (0.5-2.0) to rate percentage (-50% to +100%)
rate_percent = (speed - 1.0)
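# e.g. speed 1.5 -> rate_percent 0.5 -> "+50%" inside generate_speech();
#      speed 0.5 -> rate_percent -0.5 -> "-50%"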
# Generate speech using Edge TTS
audio_data = asyncio.run(generate_speech(text_limited, voice_name, rate_percent))
# Convert to base64
audio_base64 = base64.b64encode(audio_data).decode('utf-8')
timestamp = int(time.time())
filename = f"voice_{voice_name}_{speed}x_{timestamp}.mp3"
# Detect language
detected_lang = detect_language(text_limited)
# Mobile-friendly inline HTML5 audio player with a download link
# (minimal markup sketch that uses the base64-encoded audio prepared above)
html_player = f'''
<div style="text-align:center; margin:10px 0;">
<audio controls style="width:100%; max-width:500px;" src="data:audio/mpeg;base64,{audio_base64}"></audio>
<div style="margin-top:8px;"><a href="data:audio/mpeg;base64,{audio_base64}" download="{filename}">📥 {filename}</a></div>
</div>
'''
# Create text file based on format
text_file_path = None
if text_format == "md":
# Use Markdown formatting function
detected_language = detect_language(text_limited)
markdown_content = format_voice_studio_response(text_limited, voice_selection, speed, detected_language)
text_file_path = create_text_file(markdown_content, "md", "voice_studio")
elif text_format == "docx":
# Create Word document with Voice Studio formatting
detected_language = detect_language(text_limited)
markdown_content = format_voice_studio_response(text_limited, voice_selection, speed, detected_language)
text_file_path = create_text_file(markdown_content, "docx", "voice_studio")
elif text_format == "txt":
# Create simple text file
text_file_path = create_text_file(text_limited, "txt", "voice_studio")
return html_player, text_file_path
except Exception as e:
return f"❌ Error: {str(e)}", None
# Language mapping for voices - defined once for performance
VOICE_TO_LANGUAGE = {
# Vietnamese
"🇻🇳 HoaiMy - Nữ Việt Chuẩn": "Vietnamese",
"🇻🇳 NamMinh - Nam Việt Chuẩn": "Vietnamese",
# English
"🇺🇸 Aria - Nữ Mỹ": "English",
"🇺🇸 Guy - Nam Mỹ": "English",
"🇬🇧 Sonia - Nữ Anh": "English",
"🇬🇧 Ryan - Nam Anh": "English",
# German
"🇩🇪 Katja - Deutsche Frau": "German",
"🇩🇪 Conrad - Deutscher Mann": "German",
# French
"🇫🇷 Denise - Française": "French",
"🇫🇷 Henri - Français": "French",
# Spanish
"🇪🇸 Elvira - Española": "Spanish",
"🇪🇸 Alvaro - Español": "Spanish",
# Italian
"🇮🇹 Elsa - Italiana": "Italian",
"🇮🇹 Diego - Italiano": "Italian",
# Japanese
"🇯🇵 Nanami - 日本女性": "Japanese",
"🇯🇵 Keita - 日本男性": "Japanese",
# Korean
"🇰🇷 SunHi - 한국 여성": "Korean",
"🇰🇷 BongJin - 한국 남성": "Korean",
# Chinese
"🇨🇳 Xiaoxiao - 中文女声": "Chinese",
"🇨🇳 Yunxi - 中文男声": "Chinese",
# Russian
"🇷🇺 Svetlana - Русская": "Russian",
"🇷🇺 Dmitry - Русский": "Russian",
# Portuguese
"🇵🇹 Francisca - Portuguesa": "Portuguese",
"🇵🇹 Antonio - Português": "Portuguese",
# Arabic
"🇸🇦 Zariyah - عربية": "Arabic",
"🇸🇦 Hamed - عربي": "Arabic"
}
def get_target_language_from_voice(voice_selection):
"""Map voice selection to target language for translation"""
return VOICE_TO_LANGUAGE.get(voice_selection, "Vietnamese")
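# e.g. get_target_language_from_voice("🇺🇸 Aria - Nữ Mỹ") -> "English";
# unrecognized labels fall back to "Vietnamese"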
def translate_text_with_gemini(text, target_language):
"""Translate text using Gemini API"""
try:
api_key = configure_gemini_api()
if not api_key:
return f"❌ Lỗi: Chưa cấu hình GEMINI_API_KEY hoặc GOOGLE_API_KEY trong environment variables"
if not text.strip():
return ""
model = genai.GenerativeModel("gemini-2.0-flash")
prompt = f"""Translate the following text to {target_language}. Return ONLY the translated text, nothing else:
{text}"""
response = model.generate_content(prompt)
translated_text = response.text.strip()
# Clean up any unwanted text that might be included
if translated_text.lower().startswith("translation:"):
translated_text = translated_text[12:].strip()
if translated_text.lower().startswith("here is"):
lines = translated_text.split('\n')
if len(lines) > 1:
translated_text = '\n'.join(lines[1:]).strip()
return translated_text
except Exception as e:
return f"Lỗi dịch thuật: {str(e)}"
def translate_audio(audio_file, target_country, voice_selection, text_format="txt"):
"""
Transcribe, translate and synthesize audio to target language with Voice Studio integration
"""
try:
api_key = configure_gemini_api()
if not api_key:
return "❌ Lỗi: Chưa cấu hình GEMINI_API_KEY hoặc GOOGLE_API_KEY trong environment variables", "Không xác định", "", target_country, None, None, "", "", None
if audio_file is None:
return "Lỗi: Vui lòng tải lên file audio", "Không xác định", "", target_country, None, None, "", "", None
# Save recorded audio to record_data directory
print(f"🔍 Processing audio file type: {type(audio_file)}")
saved_audio_path = save_recorded_audio(audio_file)
if saved_audio_path:
print(f"🎤 Audio saved to record_data: {os.path.basename(saved_audio_path)}")
# Debug: check if file really exists
if os.path.exists(saved_audio_path):
file_size = os.path.getsize(saved_audio_path)
print(f"✅ File confirmed: {saved_audio_path} ({file_size} bytes)")
else:
print(f"❌ File not found after save: {saved_audio_path}")
return "❌ Lỗi: Không thể lưu file audio", "Không xác định", "", target_country, None, None, "", "", None
else:
print("❌ Failed to save audio file")
return "❌ Lỗi: Không thể lưu file audio", "Không xác định", "", target_country, None, None, "", "", None
# Get target language from voice selection
target_language = get_target_language_from_voice(voice_selection)
# Transcribe audio using Gemini
model = genai.GenerativeModel("gemini-2.0-flash")
# Read audio file using saved path
with open(saved_audio_path, 'rb') as f:
audio_data = f.read()
# Create audio blob
audio_blob = {
'mime_type': 'audio/wav',
'data': audio_data
}
# Step 1: Transcribe audio only first
transcribe_prompt = """Transcribe this audio accurately in its original language. Return only the transcribed text, nothing else."""
response = model.generate_content([transcribe_prompt, audio_blob])
transcription = response.text.strip()
# Step 2: Detect language of transcription
detected_lang = detect_language(transcription)
# Step 3: Translate if needed (only if source is different from target)
if detected_lang.lower() != target_language.lower():
print(f"🔄 Translating from {detected_lang} to {target_language}")
translated_text = translate_text_with_gemini(transcription, target_language)
# Check if translation was successful
if translated_text.startswith("❌") or translated_text.startswith("Lỗi"):
print(f"❌ Translation failed: {translated_text}")
# Use original transcription if translation fails
translated_text = transcription
else:
print(f"✅ Translation successful")
else:
print(f"ℹ️ No translation needed - same language ({detected_lang})")
translated_text = transcription
# Generate audio using Edge TTS (use global VOICE_MAP for performance)
edge_voice = VOICE_MAP.get(voice_selection, "vi-VN-HoaiMyNeural")
print(f"🎙️ Generating audio with voice: {edge_voice}")
audio_data = asyncio.run(generate_speech(translated_text, edge_voice, 0.0))
print(f"🎵 Generated audio data: {len(audio_data)} bytes")
# Save audio file
fd, temp_output_path = tempfile.mkstemp(suffix=".wav", prefix="translated_audio_")
os.close(fd)
print(f"📁 Created temp audio file: {temp_output_path}")
# Write raw audio data to temporary file
with open(temp_output_path, 'wb') as f:
f.write(audio_data)
# Verify file was created
if os.path.exists(temp_output_path):
file_size = os.path.getsize(temp_output_path)
print(f"✅ Audio file created successfully: {file_size} bytes")
else:
print(f"❌ Failed to create audio file: {temp_output_path}")
# Create text file for download with proper formatting
text_file_path = None
if text_format == "md":
# Use Markdown formatting function for Audio Translation
markdown_content = format_audio_translation_response(
transcription, translated_text, detected_lang, target_language, voice_selection
)
text_file_path = create_text_file(markdown_content, "md", "audio_translation")
elif text_format == "docx":
# Create Word document with Audio Translation formatting
markdown_content = format_audio_translation_response(
transcription, translated_text, detected_lang, target_language, voice_selection
)
text_file_path = create_text_file(markdown_content, "docx", "audio_translation")
else:
# Create simple text file
text_file_path = create_text_file(translated_text, "txt", "audio_translation")
return transcription, detected_lang, translated_text, target_language, temp_output_path, temp_output_path, transcription, translated_text, text_file_path
except Exception as e:
# Get target language for error response
target_language = get_target_language_from_voice(voice_selection) if 'voice_selection' in locals() else "Vietnamese"
return f"Lỗi: {str(e)}", "Lỗi", "", target_language, None, None, "", "", None
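# Note: the 9-tuple returned above (and by each error path) maps positionally onto the
# Gradio outputs wired up in the UI section below, roughly:
#   (transcription, detected_language, translated_text, target_language,
#    audio_path_for_player, audio_path_for_download,
#    original_compare_text, translated_compare_text, downloadable_text_file)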
# Voice choices organized by country - ONLY OFFICIAL VOICES
voice_choices_by_country = {
"🇻🇳 Việt Nam": [
"🇻🇳 HoaiMy - Nữ Việt Chuẩn",
"🇻🇳 NamMinh - Nam Việt Chuẩn"
],
"🇺🇸 Hoa Kỳ": [
"🇺🇸 Aria - Nữ Mỹ",
"🇺🇸 Guy - Nam Mỹ"
],
"🇬🇧 Anh": [
"🇬🇧 Sonia - Nữ Anh",
"🇬🇧 Ryan - Nam Anh"
],
"🇩🇪 Đức": [
"🇩🇪 Katja - Deutsche Frau",
"🇩🇪 Conrad - Deutscher Mann"
],
"🇫🇷 Pháp": [
"🇫🇷 Denise - Française",
"🇫🇷 Henri - Français"
],
"🇪🇸 Tây Ban Nha": [
"🇪🇸 Elvira - Española",
"🇪🇸 Alvaro - Español"
],
"🇮🇹 Ý": [
"🇮🇹 Elsa - Italiana",
"🇮🇹 Diego - Italiano"
],
"🇯🇵 Nhật Bản": [
"🇯🇵 Nanami - 日本女性",
"🇯🇵 Keita - 日本男性"
],
"🇰🇷 Hàn Quốc": [
"🇰🇷 SunHi - 한국 여성",
"🇰🇷 BongJin - 한국 남성"
],
"🇨🇳 Trung Quốc": [
"🇨🇳 Xiaoxiao - 中文女声",
"🇨🇳 Yunxi - 中文男声"
],
"🇷🇺 Nga": [
"🇷🇺 Svetlana - Русская",
"🇷🇺 Dmitry - Русский"
],
"🇵🇹 Bồ Đào Nha": [
"🇵🇹 Francisca - Portuguesa",
"🇵🇹 Antonio - Português"
],
"🇸🇦 Ả Rập": [
"🇸🇦 Zariyah - عربية",
"🇸🇦 Hamed - عربي"
]
}
def update_voices(country):
"""Update voice choices based on selected country"""
if country in voice_choices_by_country:
voices = voice_choices_by_country[country]
return gr.Dropdown(choices=voices, value=voices[0])
else:
# Default to Vietnamese voices
default_voices = voice_choices_by_country["🇻🇳 Việt Nam"]
return gr.Dropdown(choices=default_voices, value=default_voices[0])
# Lightweight CSS - optimized for performance
css = """
* {
font-family: system-ui, -apple-system, 'Segoe UI', Arial, sans-serif;
}
.gradio-container {
max-width: 1200px;
margin: 0 auto;
position: relative;
}
/* Critical fix for dropdown interaction */
.gradio-container * {
pointer-events: auto;
}
/* Hide Gradio footer */
.footer {
display: none !important;
}
/* Pulsing animation for processing status */
@keyframes pulse-processing {
0% {
opacity: 1;
transform: scale(1);
box-shadow: 0 4px 15px rgba(255, 193, 7, 0.3);
}
50% {
opacity: 0.8;
transform: scale(1.02);
box-shadow: 0 6px 25px rgba(255, 193, 7, 0.6);
}
100% {
opacity: 1;
transform: scale(1);
box-shadow: 0 4px 15px rgba(255, 193, 7, 0.3);
}
}
.status-processing {
animation: pulse-processing 1.5s ease-in-out infinite;
background: linear-gradient(135deg, #FFC107 0%, #FF9800 100%) !important;
}
/* Success status animation */
@keyframes pulse-success {
0% {
opacity: 1;
transform: scale(1);
}
50% {
opacity: 0.9;
transform: scale(1.01);
}
100% {
opacity: 1;
transform: scale(1);
}
}
.status-success {
animation: pulse-success 2s ease-in-out 3;
background: linear-gradient(135deg, #4CAF50 0%, #2E7D32 100%) !important;
}
/* Custom footer to cover Gradio attribution */
.custom-footer {
position: fixed;
bottom: 0;
left: 0;
right: 0;
background: linear-gradient(135deg, #4A90E2 0%, #2E86AB 70%, #FF8A65 85%, #FF6B9D 100%);
color: white;
padding: 15px;
text-align: center;
font-weight: bold;
z-index: 1000;
box-shadow: 0 -2px 10px rgba(0,0,0,0.1);
}
/* Add padding to body to account for fixed footer */
body {
padding-bottom: 60px;
}
/* Mobile-first responsive design */
.input-card {
background: rgba(255,255,255,0.95);
border-radius: 16px;
padding: 16px;
margin: 10px 0;
box-shadow: 0 4px 20px rgba(0,0,0,0.1);
backdrop-filter: blur(10px);
}
.output-area {
background: rgba(255,255,255,0.95);
border-radius: 16px;
padding: 16px;
margin: 15px 0;
min-height: 200px;
box-shadow: 0 4px 20px rgba(0,0,0,0.1);
}
.examples-section {
background: rgba(255,255,255,0.9);
border-radius: 16px;
padding: 16px;
margin: 20px 0;
}
.main-header {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 10px;
margin-bottom: 20px;
text-align: center;
}
.feature-box {
background: #f8f9fa;
padding: 15px;
border-radius: 8px;
margin: 10px 0;
border-left: 4px solid #667eea;
}
.status-indicator {
display: inline-block;
padding: 5px 10px;
border-radius: 15px;
font-size: 12px;
font-weight: bold;
margin: 5px;
}
.status-success {
background-color: #d4edda;
color: #155724;
}
.status-processing {
background-color: #fff3cd;
color: #856404;
}
.comparison-section {
border: 1px solid #e0e0e0;
border-radius: 8px;
padding: 15px;
margin: 10px 0;
background: #fafafa;
}
.language-label {
font-weight: bold;
color: #667eea;
padding: 5px 10px;
background: #f0f2ff;
border-radius: 15px;
display: inline-block;
margin-bottom: 10px;
font-size: 14px;
}
.content-compare {
background: white;
border: 1px solid #ddd;
border-radius: 6px;
padding: 12px;
min-height: 120px;
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
line-height: 1.5;
}
/* Reset any problematic dropdown styles */
.gradio-container * {
pointer-events: auto;
}
/* Remove any potential blocking overlays */
.gradio-container::before,
.gradio-container::after {
display: none;
}
/* Ensure all interactive elements work */
button, select, input, textarea, .gr-dropdown {
pointer-events: auto !important;
position: relative !important;
}
/* Simple dropdown fix without complex selectors */
[class*="dropdown"] {
position: relative !important;
z-index: 999 !important;
}
[class*="dropdown"] * {
pointer-events: auto !important;
}
/* Make sure no overlay blocks clicks */
.gradio-container .gr-form {
position: relative;
z-index: 1;
}
.gradio-container .gr-block {
position: relative;
z-index: 1;
}
.mobile-button {
width: 100% !important;
padding: 15px !important;
font-size: 1.1em !important;
margin: 20px 0 !important;
border-radius: 12px !important;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
border: none !important;
color: white !important;
font-weight: bold !important;
box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3) !important;
transition: all 0.3s ease !important;
cursor: pointer !important;
position: relative !important;
overflow: hidden !important;
}
.mobile-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 25px rgba(102, 126, 234, 0.4) !important;
background: linear-gradient(135deg, #5a6fd8 0%, #6b4190 100%) !important;
}
.mobile-button:active {
transform: translateY(0px) !important;
box-shadow: 0 2px 10px rgba(102, 126, 234, 0.3) !important;
}
/* Ripple effect for button */
.mobile-button::before {
content: '';
position: absolute;
top: 50%;
left: 50%;
width: 0;
height: 0;
border-radius: 50%;
background: rgba(255, 255, 255, 0.3);
transform: translate(-50%, -50%);
transition: width 0.6s, height 0.6s;
}
.mobile-button:active::before {
width: 300px;
height: 300px;
}
/* Loading spinner animation */
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
.loading-spinner {
display: inline-block;
width: 20px;
height: 20px;
border: 3px solid rgba(255,255,255,0.3);
border-radius: 50%;
border-top-color: white;
animation: spin 1s ease-in-out infinite;
margin-right: 10px;
}
/* Button pulse effect when processing */
@keyframes pulse {
0% {
box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3);
}
50% {
box-shadow: 0 8px 25px rgba(102, 126, 234, 0.6);
}
100% {
box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3);
}
}
.button-processing {
animation: pulse 2s ease-in-out infinite;
background: linear-gradient(135deg, #FF8E53 0%, #FF6B6B 100%) !important;
}
.mobile-textbox textarea {
border-radius: 10px !important;
border: 2px solid #e0e0e0 !important;
padding: 12px !important;
font-size: 1em !important;
line-height: 1.5 !important;
}
.mobile-compare textarea {
border-radius: 8px !important;
border: 1px solid #ddd !important;
padding: 10px !important;
background: #fafafa !important;
font-size: 0.95em !important;
}
.mobile-audio {
margin: 10px 0 !important;
border-radius: 10px !important;
}
.mobile-file {
margin: 10px 0 !important;
border-radius: 10px !important;
}
/* Beautiful Markdown styling for Voice RAG responses */
.markdown-response {
background: linear-gradient(135deg, #ffffff 0%, #f8fffe 100%);
border-radius: 12px;
padding: 20px;
margin: 15px 0;
box-shadow: 0 4px 20px rgba(0,0,0,0.1);
border-left: 4px solid #4CAF50;
}
.markdown-response h1 {
color: #2e7d32;
border-bottom: 2px solid #4CAF50;
padding-bottom: 10px;
margin-bottom: 20px;
font-size: 1.8em;
}
.markdown-response h2 {
color: #388E3C;
margin-top: 25px;
margin-bottom: 15px;
font-size: 1.4em;
border-left: 3px solid #4CAF50;
padding-left: 15px;
}
.markdown-response h3 {
color: #43A047;
margin-top: 20px;
margin-bottom: 12px;
font-size: 1.2em;
}
.markdown-response p {
line-height: 1.6;
margin-bottom: 12px;
color: #333;
}
.markdown-response blockquote {
background: linear-gradient(135deg, #e8f5e8 0%, #c8e6c9 100%);
border-left: 4px solid #4CAF50;
padding: 15px 20px;
margin: 15px 0;
border-radius: 8px;
font-style: italic;
color: #2e7d32;
}
.markdown-response table {
width: 100%;
border-collapse: collapse;
margin: 15px 0;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
border-radius: 8px;
overflow: hidden;
}
.markdown-response table th {
background: linear-gradient(135deg, #4CAF50 0%, #45a049 100%);
color: white;
padding: 12px 15px;
text-align: left;
font-weight: bold;
}
.markdown-response table td {
padding: 12px 15px;
border-bottom: 1px solid #e0e0e0;
background: white;
}
.markdown-response table tr:nth-child(even) td {
background: #f8fffe;
}
.markdown-response table tr:hover td {
background: #e8f5e8;
transition: background 0.3s ease;
}
.markdown-response ul, .markdown-response ol {
margin: 15px 0;
padding-left: 25px;
}
.markdown-response li {
margin-bottom: 8px;
line-height: 1.5;
}
.markdown-response code {
background: #f5f5f5;
border: 1px solid #e0e0e0;
border-radius: 4px;
padding: 2px 6px;
font-family: 'Courier New', monospace;
color: #d32f2f;
}
.markdown-response pre {
background: #f5f5f5;
border: 1px solid #e0e0e0;
border-radius: 8px;
padding: 15px;
overflow-x: auto;
margin: 15px 0;
}
.markdown-response pre code {
background: none;
border: none;
padding: 0;
color: #333;
}
.markdown-response hr {
border: none;
height: 2px;
background: linear-gradient(90deg, transparent, #4CAF50, transparent);
margin: 25px 0;
}
.markdown-response strong {
color: #2e7d32;
font-weight: bold;
}
.markdown-response em {
color: #388E3C;
font-style: italic;
}
/* Responsive design for markdown */
@media (max-width: 768px) {
.markdown-response {
padding: 15px;
margin: 10px 0;
}
.markdown-response table {
font-size: 0.9em;
}
.markdown-response h1 {
font-size: 1.6em;
}
.markdown-response h2 {
font-size: 1.3em;
}
}
/* Mobile responsive breakpoints */
@media (max-width: 768px) {
.gradio-container {
padding: 10px !important;
}
.input-card {
padding: 12px !important;
margin: 8px 0 !important;
}
.output-area {
padding: 12px !important;
margin: 10px 0 !important;
}
.examples-section {
padding: 12px !important;
}
.main-header h2 {
font-size: 1.5em !important;
}
.main-header p {
font-size: 1em !important;
}
/* Mobile layout adjustments - less aggressive */
.gr-row {
flex-direction: column;
}
.gr-column {
width: 100%;
margin-bottom: 15px;
}
}
@media (max-width: 480px) {
.gradio-container {
padding: 5px !important;
}
.input-card {
padding: 10px !important;
margin: 5px 0 !important;
}
.main-header {
padding: 15px !important;
}
.main-header h2 {
font-size: 1.3em !important;
}
.mobile-button {
padding: 12px !important;
font-size: 1em !important;
}
}
/* JavaScript for button interactions */
"""
# Add JavaScript for button effects
js_code = """
"""
# Create interface with tabs
with gr.Blocks(css=css, title="🎙️ Voice AI Platform - Voice RAG & Audio Translation") as demo:
# Simplified header for faster loading on HF Spaces
if not (os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID")):
# Only load complex microphone permissions in local development
gr.HTML("""
🎙️ Voice AI Platform
Voice RAG, Audio Translation và Voice Studio - Nền tảng AI giọng nói toàn diện
✨ Tính năng mới: Voice RAG với 24 giọng nói đa ngôn ngữ
🧠 Digitized Brains
""")
else:
# Production mode - minimal header
gr.HTML('''
🎙️ Voice AI Platform
''')
with gr.Tabs():
# Tab 1: Voice RAG
with gr.TabItem("📚 Voice RAG"):
# Header section with hf_voice style
gr.HTML("""
📚 Voice RAG
Hỏi đáp tài liệu thông minh
🌍 Multi-Language
13 ngôn ngữ trả lời
🎤 Voice Output
24 giọng nói đa dạng
🔄 AI Gemini
Gemini 2.0 Flash
""")
gr.Markdown("### 📝 Upload tài liệu và đặt câu hỏi")
# Input section - Mobile optimized
with gr.Column():
# Document upload
with gr.Row():
file_upload_rag = gr.File(
label="📎 Tải lên tài liệu (PDF, DOCX, TXT)",
file_types=[".pdf", ".docx", ".txt"]
)
# Question input
with gr.Row():
question_input_rag = gr.Textbox(
label="❓ Câu hỏi của bạn",
placeholder="Hãy đặt câu hỏi về nội dung tài liệu...",
lines=3
)
# Language selection for answer
with gr.Row():
answer_language_dropdown_rag = gr.Dropdown(
choices=SUPPORTED_LANGUAGES,
value="Vietnamese",
label="🌍 Ngôn ngữ trả lời"
)
# Voice selection from Voice Studio
with gr.Row():
with gr.Column(scale=1):
rag_country_dropdown = gr.Dropdown(
choices=list(voice_choices_by_country.keys()),
value="🇻🇳 Việt Nam",
label="🌍 Chọn quốc gia giọng nói"
)
with gr.Column(scale=1):
rag_voice_dropdown = gr.Dropdown(
choices=voice_choices_by_country["🇻🇳 Việt Nam"],
value="🇻🇳 HoaiMy - Nữ Việt Chuẩn",
label="🎭 Chọn giọng nói"
)
# Format selection for download
with gr.Row():
rag_text_format_dropdown = gr.Dropdown(
choices=["Markdown (.md)", "TXT (.txt)", "Word (.docx)"] if DOCX_AVAILABLE else ["Markdown (.md)", "TXT (.txt)"],
value="Markdown (.md)",
label="📄 Định dạng file trả lời"
)
# Process button
with gr.Row():
submit_btn_rag = gr.Button(
"🚀 Xử lý tài liệu và trả lời",
variant="primary",
size="lg"
)
# Results section - Mobile optimized
with gr.Column():
# Document info section
with gr.Accordion("📄 Thông tin tài liệu", open=True):
detected_doc_language_rag = gr.Textbox(
label="🌐 Ngôn ngữ tài liệu được phát hiện",
lines=1,
interactive=False,
placeholder="Tự động nhận diện ngôn ngữ tài liệu..."
)
# Text answer section
with gr.Accordion("💬 Câu trả lời", open=True):
gr.HTML("""
💬 AI Response with Markdown Formatting
Formatted response with tables, headers, and beautiful layout
""")
answer_output_rag = gr.Markdown(
value="**Câu trả lời sẽ xuất hiện ở đây sau khi xử lý...**\n\n*Hỗ trợ format Markdown với tables, headers, lists và nhiều style khác*",
label="",
show_label=False,
elem_classes=["markdown-response"]
)
# Downloads section - Mobile optimized
with gr.Accordion("💾 Tải xuống kết quả", open=True):
gr.HTML("""
Tải xuống câu trả lời dưới dạng file và audio
""")
# Stack vertically on mobile
with gr.Column():
# Audio download section
with gr.Row():
audio_output_rag = gr.Audio(
label="🔊 Audio câu trả lời",
type="filepath"
)
# Text download section
with gr.Row():
text_output_rag = gr.File(
label="📄 Văn bản câu trả lời",
file_count="single",
file_types=[".md", ".txt", ".docx"]
)
# Status indicator for RAG
rag_status_text = gr.HTML("""
✅ Sẵn sàng xử lý tài liệu
""")
# Helper function for RAG format
def get_rag_format_from_dropdown(format_choice):
if "Word" in format_choice or "docx" in format_choice:
return "docx"
elif "Markdown" in format_choice or "md" in format_choice:
return "md"
return "txt"
# RAG processing function
def update_rag_status_processing():
return """
""")
gr.Markdown("### 📝 Nhập nội dung và chọn giọng nói")
with gr.Row():
text_input = gr.Textbox(
placeholder="Nhập văn bản cần chuyển thành giọng nói...",
lines=4,
label="Văn bản",
scale=2
)
with gr.Row():
with gr.Column(scale=1):
country_dropdown = gr.Dropdown(
choices=list(voice_choices_by_country.keys()),
value="🇻🇳 Việt Nam",
label="🌍 Chọn quốc gia"
)
with gr.Column(scale=1):
voice_dropdown = gr.Dropdown(
choices=voice_choices_by_country["🇻🇳 Việt Nam"],
value="🇻🇳 HoaiMy - Nữ Việt Chuẩn",
label="🎭 Chọn giọng nói"
)
with gr.Row():
with gr.Column(scale=2):
speed_slider = gr.Slider(
minimum=0.5,
maximum=2.0,
value=1.0,
step=0.1,
label="⚡ Tốc độ phát"
)
with gr.Column(scale=1):
voice_studio_format_dropdown = gr.Dropdown(
choices=["Markdown (.md)", "TXT (.txt)", "Word (.docx)"] if DOCX_AVAILABLE else ["Markdown (.md)", "TXT (.txt)"],
value="Markdown (.md)",
label="📄 Định dạng file tải xuống"
)
# Translation feature
with gr.Row():
with gr.Column(scale=1):
translate_checkbox = gr.Checkbox(
label="🌍 Dịch văn bản trước khi tạo giọng nói",
value=False
)
with gr.Column(scale=2):
translate_btn = gr.Button("🔄 DỊCH VĂN BẢN", variant="secondary", size="lg", visible=False)
# Show translated text when translation is enabled
translated_text_output = gr.Textbox(
label="📝 Văn bản đã dịch",
lines=3,
interactive=True,
visible=False,
placeholder="Văn bản sau khi dịch sẽ hiển thị ở đây..."
)
generate_btn = gr.Button("🎵 TẠO GIỌNG NÓI", variant="primary", size="lg")
# Status indicator for Voice Studio
studio_status_text = gr.HTML(
"""✅ Sẵn sàng tạo giọng nói"""
)
# HTML output for the generated inline audio player (wired to generate_btn.click below)
audio_output_vs = gr.HTML()
# Download section for Voice Studio
with gr.Accordion("💾 Tải xuống kết quả", open=False):
gr.HTML("""
📄 Tải xuống văn bản với Markdown formatting
File chứa thông tin session, cấu hình giọng nói và technical details
""")
voice_studio_text_output = gr.File(
label="📄 Văn bản với thông tin chi tiết",
file_count="single",
file_types=[".md", ".txt", ".docx"]
)
# Examples section
gr.Markdown("### 📚 Ví dụ nhanh")
with gr.Row():
example_vn = gr.Button("🇻🇳 Tiếng Việt", size="sm")
example_en = gr.Button("🇺🇸 English", size="sm")
example_de = gr.Button("🇩🇪 Deutsch", size="sm")
example_translate = gr.Button("🌍 Dịch thuật", size="sm")
# Example button functions
def load_vn_example():
return "Xin chào! Chào mừng bạn đến với studio giọng nói.", "🇻🇳 Việt Nam"
def load_en_example():
return "Hello! Welcome to our voice studio.", "🇺🇸 Hoa Kỳ"
def load_de_example():
return "Hallo! Willkommen in unserem Sprachstudio.", "🇩🇪 Đức"
def load_translate_example():
return "Hello! This is an example text for translation.", "🇺🇸 Hoa Kỳ", True
# Translation functions
def toggle_translation_ui(translate_enabled):
"""Show/hide translation UI elements"""
return (
gr.update(visible=translate_enabled), # translate_btn
gr.update(visible=translate_enabled) # translated_text_output
)
def translate_text_interface(text, voice_selection):
"""Translate text for Voice Studio"""
if not text.strip():
return "Vui lòng nhập văn bản trước khi dịch"
target_language = get_target_language_from_voice(voice_selection)
translated = translate_text_with_gemini(text, target_language)
return translated
def create_voice_with_translation(original_text, translated_text, translate_enabled, voice_selection, speed, text_format="txt"):
"""Create voice using original or translated text"""
if translate_enabled and translated_text.strip() and not translated_text.startswith("Lỗi"):
# Use translated text
return create_audio_voice_studio(translated_text, voice_selection, speed, text_format)
else:
# Use original text
return create_audio_voice_studio(original_text, voice_selection, speed, text_format)
# Event handlers for Voice Studio
country_dropdown.change(
fn=update_voices,
inputs=[country_dropdown],
outputs=[voice_dropdown]
)
example_vn.click(
fn=load_vn_example,
outputs=[text_input, country_dropdown]
)
example_en.click(
fn=load_en_example,
outputs=[text_input, country_dropdown]
)
example_de.click(
fn=load_de_example,
outputs=[text_input, country_dropdown]
)
example_translate.click(
fn=load_translate_example,
outputs=[text_input, country_dropdown, translate_checkbox]
)
# Translation UI toggle
translate_checkbox.change(
fn=toggle_translation_ui,
inputs=[translate_checkbox],
outputs=[translate_btn, translated_text_output]
)
# Translation button
translate_btn.click(
fn=translate_text_interface,
inputs=[text_input, voice_dropdown],
outputs=[translated_text_output]
)
# Helper function to extract format and process Voice Studio
def process_voice_studio(original_text, translated_text, translate_enabled, voice_selection, speed, format_choice):
"""Process Voice Studio with format support"""
# Extract format from dropdown
if "Markdown" in format_choice:
text_format = "md"
elif "Word" in format_choice:
text_format = "docx"
else:
text_format = "txt"
return create_voice_with_translation(original_text, translated_text, translate_enabled, voice_selection, speed, text_format)
# Generate voice with translation support
generate_btn.click(
fn=process_voice_studio,
inputs=[text_input, translated_text_output, translate_checkbox, voice_dropdown, speed_slider, voice_studio_format_dropdown],
outputs=[audio_output_vs, voice_studio_text_output]
)
# Audio Translation Tab
with gr.TabItem("🎙️ Audio Translation"):
# Colorful feature cards like Voice Studio
gr.HTML("""
🎤 Ghi âm
Microphone
Real-time
📁 Upload
Audio Files
WAV • MP3
🔄 AI Dịch
13 ngôn ngữ
Gemini 2.0
🎵 Tổng hợp
Neural TTS
26 giọng
""")
# Input section with colorful design
gr.HTML("""
🎤 Tải lên file audio hoặc ghi âm trực tiếp
Hỗ trợ file WAV, MP3 hoặc ghi âm real-time qua microphone
""")
# Enhanced microphone permission notice and controls
if not (os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID")):
gr.HTML("""
🎤 Microphone Ready - Bạn có thể ghi âm trực tiếp
🎤 Microphone Access Required
Để sử dụng ghi âm, vui lòng cho phép truy cập microphone. 🔗 Mở cửa sổ mới
⚠️ Iframe Restriction
Microphone có thể bị hạn chế trong iframe.
Mở trong cửa sổ mới
để sử dụng đầy đủ tính năng.
""")
else:
# Production mode - simple microphone notice
gr.HTML('''
📎 Upload audio file or use microphone
''')
audio_input = gr.Audio(
label="📎 Tải lên file audio hoặc ghi âm trực tiếp",
type="numpy", # Use numpy to avoid temp file issues
sources=["upload", "microphone"],
show_label=True,
interactive=True,
elem_id="audio-input-translation"
)
# Audio Recording Control Buttons
with gr.Row():
save_recording_btn = gr.Button(
"💾 Save Recording",
variant="secondary",
size="sm"
)
new_recording_btn = gr.Button(
"🎙️ New Record",
variant="primary",
size="sm"
)
# Button descriptions
gr.HTML("""
💾 Lưu file audio hiện tại vào record_data
🎙️ Xóa audio hiện tại để ghi âm mới
""")
# Status for recording actions
recording_status = gr.HTML(
value="""
Sẵn sàng ghi âm hoặc tải lên file
"""
)
# === RECORDED FILES FUNCTIONS ===
def refresh_recorded_files():
"""Refresh the list of recorded files"""
files = get_recorded_files()
print(f"🔄 Refreshing dropdown - found files: {files}")
return gr.Dropdown(choices=files, value=None)
def load_recorded_file(filename):
"""Load selected recorded file for playback"""
print(f"🎵 Loading recorded file: {filename}")
if filename and filename.strip():
file_path = get_recorded_file_path(filename)
print(f"📁 Full path: {file_path}")
if os.path.exists(file_path):
file_size = os.path.getsize(file_path)
print(f"✅ File exists, size: {file_size} bytes")
try:
# Load audio as numpy array for Gradio compatibility
import soundfile as sf
audio_data, sample_rate = sf.read(file_path)
print(f"🎵 Loaded audio: shape={audio_data.shape}, sr={sample_rate}")
# Return tuple (sample_rate, audio_data) for Gradio numpy type
return (sample_rate, audio_data)
except Exception as e:
print(f"❌ Error loading audio: {e}")
return None
else:
print(f"❌ File not found: {file_path}")
print(f"📁 Directory contents: {os.listdir(os.path.dirname(file_path)) if os.path.exists(os.path.dirname(file_path)) else 'Directory not found'}")
else:
print("❌ No filename provided")
return None
def use_recorded_for_translation(filename, country, voice, fmt):
"""Use selected recorded file for translation"""
print(f"🔄 Using recorded file for translation: {filename}")
if filename and filename.strip():
file_path = get_recorded_file_path(filename)
print(f"📁 Translation file path: {file_path}")
if os.path.exists(file_path):
print(f"✅ Starting translation for: {filename}")
# Use the same translation function
return translate_audio(file_path, country, voice, get_format_from_dropdown(fmt))
else:
print(f"❌ File not found for translation: {file_path}")
# Return empty results if no file selected
print("❌ No file selected for translation")
return "", "", "", "", None, "", "", None
def prepare_recorded_file_download(filename):
"""Prepare recorded file for download"""
print(f"📥 Preparing download for: {filename}")
if filename and filename.strip():
file_path = get_recorded_file_path(filename)
print(f"📁 Download file path: {file_path}")
if os.path.exists(file_path):
print(f"✅ File ready for download: {filename}")
return file_path
else:
print(f"❌ Download file not found: {file_path}")
print("❌ No file selected for download")
return None
def save_current_recording(audio_file):
"""Save current audio recording to record_data"""
if audio_file is None:
current_files = get_recorded_files()
return (
"
❌ Không có file audio để lưu
",
gr.Dropdown(choices=current_files, value=None)
)
try:
saved_path = save_recorded_audio(audio_file)
if saved_path:
saved_filename = os.path.basename(saved_path)
# Get updated file list after saving
updated_files = get_recorded_files()
print(f"🔄 After save - updated files: {updated_files}")
return (
f"""✅ Đã lưu file: {saved_filename}""",
gr.Dropdown(choices=updated_files, value=None)
)
# Save failed
return (
"""❌ Không thể lưu file audio""",
gr.Dropdown(choices=get_recorded_files(), value=None)
)
except Exception as e:
print(f"❌ Error saving recording: {e}")
return (
f"""❌ Lỗi khi lưu file: {str(e)}""",
gr.Dropdown(choices=get_recorded_files(), value=None)
)
def clear_audio_for_new_recording():
"""Clear audio input for new recording"""
return (
None, # Clear audio input
"
🎙️ Sẵn sàng ghi âm mới
"
)
def delete_selected_file(filename):
"""Delete selected file and refresh dropdown"""
if not filename or not filename.strip():
current_files = get_recorded_files()
return (
"
❌ Vui lòng chọn file để xóa
",
gr.Dropdown(choices=current_files, value=None),
None # Clear audio player
)
# Delete the file
delete_result = delete_recorded_file(filename)
# Refresh file list
updated_files = get_recorded_files()
# Build status message (delete_result already carries a ✅/❌ marker)
status_html = f"""{delete_result}"""
return (
status_html,
gr.Dropdown(choices=updated_files, value=None),
None # Clear audio player
)
# Recorded Files Management Section
with gr.Accordion("🎤 File đã ghi âm", open=False):
gr.HTML("""
📁 Quản lý file đã ghi
Chọn file từ danh sách để phát lại hoặc dịch thuật
""")
# Refresh button for recorded files
refresh_files_btn = gr.Button(
"🔄 Làm mới danh sách",
variant="secondary",
size="sm"
)
# Status display for file operations
file_operation_status = gr.HTML(
value="
Chọn file để thực hiện thao tác
"
)
# Dropdown for recorded files
initial_files = get_recorded_files()
print(f"🔍 Initial recorded files: {initial_files}")
recorded_files_dropdown = gr.Dropdown(
choices=initial_files,
label="📂 Chọn file đã ghi",
info="Các file audio đã được ghi âm trước đó"
)
# Preview and controls for selected file
with gr.Row():
with gr.Column():
# Audio player for selected file
recorded_audio_player = gr.Audio(
label="🎵 Phát lại file đã chọn",
interactive=False,
show_label=True,
type="numpy" # Use numpy for better compatibility
)
with gr.Column():
# Action buttons
use_for_translation_btn = gr.Button(
"🔄 Sử dụng để dịch thuật",
variant="primary",
size="sm"
)
with gr.Row():
download_recorded_btn = gr.Button(
"📥 Tải xuống",
variant="secondary",
size="sm"
)
delete_recorded_btn = gr.Button(
"🗑️ Xóa file",
variant="stop",
size="sm"
)
# Download link for recorded file
download_recorded_file = gr.File(
label="📥 File tải xuống",
visible=True,
file_count="single"
)
# Settings section with gradient header
gr.HTML("""
🌍 Cài đặt dịch thuật
Chọn ngôn ngữ đích và giọng nói cho kết quả dịch thuật
""")
# Separate dropdowns without complex wrappers to avoid CSS conflicts
target_country_dropdown = gr.Dropdown(
choices=list(voice_choices_by_country.keys()),
value="🇻🇳 Việt Nam",
label="🌍 Chọn quốc gia đích"
)
target_voice_dropdown = gr.Dropdown(
choices=voice_choices_by_country["🇻🇳 Việt Nam"],
value="🇻🇳 HoaiMy - Nữ Việt Chuẩn",
label="🎭 Chọn giọng nói đích"
)
text_format_dropdown = gr.Dropdown(
choices=["Markdown (.md)", "TXT (.txt)", "Word (.docx)"] if DOCX_AVAILABLE else ["Markdown (.md)", "TXT (.txt)"],
value="Markdown (.md)",
label="📄 Định dạng file văn bản"
)
# Colorful action button
gr.HTML("""
""")
# Auto-translate on audio upload - no manual button needed
# Results section with colorful headers
gr.HTML("""
📊 Kết quả xử lý
Phiên âm, dịch thuật và tổng hợp giọng nói
""")
# Dynamic status indicator
status_text = gr.HTML("")
# Card-based layout for mobile
with gr.Column(elem_classes=["output-area"]):
# Original content card
gr.HTML("""
📝 Nội dung gốc từ audio
""")
transcription_output = gr.Textbox(
label="🎯 Phiên âm từ audio",
lines=4,
interactive=False,
placeholder="Nội dung phiên âm từ file audio sẽ hiển thị ở đây...",
elem_classes=["mobile-textbox"]
)
detected_language = gr.Textbox(
label="🌐 Ngôn ngữ được phát hiện",
lines=1,
interactive=False,
placeholder="Tự động nhận diện...",
elem_classes=["mobile-textbox"]
)
# Translation result card
gr.HTML("""
✨ Kết quả dịch thuật
""")
translation_output = gr.Textbox(
label="🔄 Nội dung đã dịch",
lines=4,
interactive=False,
placeholder="Bản dịch sẽ hiển thị ở đây...",
elem_classes=["mobile-textbox"]
)
target_language_display = gr.Textbox(
label="🎯 Ngôn ngữ đích",
lines=1,
interactive=False,
placeholder="Chưa chọn...",
elem_classes=["mobile-textbox"]
)
# Mobile-friendly comparison section
with gr.Accordion("🔍 So sánh nội dung", open=False):
gr.HTML("""
Xem nội dung gốc và bản dịch để so sánh
""")
# Stack vertically on mobile for better readability
with gr.Column():
gr.HTML("""
📝 Ngôn ngữ gốc
""")
original_compare = gr.Textbox(
label="",
lines=4,
interactive=False,
show_label=False,
placeholder="Nội dung phiên âm từ audio sẽ hiển thị ở đây...",
elem_classes=["mobile-compare"]
)
gr.HTML("""
✨ Sau khi dịch
""")
translated_compare = gr.Textbox(
label="",
lines=4,
interactive=False,
show_label=False,
placeholder="Nội dung sau khi dịch sẽ hiển thị ở đây...",
elem_classes=["mobile-compare"]
)
# Mobile-optimized download section
with gr.Accordion("💾 Tải xuống kết quả", open=True):
gr.HTML("""
💾 Tải xuống kết quả
File audio và văn bản đã dịch
""")
# Stack downloads vertically for mobile
with gr.Column():
gr.HTML("""
🔊 Audio đã dịch
""")
audio_output_at = gr.Audio(
label="🎵 Audio đã dịch",
type="filepath",
show_label=True,
elem_classes=["mobile-audio"],
format="wav" # Specify format explicitly
)
# Explicit download component for translated audio
audio_download_at = gr.File(
label="📥 Tải xuống audio đã dịch",
file_count="single",
file_types=[".wav"],
visible=True
)
gr.HTML("""
📄 Văn bản đã dịch
""")
text_output = gr.File(
label="",
file_count="single",
file_types=[".txt", ".docx"],
show_label=False,
elem_classes=["mobile-file"]
)
# Event handlers for Audio Translation with colorful status
def update_status_processing():
return """
⚡ Đang tự động dịch thuật...
"""
def update_status_complete():
return """
✅ Dịch thuật hoàn thành!
"""
target_country_dropdown.change(
fn=update_voices,
inputs=[target_country_dropdown],
outputs=[target_voice_dropdown]
)
# Update target language display when dropdown changes
target_voice_dropdown.change(
fn=lambda voice: voice,
inputs=[target_voice_dropdown],
outputs=[target_language_display]
)
# Helper function to extract format
def get_format_from_dropdown(format_choice):
if "Markdown" in format_choice:
return "md"
elif "Word" in format_choice:
return "docx"
return "txt"
# Auto-translate when audio is uploaded or changed
audio_input.change(
fn=lambda: update_status_processing(),
outputs=[status_text]
).then(
fn=lambda audio, country, voice, fmt: translate_audio(audio, country, voice, get_format_from_dropdown(fmt)) if audio is not None else ("", "", "📎 Vui lòng tải lên file audio hoặc ghi âm", country, None, None, "", "", None),
inputs=[audio_input, target_country_dropdown, target_voice_dropdown, text_format_dropdown],
outputs=[
transcription_output,
detected_language,
translation_output,
target_language_display,
audio_output_at,
audio_download_at,
original_compare,
translated_compare,
text_output
]
).then(
fn=lambda: update_status_complete(),
outputs=[status_text]
).then(
fn=refresh_recorded_files,
outputs=[recorded_files_dropdown]
)
# === RECORDED FILES EVENT HANDLERS ===
# Save current recording
save_recording_btn.click(
fn=save_current_recording,
inputs=[audio_input],
outputs=[recording_status, recorded_files_dropdown]
)
# New recording (clear audio)
new_recording_btn.click(
fn=clear_audio_for_new_recording,
outputs=[audio_input, recording_status]
)
refresh_files_btn.click(
fn=refresh_recorded_files,
outputs=[recorded_files_dropdown]
)
recorded_files_dropdown.change(
fn=load_recorded_file,
inputs=[recorded_files_dropdown],
outputs=[recorded_audio_player]
)
use_for_translation_btn.click(
fn=lambda: update_status_processing(),
outputs=[status_text]
).then(
fn=use_recorded_for_translation,
inputs=[recorded_files_dropdown, target_country_dropdown, target_voice_dropdown, text_format_dropdown],
outputs=[
transcription_output,
detected_language,
translation_output,
target_language_display,
audio_output_at,
audio_download_at,
original_compare,
translated_compare,
text_output
]
).then(
fn=lambda: update_status_complete(),
outputs=[status_text]
).then(
fn=refresh_recorded_files,
outputs=[recorded_files_dropdown]
)
download_recorded_btn.click(
fn=prepare_recorded_file_download,
inputs=[recorded_files_dropdown],
outputs=[download_recorded_file]
)
delete_recorded_btn.click(
fn=delete_selected_file,
inputs=[recorded_files_dropdown],
outputs=[file_operation_status, recorded_files_dropdown, recorded_audio_player]
)
# Features section for Voice RAG
gr.Markdown("### 📚 Tính năng chính")
with gr.Row():
with gr.Column():
gr.HTML("""
📚 Voice RAG
Upload tài liệu và đặt câu hỏi. Nhận trả lời bằng giọng nói đa ngôn ngữ.
✓ Hỗ trợ PDF, DOCX, TXT
✓ AI Gemini 2.0 Flash
✓ 24 giọng nói đa quốc gia
""")
with gr.Column():
gr.HTML("""
🌍 Audio Translation
Dịch thuật âm thanh sang nhiều ngôn ngữ với giọng nói tự nhiên.
✓ Ghi âm real-time
✓ 13 ngôn ngữ chính
✓ Edge TTS Neural
""")
with gr.Row():
with gr.Column():
gr.HTML("""
🎤 Voice Studio
Chuyển văn bản thành giọng nói với nhiều lựa chọn quốc gia và giọng nói.
✓ 13 quốc gia
✓ Tích hợp dịch thuật
✓ Điều chỉnh tốc độ
""")
# Footer
gr.HTML("""
""")
# Add JavaScript for button effects
gr.HTML(js_code)
if __name__ == "__main__":
import sys
import locale
import os
# Ensure UTF-8 encoding
if sys.platform == 'win32':
os.environ['PYTHONIOENCODING'] = 'utf-8'
# Optimize startup for HF Spaces
print(f"===== Application Startup at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====")
# Only create record_data directory when actually needed to speed up startup
if not os.environ.get("SPACE_ID") and not os.environ.get("HF_SPACE_ID"):
create_record_data_directory()
print(f"📁 Record data directory ready: {RECORD_DATA_DIR}")
else:
print(f"🏭 Production mode - record_data will be created on first use")
# Set environment variables for iframe support
os.environ['GRADIO_ALLOW_FLAGGING'] = 'never'
# Disable Gradio temp directory to prevent file serving issues
# os.environ['GRADIO_TEMP_DIR'] = '/tmp'
# Hugging Face Spaces configuration - Use standard port 7860 for HF
if os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID"):
# HF Spaces standard configuration
port = 7860
print("🏭 Using HF Spaces standard port 7860")
else:
# Local development
port = int(os.environ.get("GRADIO_SERVER_PORT", 7880))
print(f"🖥️ Using local development port {port}")
demo.launch(
server_name="0.0.0.0",
server_port=port,
share=False,
show_error=True,
ssr_mode=False, # Disable SSR to prevent timeout issues on HF Spaces
enable_monitoring=False # Disable monitoring for faster startup
)