Spaces:
Running
Running
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
import os | |
import sys | |
# Set UTF-8 encoding for Windows | |
if sys.platform == 'win32':
    # Switch the standard streams to UTF-8 on Windows (this module prints
    # emoji and Vietnamese text).
    import codecs

    for _stream_name in ('stdout', 'stderr'):
        _stream = getattr(sys, _stream_name)
        if _stream is None:
            # No stream attached (e.g. started without a console): nothing to do.
            continue
        if hasattr(_stream, 'reconfigure'):
            # Re-encode in place so the stream object itself (and anything
            # already holding a reference to it) stays valid.
            _stream.reconfigure(encoding='utf-8')
        elif hasattr(_stream, 'detach'):
            # Fallback for stream objects without reconfigure().
            setattr(sys, _stream_name, codecs.getwriter('utf-8')(_stream.detach()))
# Load environment variables from .env file (optimized for HF Spaces) | |
try:
    # On a Hugging Face Space (SPACE_ID / HF_SPACE_ID set) the platform already
    # supplies the environment; the .env file is only read for local runs.
    if os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID"):
        print("🏭 Production environment - using system environment variables")
    else:
        from dotenv import load_dotenv
        load_dotenv()
        print("✅ Environment variables loaded from .env file")
except ImportError:
    # python-dotenv is optional; fall back to the process environment.
    print("⚠️ python-dotenv not installed. Using system environment variables only.")
except Exception as dotenv_error:
    print(f"⚠️ Error loading .env file: {dotenv_error}")
# Essential imports for HF Spaces | |
import numpy as np | |
import gradio as gr | |
# Try to import google-generativeai with fallback | |
try: | |
import google.generativeai as genai | |
GENAI_AVAILABLE = True | |
except ImportError as e: | |
print(f"⚠️ google-generativeai not available: {e}") | |
GENAI_AVAILABLE = False | |
genai = None | |
try: | |
from gtts import gTTS, lang | |
GTTS_AVAILABLE = True | |
except ImportError as e: | |
print(f"⚠️ gtts not available: {e}") | |
GTTS_AVAILABLE = False | |
import tempfile | |
# import soundfile as sf # Import locally to avoid startup overhead | |
# Kokoro not used - removed for performance | |
import time | |
import base64 | |
# Try to import optional dependencies | |
try: | |
import edge_tts | |
EDGE_TTS_AVAILABLE = True | |
except ImportError as e: | |
print(f"⚠️ edge-tts not available: {e}") | |
EDGE_TTS_AVAILABLE = False | |
import asyncio | |
import io | |
try: | |
import PyPDF2 | |
PDF_AVAILABLE = True | |
except ImportError: | |
PDF_AVAILABLE = False | |
try: | |
import docx | |
DOCX_AVAILABLE = True | |
except ImportError: | |
DOCX_AVAILABLE = False | |
import shutil | |
import atexit | |
import glob | |
import datetime | |
# Librosa not used - removed for performance | |
# === RECORD DATA MANAGEMENT === | |
RECORD_DATA_DIR = "record_data"


def create_record_data_directory():
    """Ensure the recordings directory exists and return its path.

    The directory is created directly and an already-existing path is
    tolerated, instead of testing for existence first. Two concurrent calls
    can therefore no longer race between the check and the creation.

    Returns:
        The value of ``RECORD_DATA_DIR``.
    """
    try:
        os.makedirs(RECORD_DATA_DIR)
    except FileExistsError:
        # Already present (possibly created by a concurrent call): nothing to do.
        pass
    else:
        print(f"✅ Created directory: {RECORD_DATA_DIR}")
    return RECORD_DATA_DIR
def cleanup_record_data():
    """Remove the recordings directory, except on a Hugging Face Space.

    When ``SPACE_ID`` or ``HF_SPACE_ID`` is set the directory is kept and only
    a message is printed. Otherwise the directory tree is deleted if it
    exists. Any error is reported on stdout and not propagated.
    """
    try:
        running_in_production = os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID")
        if running_in_production:
            print(f"🏭 Production environment detected - keeping {RECORD_DATA_DIR} directory")
        elif os.path.exists(RECORD_DATA_DIR):
            # Local development only: wipe everything that was recorded.
            shutil.rmtree(RECORD_DATA_DIR)
            print(f"🧹 Cleaned up {RECORD_DATA_DIR} directory")
    except Exception as cleanup_error:
        print(f"⚠️ Error cleaning up {RECORD_DATA_DIR}: {cleanup_error}")
def save_recorded_audio(audio_data, original_filename=None):
    """Store audio under ``RECORD_DATA_DIR`` and return the new file path.

    Args:
        audio_data: One of
            * a path (``str``) to an existing file, which is copied;
            * a ``(sample_rate, audio_array)`` tuple, written with soundfile;
            * anything else, written verbatim as raw bytes.
        original_filename: Optional name whose stem is embedded in the
            generated file name.

    Returns:
        Path of the saved ``.wav`` file, or ``None`` if saving failed (the
        error and traceback are printed).
    """
    try:
        create_record_data_directory()

        # File name: recorded_[<stem>_]<YYYYmmdd_HHMMSS>.wav
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        if original_filename:
            name_part = os.path.splitext(os.path.basename(original_filename))[0]
            filename = f"recorded_{name_part}_{timestamp}.wav"
        else:
            filename = f"recorded_{timestamp}.wav"
        filepath = os.path.join(RECORD_DATA_DIR, filename)

        if isinstance(audio_data, str) and os.path.exists(audio_data):
            # Existing file on disk: copy it.
            shutil.copy2(audio_data, filepath)
        elif isinstance(audio_data, tuple) and len(audio_data) == 2:
            # (sample_rate, audio_array) pair.
            sample_rate, audio_array = audio_data
            import soundfile as sf  # imported lazily, as elsewhere in this module
            sf.write(filepath, audio_array, sample_rate)
            print(f"📊 Saved numpy audio: sr={sample_rate}, shape={audio_array.shape}")
        else:
            # Raw data. open() creates the file before write() validates the
            # payload, so a failed write (e.g. a str path that does not exist)
            # would leave a zero-byte recording behind; remove it and re-raise.
            try:
                with open(filepath, 'wb') as f:
                    f.write(audio_data)
            except Exception:
                if os.path.exists(filepath):
                    os.remove(filepath)
                raise

        print(f"✅ Saved recorded audio: {filepath}")
        return filepath
    except Exception as e:
        print(f"❌ Error saving recorded audio: {e}")
        import traceback
        traceback.print_exc()
        return None
def get_recorded_files():
    """List the ``.wav`` recordings in ``RECORD_DATA_DIR``.

    Returns:
        Base file names sorted by modification time, newest first. An empty
        list is returned when the directory is missing or an error occurs.
    """
    try:
        if not os.path.exists(RECORD_DATA_DIR):
            print(f"📁 Record directory does not exist: {RECORD_DATA_DIR}")
            return []

        wav_paths = glob.glob(os.path.join(RECORD_DATA_DIR, "*.wav"))
        print(f"🔍 Found {len(wav_paths)} files in {RECORD_DATA_DIR}")

        # Most recently modified first; only the bare names are exposed.
        newest_first = sorted(wav_paths, key=os.path.getmtime, reverse=True)
        filenames = list(map(os.path.basename, newest_first))
        print(f"📂 Returning filenames: {filenames}")
        return filenames
    except Exception as listing_error:
        print(f"❌ Error getting recorded files: {listing_error}")
        return []
def get_recorded_file_path(filename):
    """Return ``filename`` joined onto ``RECORD_DATA_DIR``."""
    full_path = os.path.join(RECORD_DATA_DIR, filename)
    return full_path
def delete_recorded_file(filename):
    """Delete one recording from ``RECORD_DATA_DIR``.

    Args:
        filename: Bare file name of the recording. Names containing a path
            separator, a drive prefix, or equal to ``.``/``..`` are rejected,
            so the deletion cannot escape the recordings directory.

    Returns:
        A user-facing status message (Vietnamese) describing the outcome.
        Errors are reported in the message and never raised.
    """
    try:
        if not filename or not filename.strip():
            return "❌ Không có file nào được chọn để xóa"

        # Security: the name comes from the caller; only a plain file name is
        # accepted so os.path.join() cannot resolve outside RECORD_DATA_DIR.
        has_path_parts = (
            os.path.basename(filename) != filename
            or "/" in filename
            or "\\" in filename
        )
        if filename in (".", "..") or has_path_parts:
            print(f"❌ Rejected unsafe filename: {filename}")
            return f"❌ Tên file không hợp lệ: {filename}"

        file_path = get_recorded_file_path(filename)
        print(f"🗑️ Attempting to delete: {file_path}")
        if os.path.exists(file_path):
            os.remove(file_path)
            print(f"✅ Successfully deleted: {filename}")
            return f"✅ Đã xóa file: {filename}"
        print(f"❌ File not found: {file_path}")
        return f"❌ Không tìm thấy file: {filename}"
    except Exception as e:
        print(f"❌ Error deleting file: {e}")
        return f"❌ Lỗi khi xóa file: {str(e)}"
# Register cleanup function to run when app exits (disabled for stability) | |
# atexit.register(cleanup_record_data) # Disabled to prevent data loss on deployment | |
# DOCX support already checked above | |
# Configure Gemini API - Delayed configuration for faster startup | |
GEMINI_API_KEY = None


def configure_gemini_api():
    """Configure the Gemini client on first use and return the API key.

    The key is read from ``GEMINI_API_KEY`` or, failing that,
    ``GOOGLE_API_KEY``, and cached in the module-level ``GEMINI_API_KEY`` so
    later calls skip the lookup.

    Returns:
        The API key, or ``None`` when google-generativeai is not installed or
        no key is present in the environment.
    """
    global GEMINI_API_KEY

    if not GENAI_AVAILABLE:
        print("❌ google-generativeai not available")
        return None

    # Already resolved by an earlier call.
    if GEMINI_API_KEY is not None:
        return GEMINI_API_KEY

    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
    if not GEMINI_API_KEY:
        print("⚠️ GEMINI_API_KEY or GOOGLE_API_KEY not found in environment variables")
    else:
        genai.configure(api_key=GEMINI_API_KEY)
        print("✅ Gemini API configured successfully")
    return GEMINI_API_KEY
# Language configurations for Audio Translation (simplified) | |
# Language code -> display name. Taken from gTTS when it is installed,
# otherwise a two-language fallback table.
GTTS_LANGUAGES = lang.tts_langs() if GTTS_AVAILABLE else {'en': 'English', 'vi': 'Vietnamese'}
if GTTS_AVAILABLE:
    # Make sure Japanese is always offered.
    GTTS_LANGUAGES['ja'] = 'Japanese'

# Alphabetically sorted display names.
SUPPORTED_LANGUAGES = sorted(GTTS_LANGUAGES.values())
# Voice mapping for Edge TTS - defined once for performance | |
# UI voice label -> Edge TTS voice identifier.
VOICE_MAP = {
    # Vietnamese
    "🇻🇳 HoaiMy - Nữ Việt Chuẩn": "vi-VN-HoaiMyNeural",
    "🇻🇳 NamMinh - Nam Việt Chuẩn": "vi-VN-NamMinhNeural",
    # English (US / GB)
    "🇺🇸 Aria - Nữ Mỹ": "en-US-AriaNeural",
    "🇺🇸 Guy - Nam Mỹ": "en-US-GuyNeural",
    "🇬🇧 Sonia - Nữ Anh": "en-GB-SoniaNeural",
    "🇬🇧 Ryan - Nam Anh": "en-GB-RyanNeural",
    # German
    "🇩🇪 Katja - Deutsche Frau": "de-DE-KatjaNeural",
    "🇩🇪 Conrad - Deutscher Mann": "de-DE-ConradNeural",
    # French
    "🇫🇷 Denise - Française": "fr-FR-DeniseNeural",
    "🇫🇷 Henri - Français": "fr-FR-HenriNeural",
    # Spanish
    "🇪🇸 Elvira - Española": "es-ES-ElviraNeural",
    "🇪🇸 Alvaro - Español": "es-ES-AlvaroNeural",
    # Italian
    "🇮🇹 Elsa - Italiana": "it-IT-ElsaNeural",
    "🇮🇹 Diego - Italiano": "it-IT-DiegoNeural",
    # Japanese
    "🇯🇵 Nanami - 日本女性": "ja-JP-NanamiNeural",
    "🇯🇵 Keita - 日本男性": "ja-JP-KeitaNeural",
    # Korean
    "🇰🇷 SunHi - 한국 여성": "ko-KR-SunHiNeural",
    "🇰🇷 BongJin - 한국 남성": "ko-KR-BongJinNeural",
    # Chinese
    "🇨🇳 Xiaoxiao - 中文女声": "zh-CN-XiaoxiaoNeural",
    "🇨🇳 Yunxi - 中文男声": "zh-CN-YunxiNeural",
    # Russian
    "🇷🇺 Svetlana - Русская": "ru-RU-SvetlanaNeural",
    "🇷🇺 Dmitry - Русский": "ru-RU-DmitryNeural",
    # Portuguese (labels show the PT flag; the voices are pt-BR)
    "🇵🇹 Francisca - Portuguesa": "pt-BR-FranciscaNeural",
    "🇵🇹 Antonio - Português": "pt-BR-AntonioNeural",
    # Arabic
    "🇸🇦 Zariyah - عربية": "ar-SA-ZariyahNeural",
    "🇸🇦 Hamed - عربي": "ar-SA-HamedNeural",
}
# Voice RAG functions (integrated from hf_Voice_Audio_Translation)
def read_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The concatenated page text, or a message starting with
        ``"Error reading PDF:"`` when PyPDF2 is missing or reading fails.
        Callers in this module recognise failures by that ``"Error"`` prefix.
    """
    try:
        if not PDF_AVAILABLE:
            # Mirrors the availability check in read_docx, but keeps the
            # "Error" prefix so the failure is detected downstream.
            return "Error reading PDF: PyPDF2 not available"
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # A page without extractable text contributes an empty string.
            return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        return f"Error reading PDF: {str(e)}"
def read_docx(file_path):
    """Extract the paragraph text of a Word document.

    Args:
        file_path: Path of the ``.docx`` file.

    Returns:
        Every paragraph followed by a newline, concatenated. When python-docx
        is not installed a "❌" message is returned; any other failure yields
        a message starting with ``"Error reading DOCX:"``.
    """
    try:
        if not DOCX_AVAILABLE:
            return "❌ python-docx not available"
        document = docx.Document(file_path)
        return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
    except Exception as read_error:
        return f"Error reading DOCX: {str(read_error)}"
def read_txt(file_path):
    """Read a UTF-8 text file.

    Args:
        file_path: Path of the text file.

    Returns:
        The file content, or a message starting with ``"Error reading TXT:"``
        when the file cannot be opened or decoded.
    """
    try:
        # 'utf-8-sig' decodes plain UTF-8 and also strips a leading BOM, so
        # files saved by Windows editors do not start with a stray U+FEFF.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            return file.read()
    except Exception as e:
        return f"Error reading TXT: {str(e)}"
def extract_text_from_file(file_path):
    """Dispatch to the reader that matches the file extension.

    Args:
        file_path: Path of the uploaded document, or ``None``.

    Returns:
        The extracted text, ``"No file uploaded"`` for ``None``, or an
        ``"Unsupported file format: <ext>"`` message for any extension other
        than ``.pdf``, ``.docx`` or ``.txt`` (compared case-insensitively).
    """
    if file_path is None:
        return "No file uploaded"

    _, raw_extension = os.path.splitext(file_path)
    file_extension = raw_extension.lower()

    if file_extension == '.pdf':
        return read_pdf(file_path)
    if file_extension == '.docx':
        return read_docx(file_path)
    if file_extension == '.txt':
        return read_txt(file_path)
    return f"Unsupported file format: {file_extension}"
# Character and stop-word tables used by detect_language_from_text.
_DOC_VI_CHARS = 'àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ'
_DOC_FR_CHARS = 'àâäéèêëïîôöùûüÿç'
_DOC_DE_CHARS = 'äöüß'
_DOC_ES_CHARS = 'ñáéíóúü'
# Letters used by Vietnamese but by none of the other Latin-script languages
# handled here; shared accents such as 'é' or 'à' say nothing on their own.
_DOC_VI_ONLY_CHARS = frozenset(_DOC_VI_CHARS) - frozenset(
    _DOC_FR_CHARS + _DOC_DE_CHARS + _DOC_ES_CHARS
)
_DOC_VI_WORDS = frozenset(['và', 'của', 'là', 'có', 'này', 'được', 'cho', 'từ', 'một', 'những', 'tôi', 'bạn'])
_DOC_FR_WORDS = frozenset(['le', 'la', 'les', 'de', 'et', 'à', 'un', 'une', 'ce', 'qui', 'que'])
_DOC_DE_WORDS = frozenset(['der', 'die', 'das', 'und', 'ist', 'ich', 'bin', 'haben', 'sein', 'werden'])
_DOC_ES_WORDS = frozenset(['el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'es', 'se', 'no', 'te', 'lo'])
_DOC_EN_WORDS = frozenset(['the', 'and', 'is', 'are', 'have', 'has', 'will', 'would', 'can', 'could'])


def detect_language_from_text(text):
    """Guess the language of a document with lightweight heuristics.

    Detection order:
      1. Any Vietnamese-only letter (e.g. ``đ``, ``ư``, ``ạ``) -> Vietnamese.
      2. Script: kana -> Japanese, Hangul -> Korean, Han ideographs -> Chinese
         (kana is checked first because Japanese text also contains kanji).
      3. Whole-word stop-word counts plus a half point per language-specific
         accented letter; the highest score wins and ties favour English.

    Stop-words are matched as whole words. Matching them as substrings made
    fragments such as 'le', 'de' or 'a' fire on almost any English text.

    Args:
        text: Document text. ``None`` or an empty string is allowed.

    Returns:
        One of "Vietnamese", "Chinese", "Japanese", "Korean", "French",
        "German", "Spanish" or "English" (also the fallback).
    """
    if not text:
        return "English"

    lowered = text.lower()

    if any(ch in _DOC_VI_ONLY_CHARS for ch in lowered):
        return "Vietnamese"

    has_hangul = False
    has_han = False
    for ch in lowered:
        code = ord(ch)
        if 0x3040 <= code <= 0x30FF:  # Hiragana + Katakana
            return "Japanese"
        if 0xAC00 <= code <= 0xD7A3 or 0x1100 <= code <= 0x11FF or 0x3130 <= code <= 0x318F:
            has_hangul = True  # Hangul syllables / jamo
        elif 0x4E00 <= code <= 0x9FFF or 0x3400 <= code <= 0x4DBF:
            has_han = True  # CJK unified ideographs (+ extension A)
    if has_hangul:
        return "Korean"
    if has_han:
        return "Chinese"

    # Split on anything that is not a letter so only whole words are compared.
    tokens = "".join(ch if ch.isalpha() else " " for ch in lowered).split()

    def word_hits(words):
        return sum(1 for token in tokens if token in words)

    def char_hits(chars):
        return sum(1 for ch in lowered if ch in chars)

    # English is listed first so that it wins ties, including the all-zero case.
    scores = [
        ("English", word_hits(_DOC_EN_WORDS)),
        ("Vietnamese", 2 * word_hits(_DOC_VI_WORDS)),
        ("French", word_hits(_DOC_FR_WORDS) + 0.5 * char_hits(_DOC_FR_CHARS)),
        ("German", word_hits(_DOC_DE_WORDS) + 0.5 * char_hits(_DOC_DE_CHARS)),
        ("Spanish", word_hits(_DOC_ES_WORDS) + 0.5 * char_hits(_DOC_ES_CHARS)),
    ]
    best_language, _ = max(scores, key=lambda entry: entry[1])
    return best_language
def process_with_gemini(text, question, answer_language="Vietnamese"):
    """Ask Gemini a question about a document.

    Args:
        text: Document content placed in the prompt.
        question: The user's question.
        answer_language: Language the model is asked to answer in.

    Returns:
        The model's answer text. When no API key is configured a "❌ Lỗi:"
        message is returned; any exception yields a message starting with
        ``"Error processing with Gemini:"``.
    """
    try:
        if not configure_gemini_api():
            return "❌ Lỗi: Chưa cấu hình GEMINI_API_KEY hoặc GOOGLE_API_KEY trong environment variables"

        gemini_model = genai.GenerativeModel("gemini-2.0-flash")

        # The detected document language is passed to the model as a hint.
        detected_doc_language = detect_language_from_text(text)

        prompt = f"""
Based on the following document content, please answer the question in {answer_language}:
Document Content (detected language: {detected_doc_language}):
{text}
Question: {question}
Please provide a comprehensive and accurate answer in {answer_language}.
If the document is in a different language than the question, please still answer in {answer_language}.
Maintain the factual accuracy while adapting cultural context appropriately.
"""
        return gemini_model.generate_content(prompt).text
    except Exception as gemini_error:
        return f"Error processing with Gemini: {str(gemini_error)}"
def text_to_speech_rag(text, voice_selection):
    """Synthesize a RAG answer with Edge TTS and save it to a temp file.

    Args:
        text: Answer text. Only the first 2000 characters are spoken.
        voice_selection: UI voice label, looked up in ``VOICE_MAP`` (unknown
            labels fall back to ``vi-VN-HoaiMyNeural``).

    Returns:
        Path of the generated audio file, or ``None`` when there is nothing to
        speak (empty text, or an error message starting with "Error"/"❌"),
        when no audio was produced, or when synthesis failed.
    """
    try:
        # Error messages from earlier pipeline steps must not be read aloud.
        if not text or text.startswith(("Error", "❌")):
            return None

        voice_name = VOICE_MAP.get(voice_selection, "vi-VN-HoaiMyNeural")
        text_limited = text[:2000]

        audio_data = asyncio.run(generate_speech(text_limited, voice_name, 0.0))
        if not audio_data:
            print("TTS Error: no audio data returned")
            return None

        # The rest of this module serves the bytes from generate_speech as
        # audio/mpeg with an .mp3 name, so the temp file uses .mp3 as well
        # rather than mislabelling the data as .wav.
        fd, temp_output_path = tempfile.mkstemp(suffix=".mp3", prefix="voice_rag_audio_")
        with os.fdopen(fd, 'wb') as f:
            f.write(audio_data)
        return temp_output_path
    except Exception as e:
        print(f"TTS Error: {str(e)}")
        return None
def voice_rag_pipeline(uploaded_file, question, answer_language="Vietnamese", voice_selection="🇻🇳 HoaiMy - Nữ Việt Chuẩn", text_format="txt"):
    """Run the document Q&A pipeline: extract, ask Gemini, speak, export.

    Args:
        uploaded_file: Path of the uploaded document, or ``None``.
        question: Question to ask about the document.
        answer_language: Language Gemini is asked to answer in.
        voice_selection: UI voice label used for speech synthesis.
        text_format: Download format; ``"md"`` produces the formatted
            Markdown report, anything else is passed to ``create_text_file``.

    Returns:
        A 4-tuple ``(answer, detected_document_language, audio_path,
        text_file_path)``. On validation or extraction failure the first item
        is the message and the last two are ``None``.
    """
    if uploaded_file is None:
        return "Please upload a document first.", "N/A", None, None
    if not question or not question.strip():
        return "Please enter a question.", "N/A", None, None

    extracted_text = extract_text_from_file(uploaded_file)

    # extract_text_from_file signals failure through message strings. Not all
    # of them start with "Error"; every known prefix is checked so that a
    # failure message is never sent to the model as document content.
    # NOTE(review): a genuine document beginning with one of these prefixes
    # is also rejected; that limitation comes from the string-based signalling.
    failure_prefixes = ("Error", "❌", "Unsupported file format", "No file uploaded")
    if extracted_text.startswith(failure_prefixes):
        return extracted_text, "Error", None, None

    detected_doc_language = detect_language_from_text(extracted_text)

    answer = process_with_gemini(extracted_text, question, answer_language)
    audio_file = text_to_speech_rag(answer, voice_selection)

    text_format = text_format or "txt"
    if text_format.lower() == "md":
        formatted_content = format_voice_rag_response(
            question, answer, detected_doc_language, voice_selection
        )
        text_file_path = create_text_file(formatted_content, "md", "voice_rag_response")
    else:
        text_file_path = create_text_file(answer, text_format, "voice_rag_answer")

    return answer, detected_doc_language, audio_file, text_file_path
def detect_language(text):
    """Guess the language of ``text`` by scoring stop-words and accents.

    Each candidate language scores points for its stop-words found in the
    text (matched as whole words) and for its accented letters. The language
    with the highest score wins. The scores are printed for debugging.

    Stop-words are compared against whole tokens. Substring matching let
    fragments like 'he' (in "the") or 'no' (in "now") inflate the wrong
    language.

    Args:
        text: Input text; ``None`` or blank text is allowed.

    Returns:
        "unknown" for blank input, otherwise one of "Vietnamese", "English",
        "German", "French" or "Spanish" ("English" when nothing matches).
    """
    if not text or not text.strip():
        return "unknown"

    text_lower = text.lower()
    # Distinct words, split on every non-letter character.
    tokens = set("".join(ch if ch.isalpha() else " " for ch in text_lower).split())

    def word_score(words, weight=1):
        return sum(weight for word in words if word in tokens)

    def char_score(chars, weight=1):
        # Counted on the lowercased text so uppercase accents match too.
        return sum(weight for ch in text_lower if ch in chars)

    vietnamese_chars = 'àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ'
    vietnamese_words = ['và', 'của', 'là', 'có', 'này', 'được', 'cho', 'từ', 'một', 'những', 'tôi', 'bạn']
    english_words = ['the', 'and', 'is', 'are', 'have', 'has', 'will', 'would', 'can', 'could', 'that', 'this', 'with', 'for', 'you', 'he', 'she', 'it', 'they', 'we']
    german_words = ['der', 'die', 'das', 'und', 'ist', 'ich', 'bin', 'haben', 'sein', 'werden', 'mit', 'auf', 'für', 'von']
    german_chars = 'äöüß'
    french_words = ['le', 'la', 'les', 'de', 'et', 'à', 'un', 'une', 'ce', 'qui', 'que', 'avec', 'pour', 'dans']
    french_chars = 'àâäéèêëïîôöùûüÿç'
    spanish_words = ['el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'es', 'se', 'no', 'te', 'lo', 'con', 'para']
    spanish_chars = 'ñáéíóúü'

    # Weights are unchanged: Vietnamese words count double; French and Spanish
    # accents count half because they overlap with other languages.
    scores = {
        'Vietnamese': char_score(vietnamese_chars) + word_score(vietnamese_words, 2),
        'English': word_score(english_words),
        'German': word_score(german_words) + char_score(german_chars),
        'French': word_score(french_words) + char_score(french_chars, 0.5),
        'Spanish': word_score(spanish_words) + char_score(spanish_chars, 0.5),
    }

    max_score = max(scores.values())
    if max_score > 0:
        # On ties the first language in the dict order above is chosen.
        detected = max(scores, key=scores.get)
        print(f"🔍 Language detection scores: {scores}")
        print(f"🎯 Detected language: {detected} (score: {max_score})")
        return detected

    print(f"⚠️ Could not detect language, defaulting to English")
    return "English"
async def generate_speech(text, voice_name, rate):
    """Stream speech for ``text`` from Edge TTS and return it as bytes.

    Args:
        text: Text to synthesize.
        voice_name: Edge TTS voice identifier.
        rate: Speed offset as a fraction; it is formatted as a signed
            percentage (``0.5`` -> ``"+50%"``).

    Returns:
        The concatenated payload of all ``"audio"`` chunks of the stream.
    """
    rate_spec = f"{rate:+.0%}"
    communicate = edge_tts.Communicate(text, voice_name, rate=rate_spec)

    # Keep only the audio chunks; the stream also carries other chunk types.
    audio_chunks = [
        chunk["data"]
        async for chunk in communicate.stream()
        if chunk["type"] == "audio"
    ]
    return b"".join(audio_chunks)
def create_text_file(content, file_format="txt", filename_prefix="translated_text"):
    """Write ``content`` to a temporary file offered for download.

    Args:
        content: Text to store. Empty content and error messages (starting
            with "Lỗi:" or "❌") produce no file.
        file_format: "docx", "md" or "txt" (case-insensitive). Unknown
            values, ``None``, and "docx" when python-docx is unavailable fall
            back to a plain ``.txt`` file.
        filename_prefix: Prefix of the temporary file name.

    Returns:
        Path of the created file, or ``None`` when nothing was written. On
        failure the error is printed and a partially created file is removed.
    """
    if not content or content.startswith(("Lỗi:", "❌")):
        return None

    temp_file_path = None
    try:
        requested = (file_format or "txt").lower()
        if requested == "docx" and DOCX_AVAILABLE:
            suffix = ".docx"
        elif requested == "md":
            suffix = ".md"
        else:
            suffix = ".txt"

        fd, temp_file_path = tempfile.mkstemp(suffix=suffix, prefix=f"{filename_prefix}_")

        if suffix == ".docx":
            # python-docx writes by path, so release the descriptor first.
            os.close(fd)
            from docx import Document
            doc = Document()
            doc.add_heading('Nội dung đã dịch', 0)
            doc.add_paragraph(content)
            doc.save(temp_file_path)
        else:
            # Markdown and plain text are both written verbatim as UTF-8.
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                f.write(content)
        return temp_file_path
    except Exception as e:
        print(f"❌ Error creating text file: {e}")
        # Do not leave a half-written temp file behind.
        if temp_file_path is not None:
            try:
                os.remove(temp_file_path)
            except OSError:
                pass
        return None
def format_voice_rag_response(question, answer, detected_language, voice_selection, timestamp=None):
    """Render a Voice RAG question/answer pair as a Markdown report.

    Args:
        question: The question that was asked.
        answer: The model's answer; surrounding whitespace is stripped.
        detected_language: Detected language of the source document.
        voice_selection: UI label of the voice used for synthesis.
        timestamp: Text shown as the generation time; defaults to the current
            local time formatted as ``YYYY-MM-DD HH:MM:SS``.

    Returns:
        The Markdown document as a single string ending with a newline.
    """
    if timestamp is None:
        # strftime() without a time tuple formats the current local time.
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")

    formatted_answer = answer.strip()

    report_lines = [
        "# 📚 Voice RAG - Intelligent Document Q&A",
        "---",
        "## 📄 **Session Information**",
        "| **Field** | **Details** |",
        "|-----------|-------------|",
        f"| 🕒 **Timestamp** | {timestamp} |",
        f"| 🌍 **Document Language** | {detected_language} |",
        f"| 🎭 **Voice Selection** | {voice_selection} |",
        "| 🤖 **AI Model** | Google Gemini 2.0 Flash |",
        "---",
        "## ❓ **Question**",
        f"> {question}",
        "---",
        "## 💬 **AI Response**",
        f"{formatted_answer}",
        "---",
        "---",
        "## 📱 **Generated by**",
        "**🎙️ Voice AI Platform** - Digitized Brains",
        "*Powered by Claude Code & Google Gemini 2.0 Flash*",
        "> 🌐 **Voice RAG Technology** - Combining document intelligence with premium voice synthesis",
        "---",
        f"*Generated on {timestamp} | Voice: {voice_selection} | Language: {detected_language}*",
    ]
    return "\n".join(report_lines) + "\n"
def format_voice_studio_response(text, voice_selection, speed, detected_language="Auto-detected", timestamp=None):
    """Render a Voice Studio result as a short Markdown document.

    Args:
        text: The synthesized input text; surrounding whitespace is stripped.
        voice_selection: UI label of the voice used.
        speed: Playback speed multiplier, shown with one decimal place.
        detected_language: Language label shown next to the input text.
        timestamp: Text shown as the generation time; defaults to the current
            local time formatted as ``YYYY-MM-DD HH:MM:SS``.

    Returns:
        The Markdown document as a single string ending with a newline.
    """
    if timestamp is None:
        # strftime() without a time tuple formats the current local time.
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")

    formatted_text = text.strip()

    document_lines = [
        "# Voice Studio Result",
        f"## Input Text ({detected_language})",
        f"{formatted_text}",
        "---",
        f"*Generated on {timestamp} | Voice: {voice_selection} | Speed: {speed:.1f}x*",
    ]
    return "\n".join(document_lines) + "\n"
def format_audio_translation_response(original_text, translated_text, source_language, target_language, voice_selection, timestamp=None):
    """Render an audio translation result as a short Markdown document.

    Args:
        original_text: Transcribed source text; whitespace is stripped.
        translated_text: Translated text; whitespace is stripped.
        source_language: Label of the source language.
        target_language: Label of the target language.
        voice_selection: UI label of the voice used.
        timestamp: Text shown as the generation time; defaults to the current
            local time formatted as ``YYYY-MM-DD HH:MM:SS``.

    Returns:
        The Markdown document as a single string ending with a newline.
    """
    if timestamp is None:
        # strftime() without a time tuple formats the current local time.
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")

    formatted_original = original_text.strip()
    formatted_translated = translated_text.strip()

    document_lines = [
        "# Audio Translation Result",
        f"## Original Text ({source_language})",
        f"{formatted_original}",
        f"## Translated Text ({target_language})",
        f"{formatted_translated}",
        "---",
        f"*Generated on {timestamp} | {source_language} → {target_language} | Voice: {voice_selection}*",
    ]
    return "\n".join(document_lines) + "\n"
def _render_voice_studio_player(voice_label, speed, language_label, char_count, audio_base64, filename):
    """Build the HTML audio player (inline MP3 data URI plus download link).

    ``voice_label`` and ``filename`` must already be HTML-escaped by the caller.
    """
    return f'''
<div style="
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border-radius: 20px;
padding: 20px;
margin: 10px 0;
box-shadow: 0 8px 32px rgba(0,0,0,0.2);
color: white;
text-align: center;
">
<div style="margin-bottom: 20px;">
<h3 style="color: #fff; margin: 0 0 15px 0; font-size: 1.3em; text-shadow: 1px 1px 2px rgba(0,0,0,0.3);">
🎵 Âm thanh hoàn thành!
</h3>
<div style="
background: rgba(255,255,255,0.2);
border-radius: 12px;
padding: 12px;
font-size: 0.9em;
line-height: 1.5;
backdrop-filter: blur(10px);
">
<div><strong>🎭 Giọng:</strong> {voice_label}</div>
<div><strong>⚡ Tốc độ:</strong> {speed:.1f}x | <strong>🌍 Ngôn ngữ:</strong> {language_label}</div>
<div><strong>📝 Độ dài:</strong> {char_count} ký tự</div>
</div>
</div>
<audio controls style="
width: 100%;
max-width: 100%;
height: 50px;
margin: 20px 0;
border-radius: 25px;
background: rgba(255,255,255,0.95);
box-shadow: 0 4px 15px rgba(0,0,0,0.2);
">
<source src="data:audio/mpeg;base64,{audio_base64}" type="audio/mpeg">
Trình duyệt không hỗ trợ audio.
</audio>
<div style="
display: flex;
justify-content: center;
margin-top: 20px;
">
<a href="data:audio/mpeg;base64,{audio_base64}" download="{filename}"
style="
background: linear-gradient(45deg, #28a745, #20c997);
color: white;
padding: 15px 30px;
text-decoration: none;
border-radius: 25px;
font-weight: 700;
font-size: 1.1em;
display: flex;
align-items: center;
justify-content: center;
box-shadow: 0 4px 15px rgba(40,167,69,0.3);
transition: all 0.3s ease;
min-height: 48px;
min-width: 200px;
"
ontouchstart=""
onmouseover="this.style.transform='translateY(-2px)'; this.style.boxShadow='0 6px 20px rgba(40,167,69,0.4)'"
onmouseout="this.style.transform='translateY(0)'; this.style.boxShadow='0 4px 15px rgba(40,167,69,0.3)'">
📥 TẢI XUỐNG MP3
</a>
</div>
</div>
'''


def create_audio_voice_studio(text, voice_selection, speed, text_format="txt"):
    """Synthesize ``text`` with Edge TTS and build the Voice Studio output.

    Args:
        text: Text to speak. Only the first 1000 characters are used.
        voice_selection: UI voice label, looked up in ``VOICE_MAP`` (unknown
            labels fall back to ``vi-VN-HoaiMyNeural``).
        speed: Speed multiplier; ``speed - 1.0`` is passed as the rate offset
            (``1.0`` means unchanged).
        text_format: "md" or "docx" export the formatted report, "txt"
            exports the plain text, any other value exports nothing.

    Returns:
        ``(html_player, text_file_path)``. On blank input or on any error the
        first item is a "❌" message and the second is ``None``.
    """
    if not text or not text.strip():
        return "❌ Vui lòng nhập văn bản / Please enter text / Bitte Text eingeben", None

    try:
        import html  # stdlib; needed only here, for escaping

        voice_name = VOICE_MAP.get(voice_selection, "vi-VN-HoaiMyNeural")
        text_limited = text[:1000]

        # Speed multiplier -> rate offset expected by generate_speech.
        rate_percent = speed - 1.0
        audio_data = asyncio.run(generate_speech(text_limited, voice_name, rate_percent))

        audio_base64 = base64.b64encode(audio_data).decode('utf-8')
        timestamp = int(time.time())
        filename = f"voice_{voice_name}_{speed}x_{timestamp}.mp3"

        # Detected once and reused for both the player and the export.
        detected_lang = detect_language(text_limited)

        # voice_selection is caller-supplied and ends up inside HTML markup,
        # so it is escaped before interpolation.
        html_player = _render_voice_studio_player(
            html.escape(str(voice_selection)),
            speed,
            detected_lang.title(),
            len(text_limited),
            audio_base64,
            html.escape(filename),
        )

        text_file_path = None
        if text_format in ("md", "docx"):
            # Both formats export the same formatted report.
            markdown_content = format_voice_studio_response(
                text_limited, voice_selection, speed, detected_lang
            )
            text_file_path = create_text_file(markdown_content, text_format, "voice_studio")
        elif text_format == "txt":
            text_file_path = create_text_file(text_limited, "txt", "voice_studio")

        return html_player, text_file_path
    except Exception as e:
        return f"❌ Error: {str(e)}", None
# Language mapping for voices - defined once for performance | |
# UI voice label -> target language name, built from per-language groups.
VOICE_TO_LANGUAGE = {
    voice_label: language_name
    for language_name, voice_labels in (
        ("Vietnamese", ("🇻🇳 HoaiMy - Nữ Việt Chuẩn", "🇻🇳 NamMinh - Nam Việt Chuẩn")),
        ("English", (
            "🇺🇸 Aria - Nữ Mỹ",
            "🇺🇸 Guy - Nam Mỹ",
            "🇬🇧 Sonia - Nữ Anh",
            "🇬🇧 Ryan - Nam Anh",
        )),
        ("German", ("🇩🇪 Katja - Deutsche Frau", "🇩🇪 Conrad - Deutscher Mann")),
        ("French", ("🇫🇷 Denise - Française", "🇫🇷 Henri - Français")),
        ("Spanish", ("🇪🇸 Elvira - Española", "🇪🇸 Alvaro - Español")),
        ("Italian", ("🇮🇹 Elsa - Italiana", "🇮🇹 Diego - Italiano")),
        ("Japanese", ("🇯🇵 Nanami - 日本女性", "🇯🇵 Keita - 日本男性")),
        ("Korean", ("🇰🇷 SunHi - 한국 여성", "🇰🇷 BongJin - 한국 남성")),
        ("Chinese", ("🇨🇳 Xiaoxiao - 中文女声", "🇨🇳 Yunxi - 中文男声")),
        ("Russian", ("🇷🇺 Svetlana - Русская", "🇷🇺 Dmitry - Русский")),
        ("Portuguese", ("🇵🇹 Francisca - Portuguesa", "🇵🇹 Antonio - Português")),
        ("Arabic", ("🇸🇦 Zariyah - عربية", "🇸🇦 Hamed - عربي")),
    )
    for voice_label in voice_labels
}
def get_target_language_from_voice(voice_selection):
    """Return the translation target language for a UI voice label.

    Labels missing from ``VOICE_TO_LANGUAGE`` map to "Vietnamese".
    """
    fallback_language = "Vietnamese"
    return VOICE_TO_LANGUAGE.get(voice_selection, fallback_language)
def translate_text_with_gemini(text, target_language):
    """Translate ``text`` into ``target_language`` with Gemini.

    Args:
        text: Text to translate.
        target_language: Name of the language to translate into.

    Returns:
        The translated text with known boilerplate removed, ``""`` for blank
        input, a "❌ Lỗi:" message when no API key is configured, or a
        "Lỗi dịch thuật:" message when the request fails.
    """
    try:
        if not configure_gemini_api():
            return "❌ Lỗi: Chưa cấu hình GEMINI_API_KEY hoặc GOOGLE_API_KEY trong environment variables"
        if not text.strip():
            return ""

        gemini_model = genai.GenerativeModel("gemini-2.0-flash")
        prompt = (
            f"Translate the following text to {target_language}. "
            "Return ONLY the translated text, nothing else:\n"
            f"{text}"
        )
        translated_text = gemini_model.generate_content(prompt).text.strip()

        # Strip a leading "translation:" label if the model added one.
        label = "translation:"
        if translated_text.lower().startswith(label):
            translated_text = translated_text[len(label):].strip()

        # Drop an introductory "Here is ..." line when more text follows it.
        if translated_text.lower().startswith("here is"):
            _intro, newline, remainder = translated_text.partition('\n')
            if newline:
                translated_text = remainder.strip()

        return translated_text
    except Exception as translation_error:
        return f"Lỗi dịch thuật: {str(translation_error)}"
def _audio_translation_error(message, target, detected_language="Không xác định"):
    """Build the 9-item result ``translate_audio`` returns when it produced no audio."""
    return message, detected_language, "", target, None, None, "", "", None


def translate_audio(audio_file, target_country, voice_selection, text_format="txt"):
    """Transcribe an audio file, translate it, and synthesize the translation.

    The target language is derived from ``voice_selection``. The transcription
    is translated only when its detected language differs from that target;
    if translation fails, the original transcription is spoken instead.

    Args:
        audio_file: Audio input as handed over by the UI; it is passed to
            ``save_recorded_audio`` to obtain a file path.
        target_country: Country label from the UI. It is only echoed back in
            the early validation errors.
        voice_selection: Voice label used to pick both the target language
            and the Edge TTS voice.
        text_format: ``"md"`` or ``"docx"`` for a formatted report; any other
            value produces a plain-text file with the translated text.

    Returns:
        A 9-tuple ``(transcription, detected_language, translated_text,
        target, audio_path, audio_path, transcription, translated_text,
        text_file_path)``. ``target`` is the target language on success and
        on unexpected errors, and ``target_country`` on validation errors.
        On any failure the first item carries the error message, the paths
        are ``None`` and the remaining texts are empty strings.
    """
    try:
        api_key = configure_gemini_api()
        if not api_key:
            return _audio_translation_error(
                "❌ Lỗi: Chưa cấu hình GEMINI_API_KEY hoặc GOOGLE_API_KEY trong environment variables",
                target_country,
            )
        if audio_file is None:
            return _audio_translation_error("Lỗi: Vui lòng tải lên file audio", target_country)

        # Save recorded audio to record_data directory
        print(f"🔍 Processing audio file type: {type(audio_file)}")
        saved_audio_path = save_recorded_audio(audio_file)
        if not saved_audio_path:
            print("❌ Failed to save audio file")
            return _audio_translation_error("❌ Lỗi: Không thể lưu file audio", target_country)

        print(f"🎤 Audio saved to record_data: {os.path.basename(saved_audio_path)}")
        # Double-check that the reported path really exists before reading it.
        if not os.path.exists(saved_audio_path):
            print(f"❌ File not found after save: {saved_audio_path}")
            return _audio_translation_error("❌ Lỗi: Không thể lưu file audio", target_country)
        file_size = os.path.getsize(saved_audio_path)
        print(f"✅ File confirmed: {saved_audio_path} ({file_size} bytes)")

        target_language = get_target_language_from_voice(voice_selection)

        # Step 1: transcribe the audio with Gemini.
        model = genai.GenerativeModel("gemini-2.0-flash")
        with open(saved_audio_path, 'rb') as f:
            source_audio = f.read()
        # NOTE(review): the MIME type is fixed to WAV; this presumes
        # save_recorded_audio always writes WAV data — confirm.
        audio_blob = {
            'mime_type': 'audio/wav',
            'data': source_audio,
        }
        transcribe_prompt = """Transcribe this audio accurately in its original language. Return only the transcribed text, nothing else."""
        response = model.generate_content([transcribe_prompt, audio_blob])
        transcription = response.text.strip()

        # Step 2: detect the language of the transcription.
        detected_lang = detect_language(transcription)

        # Step 3: translate only when source and target languages differ.
        if detected_lang.lower() != target_language.lower():
            print(f"🔄 Translating from {detected_lang} to {target_language}")
            translated_text = translate_text_with_gemini(transcription, target_language)
            # Match the exact in-band error prefixes of
            # translate_text_with_gemini. A bare "Lỗi" check would also reject
            # genuine Vietnamese translations that start with that word.
            if translated_text.startswith(("❌", "Lỗi dịch thuật:")):
                print(f"❌ Translation failed: {translated_text}")
                # Use original transcription if translation fails
                translated_text = transcription
            else:
                print("✅ Translation successful")
        else:
            print(f"ℹ️ No translation needed - same language ({detected_lang})")
            translated_text = transcription

        # Generate audio using Edge TTS (use global VOICE_MAP for performance)
        edge_voice = VOICE_MAP.get(voice_selection, "vi-VN-HoaiMyNeural")
        print(f"🎙️ Generating audio with voice: {edge_voice}")
        speech_bytes = asyncio.run(generate_speech(translated_text, edge_voice, 0.0))
        print(f"🎵 Generated audio data: {len(speech_bytes)} bytes")

        # mkstemp returns an open descriptor; close it and reopen by path.
        fd, temp_output_path = tempfile.mkstemp(suffix=".wav", prefix="translated_audio_")
        os.close(fd)
        print(f"📁 Created temp audio file: {temp_output_path}")
        with open(temp_output_path, 'wb') as f:
            f.write(speech_bytes)

        # Verify file was created (diagnostic only; processing continues).
        if os.path.exists(temp_output_path):
            file_size = os.path.getsize(temp_output_path)
            print(f"✅ Audio file created successfully: {file_size} bytes")
        else:
            print(f"❌ Failed to create audio file: {temp_output_path}")

        # Create text file for download with proper formatting
        if text_format in ("md", "docx"):
            # Both rich formats share the same Markdown report as their source.
            markdown_content = format_audio_translation_response(
                transcription, translated_text, detected_lang, target_language, voice_selection
            )
            text_file_path = create_text_file(markdown_content, text_format, "audio_translation")
        else:
            text_file_path = create_text_file(translated_text, "txt", "audio_translation")

        return (
            transcription,
            detected_lang,
            translated_text,
            target_language,
            temp_output_path,
            temp_output_path,
            transcription,
            translated_text,
            text_file_path,
        )
    except Exception as e:
        # voice_selection is a parameter, so it is always bound here.
        target_language = get_target_language_from_voice(voice_selection)
        return _audio_translation_error(f"Lỗi: {str(e)}", target_language, detected_language="Lỗi")
# Voice labels grouped by country label - ONLY OFFICIAL VOICES.
# The keys feed the country dropdowns, and update_voices() selects the first
# voice of a country's list as the default value.
voice_choices_by_country = {
    "🇻🇳 Việt Nam": ["🇻🇳 HoaiMy - Nữ Việt Chuẩn", "🇻🇳 NamMinh - Nam Việt Chuẩn"],
    "🇺🇸 Hoa Kỳ": ["🇺🇸 Aria - Nữ Mỹ", "🇺🇸 Guy - Nam Mỹ"],
    "🇬🇧 Anh": ["🇬🇧 Sonia - Nữ Anh", "🇬🇧 Ryan - Nam Anh"],
    "🇩🇪 Đức": ["🇩🇪 Katja - Deutsche Frau", "🇩🇪 Conrad - Deutscher Mann"],
    "🇫🇷 Pháp": ["🇫🇷 Denise - Française", "🇫🇷 Henri - Français"],
    "🇪🇸 Tây Ban Nha": ["🇪🇸 Elvira - Española", "🇪🇸 Alvaro - Español"],
    "🇮🇹 Ý": ["🇮🇹 Elsa - Italiana", "🇮🇹 Diego - Italiano"],
    "🇯🇵 Nhật Bản": ["🇯🇵 Nanami - 日本女性", "🇯🇵 Keita - 日本男性"],
    "🇰🇷 Hàn Quốc": ["🇰🇷 SunHi - 한국 여성", "🇰🇷 BongJin - 한국 남성"],
    "🇨🇳 Trung Quốc": ["🇨🇳 Xiaoxiao - 中文女声", "🇨🇳 Yunxi - 中文男声"],
    "🇷🇺 Nga": ["🇷🇺 Svetlana - Русская", "🇷🇺 Dmitry - Русский"],
    "🇵🇹 Bồ Đào Nha": ["🇵🇹 Francisca - Portuguesa", "🇵🇹 Antonio - Português"],
    "🇸🇦 Ả Rập": ["🇸🇦 Zariyah - عربية", "🇸🇦 Hamed - عربي"],
}
def update_voices(country):
    """Return a voice dropdown refreshed for the selected country.

    Countries missing from ``voice_choices_by_country`` fall back to the
    Vietnamese voice list. The first voice of the chosen list is selected.
    """
    country_key = country if country in voice_choices_by_country else "🇻🇳 Việt Nam"
    available_voices = voice_choices_by_country[country_key]
    return gr.Dropdown(choices=available_voices, value=available_voices[0])
# Stylesheet handed to gr.Blocks(css=...).
# NOTE(review): .status-processing and .status-success are each declared
# twice below. The first declarations set `background` with !important, so the
# later plain `background-color` values cannot override them; only the later
# `color` values take effect. Confirm which look is intended before merging.
css = """
* {
    font-family: system-ui, -apple-system, 'Segoe UI', Arial, sans-serif;
}
.gradio-container {
    max-width: 1200px;
    margin: 0 auto;
    position: relative;
}
/* Critical fix for dropdown interaction */
.gradio-container * {
    pointer-events: auto;
}
/* Hide Gradio footer */
.footer {
    display: none !important;
}
/* Pulsing animation for processing status */
@keyframes pulse-processing {
    0% {
        opacity: 1;
        transform: scale(1);
        box-shadow: 0 4px 15px rgba(255, 193, 7, 0.3);
    }
    50% {
        opacity: 0.8;
        transform: scale(1.02);
        box-shadow: 0 6px 25px rgba(255, 193, 7, 0.6);
    }
    100% {
        opacity: 1;
        transform: scale(1);
        box-shadow: 0 4px 15px rgba(255, 193, 7, 0.3);
    }
}
.status-processing {
    animation: pulse-processing 1.5s ease-in-out infinite;
    background: linear-gradient(135deg, #FFC107 0%, #FF9800 100%) !important;
}
/* Success status animation */
@keyframes pulse-success {
    0% {
        opacity: 1;
        transform: scale(1);
    }
    50% {
        opacity: 0.9;
        transform: scale(1.01);
    }
    100% {
        opacity: 1;
        transform: scale(1);
    }
}
.status-success {
    animation: pulse-success 2s ease-in-out 3;
    background: linear-gradient(135deg, #4CAF50 0%, #2E7D32 100%) !important;
}
/* Custom footer to cover Gradio attribution */
.custom-footer {
    position: fixed;
    bottom: 0;
    left: 0;
    right: 0;
    background: linear-gradient(135deg, #4A90E2 0%, #2E86AB 70%, #FF8A65 85%, #FF6B9D 100%);
    color: white;
    padding: 15px;
    text-align: center;
    font-weight: bold;
    z-index: 1000;
    box-shadow: 0 -2px 10px rgba(0,0,0,0.1);
}
/* Add padding to body to account for fixed footer */
body {
    padding-bottom: 60px;
}
/* Mobile-first responsive design */
.input-card {
    background: rgba(255,255,255,0.95);
    border-radius: 16px;
    padding: 16px;
    margin: 10px 0;
    box-shadow: 0 4px 20px rgba(0,0,0,0.1);
    backdrop-filter: blur(10px);
}
.output-area {
    background: rgba(255,255,255,0.95);
    border-radius: 16px;
    padding: 16px;
    margin: 15px 0;
    min-height: 200px;
    box-shadow: 0 4px 20px rgba(0,0,0,0.1);
}
.examples-section {
    background: rgba(255,255,255,0.9);
    border-radius: 16px;
    padding: 16px;
    margin: 20px 0;
}
.main-header {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 20px;
    border-radius: 10px;
    margin-bottom: 20px;
    text-align: center;
}
.feature-box {
    background: #f8f9fa;
    padding: 15px;
    border-radius: 8px;
    margin: 10px 0;
    border-left: 4px solid #667eea;
}
.status-indicator {
    display: inline-block;
    padding: 5px 10px;
    border-radius: 15px;
    font-size: 12px;
    font-weight: bold;
    margin: 5px;
}
.status-success {
    background-color: #d4edda;
    color: #155724;
}
.status-processing {
    background-color: #fff3cd;
    color: #856404;
}
.comparison-section {
    border: 1px solid #e0e0e0;
    border-radius: 8px;
    padding: 15px;
    margin: 10px 0;
    background: #fafafa;
}
.language-label {
    font-weight: bold;
    color: #667eea;
    padding: 5px 10px;
    background: #f0f2ff;
    border-radius: 15px;
    display: inline-block;
    margin-bottom: 10px;
    font-size: 14px;
}
.content-compare {
    background: white;
    border: 1px solid #ddd;
    border-radius: 6px;
    padding: 12px;
    min-height: 120px;
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    line-height: 1.5;
}
/* Remove any potential blocking overlays */
.gradio-container::before,
.gradio-container::after {
    display: none;
}
/* Ensure all interactive elements work */
button, select, input, textarea, .gr-dropdown {
    pointer-events: auto !important;
    position: relative !important;
}
/* Simple dropdown fix without complex selectors */
[class*="dropdown"] {
    position: relative !important;
    z-index: 999 !important;
}
[class*="dropdown"] * {
    pointer-events: auto !important;
}
/* Make sure no overlay blocks clicks */
.gradio-container .gr-form {
    position: relative;
    z-index: 1;
}
.gradio-container .gr-block {
    position: relative;
    z-index: 1;
}
.mobile-button {
    width: 100% !important;
    padding: 15px !important;
    font-size: 1.1em !important;
    margin: 20px 0 !important;
    border-radius: 12px !important;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: bold !important;
    box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3) !important;
    transition: all 0.3s ease !important;
    cursor: pointer !important;
    position: relative !important;
    overflow: hidden !important;
}
.mobile-button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 8px 25px rgba(102, 126, 234, 0.4) !important;
    background: linear-gradient(135deg, #5a6fd8 0%, #6b4190 100%) !important;
}
.mobile-button:active {
    transform: translateY(0px) !important;
    box-shadow: 0 2px 10px rgba(102, 126, 234, 0.3) !important;
}
/* Ripple effect for button */
.mobile-button::before {
    content: '';
    position: absolute;
    top: 50%;
    left: 50%;
    width: 0;
    height: 0;
    border-radius: 50%;
    background: rgba(255, 255, 255, 0.3);
    transform: translate(-50%, -50%);
    transition: width 0.6s, height 0.6s;
}
.mobile-button:active::before {
    width: 300px;
    height: 300px;
}
/* Loading spinner animation */
@keyframes spin {
    0% { transform: rotate(0deg); }
    100% { transform: rotate(360deg); }
}
.loading-spinner {
    display: inline-block;
    width: 20px;
    height: 20px;
    border: 3px solid rgba(255,255,255,0.3);
    border-radius: 50%;
    border-top-color: white;
    animation: spin 1s ease-in-out infinite;
    margin-right: 10px;
}
/* Button pulse effect when processing */
@keyframes pulse {
    0% {
        box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3);
    }
    50% {
        box-shadow: 0 8px 25px rgba(102, 126, 234, 0.6);
    }
    100% {
        box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3);
    }
}
.button-processing {
    animation: pulse 2s ease-in-out infinite;
    background: linear-gradient(135deg, #FF8E53 0%, #FF6B6B 100%) !important;
}
.mobile-textbox textarea {
    border-radius: 10px !important;
    border: 2px solid #e0e0e0 !important;
    padding: 12px !important;
    font-size: 1em !important;
    line-height: 1.5 !important;
}
.mobile-compare textarea {
    border-radius: 8px !important;
    border: 1px solid #ddd !important;
    padding: 10px !important;
    background: #fafafa !important;
    font-size: 0.95em !important;
}
.mobile-audio {
    margin: 10px 0 !important;
    border-radius: 10px !important;
}
.mobile-file {
    margin: 10px 0 !important;
    border-radius: 10px !important;
}
/* Beautiful Markdown styling for Voice RAG responses */
.markdown-response {
    background: linear-gradient(135deg, #ffffff 0%, #f8fffe 100%);
    border-radius: 12px;
    padding: 20px;
    margin: 15px 0;
    box-shadow: 0 4px 20px rgba(0,0,0,0.1);
    border-left: 4px solid #4CAF50;
}
.markdown-response h1 {
    color: #2e7d32;
    border-bottom: 2px solid #4CAF50;
    padding-bottom: 10px;
    margin-bottom: 20px;
    font-size: 1.8em;
}
.markdown-response h2 {
    color: #388E3C;
    margin-top: 25px;
    margin-bottom: 15px;
    font-size: 1.4em;
    border-left: 3px solid #4CAF50;
    padding-left: 15px;
}
.markdown-response h3 {
    color: #43A047;
    margin-top: 20px;
    margin-bottom: 12px;
    font-size: 1.2em;
}
.markdown-response p {
    line-height: 1.6;
    margin-bottom: 12px;
    color: #333;
}
.markdown-response blockquote {
    background: linear-gradient(135deg, #e8f5e8 0%, #c8e6c9 100%);
    border-left: 4px solid #4CAF50;
    padding: 15px 20px;
    margin: 15px 0;
    border-radius: 8px;
    font-style: italic;
    color: #2e7d32;
}
.markdown-response table {
    width: 100%;
    border-collapse: collapse;
    margin: 15px 0;
    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
    border-radius: 8px;
    overflow: hidden;
}
.markdown-response table th {
    background: linear-gradient(135deg, #4CAF50 0%, #45a049 100%);
    color: white;
    padding: 12px 15px;
    text-align: left;
    font-weight: bold;
}
.markdown-response table td {
    padding: 12px 15px;
    border-bottom: 1px solid #e0e0e0;
    background: white;
}
.markdown-response table tr:nth-child(even) td {
    background: #f8fffe;
}
.markdown-response table tr:hover td {
    background: #e8f5e8;
    transition: background 0.3s ease;
}
.markdown-response ul, .markdown-response ol {
    margin: 15px 0;
    padding-left: 25px;
}
.markdown-response li {
    margin-bottom: 8px;
    line-height: 1.5;
}
.markdown-response code {
    background: #f5f5f5;
    border: 1px solid #e0e0e0;
    border-radius: 4px;
    padding: 2px 6px;
    font-family: 'Courier New', monospace;
    color: #d32f2f;
}
.markdown-response pre {
    background: #f5f5f5;
    border: 1px solid #e0e0e0;
    border-radius: 8px;
    padding: 15px;
    overflow-x: auto;
    margin: 15px 0;
}
.markdown-response pre code {
    background: none;
    border: none;
    padding: 0;
    color: #333;
}
.markdown-response hr {
    border: none;
    height: 2px;
    background: linear-gradient(90deg, transparent, #4CAF50, transparent);
    margin: 25px 0;
}
.markdown-response strong {
    color: #2e7d32;
    font-weight: bold;
}
.markdown-response em {
    color: #388E3C;
    font-style: italic;
}
/* Responsive design for markdown */
@media (max-width: 768px) {
    .markdown-response {
        padding: 15px;
        margin: 10px 0;
    }
    .markdown-response table {
        font-size: 0.9em;
    }
    .markdown-response h1 {
        font-size: 1.6em;
    }
    .markdown-response h2 {
        font-size: 1.3em;
    }
}
/* Mobile responsive breakpoints */
@media (max-width: 768px) {
    .gradio-container {
        padding: 10px !important;
    }
    .input-card {
        padding: 12px !important;
        margin: 8px 0 !important;
    }
    .output-area {
        padding: 12px !important;
        margin: 10px 0 !important;
    }
    .examples-section {
        padding: 12px !important;
    }
    .main-header h2 {
        font-size: 1.5em !important;
    }
    .main-header p {
        font-size: 1em !important;
    }
    /* Mobile layout adjustments - less aggressive */
    .gr-row {
        flex-direction: column;
    }
    .gr-column {
        width: 100%;
        margin-bottom: 15px;
    }
}
@media (max-width: 480px) {
    .gradio-container {
        padding: 5px !important;
    }
    .input-card {
        padding: 10px !important;
        margin: 5px 0 !important;
    }
    .main-header {
        padding: 15px !important;
    }
    .main-header h2 {
        font-size: 1.3em !important;
    }
    .mobile-button {
        padding: 12px !important;
        font-size: 1em !important;
    }
}
"""
# Button click/hover effects, kept as an HTML <script> snippet.
# NOTE(review): the completion poll looks for elements whose *style attribute*
# contains "Hoàn thành"; unless such markup exists elsewhere, the button is
# only re-enabled by the 15 second timeout — confirm the intended selector.
js_code = """
<script>
function addButtonEffects() {
    // Find button by class since Gradio might change IDs
    const buttons = document.querySelectorAll('.mobile-button');
    buttons.forEach(button => {
        // This function is re-run on a timer, and the anonymous hover
        // handlers below can never be removed again. Bind each button once
        // so listeners do not pile up on every pass.
        if (button.dataset.effectsBound === 'true') {
            return;
        }
        button.dataset.effectsBound = 'true';
        // Add enhanced click effect
        button.addEventListener('click', handleClick);
        // Add hover effects for better interaction
        button.addEventListener('mouseenter', function() {
            if (!this.disabled) {
                this.style.transform = 'translateY(-2px) scale(1.02)';
            }
        });
        button.addEventListener('mouseleave', function() {
            if (!this.disabled) {
                this.style.transform = 'translateY(0) scale(1)';
            }
        });
    });
}
function handleClick(e) {
    // currentTarget is always the button the listener is attached to, even
    // when the click lands on a child element inside it.
    const button = e.currentTarget;
    // Immediate visual feedback
    button.style.transform = 'scale(0.98)';
    button.style.transition = 'all 0.1s ease';
    setTimeout(() => {
        button.style.transform = 'scale(1)';
        button.style.transition = 'all 0.3s ease';
    }, 100);
    // Add processing state
    const originalText = button.innerHTML;
    button.innerHTML = '<span class="loading-spinner"></span>⏳ ĐANG XỬ LÝ...';
    button.classList.add('button-processing');
    button.disabled = true;
    // Monitor for completion and reset
    let checkCount = 0;
    const checkInterval = setInterval(() => {
        checkCount++;
        // Reset after 15 seconds max or if status changes
        const statusElements = document.querySelectorAll('[style*="Hoàn thành"]');
        if (statusElements.length > 0 || checkCount > 50) {
            clearInterval(checkInterval);
            button.innerHTML = originalText;
            button.classList.remove('button-processing');
            button.disabled = false;
            button.style.transform = 'scale(1)';
        }
    }, 300);
}
// Initialize when DOM is ready
if (document.readyState === 'loading') {
    document.addEventListener('DOMContentLoaded', addButtonEffects);
} else {
    addButtonEffects();
}
// Re-initialize periodically for Gradio updates
setInterval(addButtonEffects, 2000);
</script>
"""
# Create interface with tabs | |
with gr.Blocks(css=css, title="🎙️ Voice AI Platform - Voice RAG & Audio Translation") as demo: | |
# Simplified header for faster loading on HF Spaces | |
if not (os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID")): | |
# Only load complex microphone permissions in local development | |
gr.HTML(""" | |
<meta charset="UTF-8"> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
<meta http-equiv="Permissions-Policy" content="microphone=*, camera=*, display-capture=*, autoplay=*"> | |
<meta http-equiv="Feature-Policy" content="microphone 'self' *; camera 'self' *; autoplay 'self' *"> | |
<meta name="theme-color" content="#4A90E2"> | |
<script> | |
// Global microphone management | |
window.microphoneStatus = { | |
granted: false, | |
requested: false, | |
supported: false | |
}; | |
// Enhanced microphone permission request for iframe and main window | |
function initializeMicrophoneSupport() { | |
console.log('🎤 Initializing microphone support...'); | |
// Check browser support | |
if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) { | |
window.microphoneStatus.supported = true; | |
console.log('✅ Browser supports microphone'); | |
// Check current permission status | |
if (navigator.permissions) { | |
navigator.permissions.query({name: 'microphone'}).then(function(result) { | |
console.log('🔐 Current microphone permission:', result.state); | |
window.microphoneStatus.granted = (result.state === 'granted'); | |
// Update UI based on permission status | |
updateMicrophoneUI(result.state); | |
// Listen for permission changes | |
result.onchange = function() { | |
console.log('🔄 Microphone permission changed to:', this.state); | |
window.microphoneStatus.granted = (this.state === 'granted'); | |
updateMicrophoneUI(this.state); | |
}; | |
}).catch(function(err) { | |
console.log('⚠️ Permission query failed:', err); | |
}); | |
} | |
// Auto-request permissions if we're in iframe (with user gesture simulation) | |
if (window.location !== window.parent.location && !window.microphoneStatus.requested) { | |
console.log('🖼️ Running in iframe - preparing microphone access'); | |
window.microphoneStatus.requested = true; | |
// Add a global click listener to request permissions on first interaction | |
document.addEventListener('click', function requestOnFirstClick() { | |
if (!window.microphoneStatus.granted) { | |
console.log('👆 First click detected - requesting microphone access'); | |
requestMicrophonePermission(); | |
document.removeEventListener('click', requestOnFirstClick); | |
} | |
}, { once: true }); | |
} | |
} else { | |
console.log('❌ Browser does not support microphone'); | |
window.microphoneStatus.supported = false; | |
updateMicrophoneUI('unsupported'); | |
} | |
} | |
function requestMicrophonePermission() { | |
console.log('🎤 Requesting microphone permission...'); | |
if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) { | |
navigator.mediaDevices.getUserMedia({ | |
audio: { | |
echoCancellation: true, | |
noiseSuppression: true, | |
autoGainControl: true, | |
sampleRate: 44100 | |
} | |
}) | |
.then(function(stream) { | |
console.log('✅ Microphone access granted'); | |
window.microphoneStatus.granted = true; | |
// Stop the stream immediately (we just wanted permission) | |
stream.getTracks().forEach(track => track.stop()); | |
updateMicrophoneUI('granted'); | |
// Notify other parts of the app | |
window.dispatchEvent(new CustomEvent('microphoneGranted')); | |
}) | |
.catch(function(err) { | |
console.log('❌ Microphone access denied:', err); | |
window.microphoneStatus.granted = false; | |
updateMicrophoneUI('denied', err.message); | |
}); | |
} | |
} | |
function updateMicrophoneUI(status, errorMessage = '') { | |
// This will be called by the specific UI components | |
console.log('🎛️ Updating microphone UI for status:', status); | |
window.dispatchEvent(new CustomEvent('microphoneStatusChanged', { | |
detail: { status, errorMessage } | |
})); | |
} | |
// Initialize when DOM is ready | |
if (document.readyState === 'loading') { | |
document.addEventListener('DOMContentLoaded', initializeMicrophoneSupport); | |
} else { | |
initializeMicrophoneSupport(); | |
} | |
// Also initialize on any dynamic content changes (for Gradio updates) | |
if (window.MutationObserver) { | |
const observer = new MutationObserver(function(mutations) { | |
mutations.forEach(function(mutation) { | |
if (mutation.type === 'childList' && mutation.addedNodes.length > 0) { | |
// Check if audio components were added | |
const hasAudioComponent = Array.from(mutation.addedNodes).some(node => | |
node.nodeType === 1 && ( | |
node.querySelector && ( | |
node.querySelector('audio') || | |
node.querySelector('[data-testid*="audio"]') || | |
node.classList.contains('audio') | |
) | |
) | |
); | |
if (hasAudioComponent) { | |
console.log('🔄 Audio component detected, re-initializing microphone'); | |
setTimeout(initializeMicrophoneSupport, 500); | |
} | |
} | |
}); | |
}); | |
observer.observe(document.body, { | |
childList: true, | |
subtree: true | |
}); | |
} | |
</script> | |
<div style="text-align: center; background: linear-gradient(135deg, #4A90E2 0%, #FF6B9D 100%); color: white; padding: 20px; border-radius: 10px; margin-bottom: 20px;"> | |
<h1>🎙️ Voice AI Platform</h1> | |
<p>Voice RAG, Audio Translation và Voice Studio - Nền tảng AI giọng nói toàn diện</p> | |
<div style="margin-top: 10px; font-size: 14px; opacity: 0.9;"> | |
✨ Tính năng mới: Voice RAG với 24 giọng nói đa ngôn ngữ | |
</div> | |
<div style="margin-top: 8px;">🧠 <strong>Digitized Brains</strong></div> | |
</div> | |
""") | |
else: | |
# Production mode - minimal header | |
gr.HTML('<div style="text-align:center;"><h1>🎙️ Voice AI Platform</h1></div>') | |
with gr.Tabs(): | |
# Tab 1: Voice RAG | |
with gr.TabItem("📚 Voice RAG"): | |
# Header section with hf_voice style | |
gr.HTML(""" | |
<div style="display: flex; justify-content: center; gap: 15px; margin: 20px 0; flex-wrap: wrap;"> | |
<div style="background: linear-gradient(135deg, #FF6B6B 0%, #FF8E53 100%); padding: 15px; border-radius: 10px; color: white; text-align: center; min-width: 140px;"> | |
<h4>📚 Voice RAG</h4> | |
<p style="margin: 0; font-size: 12px;">Hỏi đáp tài liệu thông minh</p> | |
</div> | |
<div style="background: linear-gradient(135deg, #4ECDC4 0%, #44A08D 100%); padding: 15px; border-radius: 10px; color: white; text-align: center; min-width: 140px;"> | |
<h4>🌍 Multi-Language</h4> | |
<p style="margin: 0; font-size: 12px;">13 ngôn ngữ trả lời</p> | |
</div> | |
<div style="background: linear-gradient(135deg, #45B7D1 0%, #96C93D 100%); padding: 15px; border-radius: 10px; color: white; text-align: center; min-width: 140px;"> | |
<h4>🎤 Voice Output</h4> | |
<p style="margin: 0; font-size: 12px;">24 giọng nói đa dạng</p> | |
</div> | |
<div style="background: linear-gradient(135deg, #A8E6CF 0%, #88D8A3 100%); padding: 15px; border-radius: 10px; color: white; text-align: center; min-width: 140px;"> | |
<h4>🔄 AI Gemini</h4> | |
<p style="margin: 0; font-size: 12px;">Gemini 2.0 Flash</p> | |
</div> | |
</div> | |
""") | |
gr.Markdown("### 📝 Upload tài liệu và đặt câu hỏi") | |
# Input section - Mobile optimized | |
with gr.Column(): | |
# Document upload | |
with gr.Row(): | |
file_upload_rag = gr.File( | |
label="📎 Tải lên tài liệu (PDF, DOCX, TXT)", | |
file_types=[".pdf", ".docx", ".txt"] | |
) | |
# Question input | |
with gr.Row(): | |
question_input_rag = gr.Textbox( | |
label="❓ Câu hỏi của bạn", | |
placeholder="Hãy đặt câu hỏi về nội dung tài liệu...", | |
lines=3 | |
) | |
# Language selection for answer | |
with gr.Row(): | |
answer_language_dropdown_rag = gr.Dropdown( | |
choices=SUPPORTED_LANGUAGES, | |
value="Vietnamese", | |
label="🌍 Ngôn ngữ trả lời" | |
) | |
# Voice selection từ Voice Studio | |
with gr.Row(): | |
with gr.Column(scale=1): | |
rag_country_dropdown = gr.Dropdown( | |
choices=list(voice_choices_by_country.keys()), | |
value="🇻🇳 Việt Nam", | |
label="🌍 Chọn quốc gia giọng nói" | |
) | |
with gr.Column(scale=1): | |
rag_voice_dropdown = gr.Dropdown( | |
choices=voice_choices_by_country["🇻🇳 Việt Nam"], | |
value="🇻🇳 HoaiMy - Nữ Việt Chuẩn", | |
label="🎭 Chọn giọng nói" | |
) | |
# Format selection for download | |
with gr.Row(): | |
rag_text_format_dropdown = gr.Dropdown( | |
choices=["Markdown (.md)", "TXT (.txt)", "Word (.docx)"] if DOCX_AVAILABLE else ["Markdown (.md)", "TXT (.txt)"], | |
value="Markdown (.md)", | |
label="📄 Định dạng file trả lời" | |
) | |
# Process button | |
with gr.Row(): | |
submit_btn_rag = gr.Button( | |
"🚀 Xử lý tài liệu và trả lời", | |
variant="primary", | |
size="lg" | |
) | |
# Results section - Mobile optimized | |
with gr.Column(): | |
# Document info section | |
with gr.Accordion("📄 Thông tin tài liệu", open=True): | |
detected_doc_language_rag = gr.Textbox( | |
label="🌐 Ngôn ngữ tài liệu được phát hiện", | |
lines=1, | |
interactive=False, | |
placeholder="Tự động nhận diện ngôn ngữ tài liệu..." | |
) | |
# Text answer section | |
with gr.Accordion("💬 Câu trả lời", open=True): | |
gr.HTML(""" | |
<div style=" | |
background: linear-gradient(135deg, #e8f5e8 0%, #c8e6c9 100%); | |
padding: 15px; | |
border-radius: 12px; | |
margin: 15px 0; | |
border-left: 4px solid #4CAF50; | |
text-align: center; | |
"> | |
<h4 style="margin: 0 0 10px 0; color: #2e7d32;">💬 AI Response with Markdown Formatting</h4> | |
<p style="color: #388E3C; margin: 0; font-style: italic;"> | |
Formatted response with tables, headers, and beautiful layout | |
</p> | |
</div> | |
""") | |
answer_output_rag = gr.Markdown( | |
value="**Câu trả lời sẽ xuất hiện ở đây sau khi xử lý...**\n\n*Hỗ trợ format Markdown với tables, headers, lists và nhiều style khác*", | |
label="", | |
show_label=False, | |
elem_classes=["markdown-response"] | |
) | |
# Downloads section - Mobile optimized | |
with gr.Accordion("💾 Tải xuống kết quả", open=True): | |
gr.HTML(""" | |
<div style="text-align: center; margin-bottom: 15px;"> | |
<p style="color: #666; font-style: italic;">Tải xuống câu trả lời dưới dạng file và audio</p> | |
</div> | |
""") | |
# Stack vertically on mobile | |
with gr.Column(): | |
# Audio download section | |
with gr.Row(): | |
audio_output_rag = gr.Audio( | |
label="🔊 Audio câu trả lời", | |
type="filepath" | |
) | |
# Text download section | |
with gr.Row(): | |
text_output_rag = gr.File( | |
label="📄 Văn bản câu trả lời", | |
file_count="single", | |
file_types=[".md", ".txt", ".docx"] | |
) | |
# Status indicator for RAG | |
rag_status_text = gr.HTML(""" | |
<div style="text-align: center; margin: 20px 0;"> | |
<div style=" | |
background: linear-gradient(135deg, #4ECDC4 0%, #44A08D 100%); | |
color: white; | |
padding: 15px; | |
border-radius: 12px; | |
box-shadow: 0 4px 15px rgba(78,205,196,0.3); | |
"> | |
<span style="font-weight: bold; font-size: 1.1em;">✅ Sẵn sàng xử lý tài liệu</span> | |
</div> | |
</div> | |
""") | |
# Helper function for RAG format | |
def get_rag_format_from_dropdown(format_choice):
    """Map a download-format dropdown label to ``"docx"``, ``"md"`` or ``"txt"``.

    Word markers are checked before Markdown markers; a label matching
    neither resolves to ``"txt"``.
    """
    marker_table = (
        ("docx", ("Word", "docx")),
        ("md", ("Markdown", "md")),
    )
    for file_format, markers in marker_table:
        if any(marker in format_choice for marker in markers):
            return file_format
    return "txt"
# Status banner helpers for the Voice RAG tab
def update_rag_status_processing():
    """Return the HTML status banner announcing that the document is being processed."""
    banner_lines = [
        "",
        '<div style="text-align: center; margin: 20px 0;">',
        '<div style="',
        "background: linear-gradient(135deg, #FF8E53 0%, #FF6B6B 100%);",
        "color: white;",
        "padding: 15px;",
        "border-radius: 12px;",
        "box-shadow: 0 4px 15px rgba(255,142,83,0.3);",
        '">',
        '<span style="font-weight: bold; font-size: 1.1em;">⏳ Đang xử lý tài liệu...</span>',
        "</div>",
        "</div>",
        "",
    ]
    # The empty first and last entries reproduce the leading and trailing
    # newline of the banner markup.
    return "\n".join(banner_lines)
def update_rag_status_complete():
    """Return the HTML banner shown once the RAG chain has finished."""
    # Static markup: green gradient card with the "done" message.
    complete_banner = """
<div style="text-align: center; margin: 20px 0;">
<div style="
background: linear-gradient(135deg, #4ECDC4 0%, #44A08D 100%);
color: white;
padding: 15px;
border-radius: 12px;
box-shadow: 0 4px 15px rgba(78,205,196,0.3);
">
<span style="font-weight: bold; font-size: 1.1em;">✅ Xử lý hoàn thành!</span>
</div>
</div>
"""
    return complete_banner
# Event handlers for Voice RAG | |
# Keep the voice list in sync with the country picked on the Voice RAG tab.
rag_country_dropdown.change(
    fn=update_voices,
    inputs=[rag_country_dropdown],
    outputs=[rag_voice_dropdown],
)

# Submit is a three-step chain: show the "processing" banner, run the
# pipeline, then show the "complete" banner.
_rag_started = submit_btn_rag.click(
    fn=lambda: update_rag_status_processing(),
    outputs=[rag_status_text],
)
_rag_answered = _rag_started.then(
    # The dropdown label is converted to a format key before the pipeline runs.
    fn=lambda document, query, answer_lang, voice_name, format_label: voice_rag_pipeline(
        document,
        query,
        answer_lang,
        voice_name,
        get_rag_format_from_dropdown(format_label),
    ),
    inputs=[
        file_upload_rag,
        question_input_rag,
        answer_language_dropdown_rag,
        rag_voice_dropdown,
        rag_text_format_dropdown,
    ],
    outputs=[
        answer_output_rag,
        detected_doc_language_rag,
        audio_output_rag,
        text_output_rag,
    ],
)
# NOTE(review): `.then` also runs when the previous step failed, so the
# "complete" banner may appear after an error - confirm this is intended.
_rag_answered.then(
    fn=lambda: update_rag_status_complete(),
    outputs=[rag_status_text],
)
# Voice Studio Tab | |
with gr.TabItem("🎤 Voice Studio"): | |
gr.HTML(""" | |
<div style="display: flex; justify-content: center; gap: 15px; margin: 20px 0; flex-wrap: wrap;"> | |
<div style="background: linear-gradient(135deg, #FF6B6B 0%, #FF8E53 100%); padding: 15px; border-radius: 10px; color: white; text-align: center; min-width: 150px;"> | |
<h4>🇻🇳 Tiếng Việt</h4> | |
<p style="margin: 0; font-size: 12px;">2 giọng chuẩn</p> | |
<p style="margin: 0; font-size: 10px;">HoaiMy • NamMinh</p> | |
</div> | |
<div style="background: linear-gradient(135deg, #4ECDC4 0%, #44A08D 100%); padding: 15px; border-radius: 10px; color: white; text-align: center; min-width: 150px;"> | |
<h4>🇺🇸🇬🇧 English</h4> | |
<p style="margin: 0; font-size: 12px;">4 giọng chuẩn</p> | |
<p style="margin: 0; font-size: 10px;">US • UK</p> | |
</div> | |
<div style="background: linear-gradient(135deg, #45B7D1 0%, #96C93D 100%); padding: 15px; border-radius: 10px; color: white; text-align: center; min-width: 150px;"> | |
<h4>🌍 Đa ngôn ngữ</h4> | |
<p style="margin: 0; font-size: 12px;">20 giọng chuẩn</p> | |
<p style="margin: 0; font-size: 10px;">10 ngôn ngữ</p> | |
</div> | |
</div> | |
""") | |
gr.Markdown("### 📝 Nhập nội dung và chọn giọng nói") | |
with gr.Row(): | |
text_input = gr.Textbox( | |
placeholder="Nhập văn bản cần chuyển thành giọng nói...", | |
lines=4, | |
label="Văn bản", | |
scale=2 | |
) | |
with gr.Row(): | |
with gr.Column(scale=1): | |
country_dropdown = gr.Dropdown( | |
choices=list(voice_choices_by_country.keys()), | |
value="🇻🇳 Việt Nam", | |
label="🌍 Chọn quốc gia" | |
) | |
with gr.Column(scale=1): | |
voice_dropdown = gr.Dropdown( | |
choices=voice_choices_by_country["🇻🇳 Việt Nam"], | |
value="🇻🇳 HoaiMy - Nữ Việt Chuẩn", | |
label="🎭 Chọn giọng nói" | |
) | |
with gr.Row(): | |
with gr.Column(scale=2): | |
speed_slider = gr.Slider( | |
minimum=0.5, | |
maximum=2.0, | |
value=1.0, | |
step=0.1, | |
label="⚡ Tốc độ phát" | |
) | |
with gr.Column(scale=1): | |
voice_studio_format_dropdown = gr.Dropdown( | |
choices=["Markdown (.md)", "TXT (.txt)", "Word (.docx)"] if DOCX_AVAILABLE else ["Markdown (.md)", "TXT (.txt)"], | |
value="Markdown (.md)", | |
label="📄 Định dạng file tải xuống" | |
) | |
# Translation feature | |
with gr.Row(): | |
with gr.Column(scale=1): | |
translate_checkbox = gr.Checkbox( | |
label="🌍 Dịch văn bản trước khi tạo giọng nói", | |
value=False | |
) | |
with gr.Column(scale=2): | |
translate_btn = gr.Button("🔄 DỊCH VĂN BẢN", variant="secondary", size="lg", visible=False) | |
# Show translated text when translation is enabled | |
translated_text_output = gr.Textbox( | |
label="📝 Văn bản đã dịch", | |
lines=3, | |
interactive=True, | |
visible=False, | |
placeholder="Văn bản sau khi dịch sẽ hiển thị ở đây..." | |
) | |
generate_btn = gr.Button("🎵 TẠO GIỌNG NÓI", variant="primary", size="lg") | |
# Status indicator for Voice Studio | |
studio_status_text = gr.HTML(""" | |
<div style="text-align: center; margin: 20px 0;"> | |
<div style=" | |
background: linear-gradient(135deg, #4ECDC4 0%, #44A08D 100%); | |
color: white; | |
padding: 15px; | |
border-radius: 12px; | |
box-shadow: 0 4px 15px rgba(78,205,196,0.3); | |
"> | |
<span style="font-weight: bold; font-size: 1.1em;">⚡ Sẵn sàng tạo giọng nói</span> | |
</div> | |
</div> | |
""") | |
gr.Markdown("### 🎧 Kết quả âm thanh") | |
audio_output_vs = gr.HTML( | |
value="<p style='text-align: center; color: #666; padding: 40px;'>Nhấn 'TẠO GIỌNG NÓI' để bắt đầu 🎤</p>" | |
) | |
# Download section for Voice Studio | |
with gr.Accordion("💾 Tải xuống kết quả", open=False): | |
gr.HTML(""" | |
<div style=" | |
background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%); | |
padding: 15px; | |
border-radius: 12px; | |
margin: 15px 0; | |
border-left: 4px solid #2196F3; | |
text-align: center; | |
"> | |
<h4 style="margin: 0 0 10px 0; color: #1976D2;">📄 Tải xuống văn bản với Markdown formatting</h4> | |
<p style="color: #1565C0; margin: 0; font-style: italic;"> | |
File chứa thông tin session, cấu hình giọng nói và technical details | |
</p> | |
</div> | |
""") | |
voice_studio_text_output = gr.File( | |
label="📄 Văn bản với thông tin chi tiết", | |
file_count="single", | |
file_types=[".md", ".txt", ".docx"] | |
) | |
# Examples section | |
gr.Markdown("### 📚 Ví dụ nhanh") | |
with gr.Row(): | |
example_vn = gr.Button("🇻🇳 Tiếng Việt", size="sm") | |
example_en = gr.Button("🇺🇸 English", size="sm") | |
example_de = gr.Button("🇩🇪 Deutsch", size="sm") | |
example_translate = gr.Button("🌍 Dịch thuật", size="sm") | |
# Example button functions | |
def load_vn_example():
    """Return the Vietnamese sample sentence and the matching country label."""
    sample_text = "Xin chào! Chào mừng bạn đến với studio giọng nói."
    country_label = "🇻🇳 Việt Nam"
    return sample_text, country_label


def load_en_example():
    """Return the English sample sentence and the matching country label."""
    sample_text = "Hello! Welcome to our voice studio."
    country_label = "🇺🇸 Hoa Kỳ"
    return sample_text, country_label


def load_de_example():
    """Return the German sample sentence and the matching country label."""
    sample_text = "Hallo! Willkommen in unserem Sprachstudio."
    country_label = "🇩🇪 Đức"
    return sample_text, country_label


def load_translate_example():
    """Return an English sample, its country label, and ``True`` for the translate checkbox."""
    sample_text = "Hello! This is an example text for translation."
    country_label = "🇺🇸 Hoa Kỳ"
    enable_translation = True
    return sample_text, country_label, enable_translation
# Translation functions | |
def toggle_translation_ui(translate_enabled):
    """Show or hide the translation widgets.

    Args:
        translate_enabled: Current state of the translate checkbox.

    Returns:
        Two ``gr.update`` objects, in order: one for the translate button and
        one for the translated-text box, both with ``visible`` set to the
        checkbox state.
    """
    button_update = gr.update(visible=translate_enabled)
    textbox_update = gr.update(visible=translate_enabled)
    return button_update, textbox_update
def translate_text_interface(text, voice_selection):
    """Translate the Voice Studio text into the language of the chosen voice.

    Args:
        text: Text typed by the user; may be ``None`` or blank.
        voice_selection: Voice label used to derive the target language.

    Returns:
        The translated text, or a prompt asking for input when ``text`` is
        missing or contains only whitespace.
    """
    # Guard against None as well as blank input: calling .strip() on None
    # would raise AttributeError before the user sees any message.
    if not text or not text.strip():
        return "Vui lòng nhập văn bản trước khi dịch"
    target_language = get_target_language_from_voice(voice_selection)
    return translate_text_with_gemini(text, target_language)
def create_voice_with_translation(original_text, translated_text, translate_enabled, voice_selection, speed, text_format="txt"):
    """Synthesize speech from the translated text when usable, else from the original.

    Args:
        original_text: Text as typed by the user.
        translated_text: Content of the translated-text box; may be ``None``,
            blank, an error message, or the "please enter text" prompt.
        translate_enabled: Whether the translate checkbox is ticked.
        voice_selection: Voice label forwarded to the synthesizer.
        speed: Playback speed forwarded to the synthesizer.
        text_format: Format key for the downloadable text file.

    Returns:
        Whatever ``create_audio_voice_studio`` returns for the selected text.
    """
    # The translated box can be None (never filled) - normalise before probing.
    candidate = (translated_text or "").strip()
    # Messages written into the translated box that are not real translations:
    # failures start with "Lỗi", and the empty-input prompt comes from
    # translate_text_interface. Neither should be read aloud.
    is_placeholder = (
        candidate.startswith("Lỗi")
        or candidate == "Vui lòng nhập văn bản trước khi dịch"
    )
    use_translation = bool(translate_enabled) and bool(candidate) and not is_placeholder

    spoken_text = translated_text if use_translation else original_text
    return create_audio_voice_studio(spoken_text, voice_selection, speed, text_format)
# Event handlers for Voice Studio | |
# Keep the Voice Studio voice list in sync with the selected country.
country_dropdown.change(
    fn=update_voices,
    inputs=[country_dropdown],
    outputs=[voice_dropdown],
)

# The three plain example buttons all fill the text box and the country
# dropdown; they are registered in the same order as before.
for _example_button, _example_loader in (
    (example_vn, load_vn_example),
    (example_en, load_en_example),
    (example_de, load_de_example),
):
    _example_button.click(
        fn=_example_loader,
        outputs=[text_input, country_dropdown],
    )

# The translation example additionally sets the translate checkbox.
example_translate.click(
    fn=load_translate_example,
    outputs=[text_input, country_dropdown, translate_checkbox],
)

# The checkbox state controls visibility of the translate button and result box.
translate_checkbox.change(
    fn=toggle_translation_ui,
    inputs=[translate_checkbox],
    outputs=[translate_btn, translated_text_output],
)

# The translate button fills the translated-text box from the current text and voice.
translate_btn.click(
    fn=translate_text_interface,
    inputs=[text_input, voice_dropdown],
    outputs=[translated_text_output],
)
# Helper function to extract format and process Voice Studio | |
def process_voice_studio(original_text, translated_text, translate_enabled, voice_selection, speed, format_choice):
    """Resolve the download format, then generate the Voice Studio audio.

    Args:
        original_text: Text as typed by the user.
        translated_text: Content of the translated-text box.
        translate_enabled: Whether the translate checkbox is ticked.
        voice_selection: Voice label forwarded to the synthesizer.
        speed: Playback speed forwarded to the synthesizer.
        format_choice: Label from the format dropdown, e.g. ``"Word (.docx)"``;
            may be ``None`` when nothing is selected.

    Returns:
        The result of ``create_voice_with_translation`` for these inputs.
    """
    # A dropdown without a selection yields None; `"Markdown" in None` would
    # raise TypeError, so fall through to the plain-text default instead.
    label = format_choice or ""
    if "Markdown" in label:
        text_format = "md"
    elif "Word" in label:
        text_format = "docx"
    else:
        text_format = "txt"
    return create_voice_with_translation(
        original_text, translated_text, translate_enabled, voice_selection, speed, text_format
    )
# Generate voice with translation support | |
# Generate button: inputs are passed in the parameter order of
# process_voice_studio; outputs are the audio HTML and the downloadable text.
generate_btn.click(
    process_voice_studio,
    [
        text_input,
        translated_text_output,
        translate_checkbox,
        voice_dropdown,
        speed_slider,
        voice_studio_format_dropdown,
    ],
    [audio_output_vs, voice_studio_text_output],
)
# Audio Translation Tab | |
with gr.TabItem("🎙️ Audio Translation"): | |
# Colorful feature cards like Voice Studio | |
gr.HTML(""" | |
<div style="display: flex; justify-content: center; gap: 15px; margin: 20px 0; flex-wrap: wrap;"> | |
<div style="background: linear-gradient(135deg, #FF6B6B 0%, #FF8E53 100%); padding: 15px; border-radius: 10px; color: white; text-align: center; min-width: 150px;"> | |
<h4>🎤 Ghi âm</h4> | |
<p style="margin: 0; font-size: 12px;">Microphone</p> | |
<p style="margin: 0; font-size: 10px;">Real-time</p> | |
</div> | |
<div style="background: linear-gradient(135deg, #4ECDC4 0%, #44A08D 100%); padding: 15px; border-radius: 10px; color: white; text-align: center; min-width: 150px;"> | |
<h4>📁 Upload</h4> | |
<p style="margin: 0; font-size: 12px;">Audio Files</p> | |
<p style="margin: 0; font-size: 10px;">WAV • MP3</p> | |
</div> | |
<div style="background: linear-gradient(135deg, #45B7D1 0%, #96C93D 100%); padding: 15px; border-radius: 10px; color: white; text-align: center; min-width: 150px;"> | |
<h4>🔄 AI Dịch</h4> | |
<p style="margin: 0; font-size: 12px;">13 ngôn ngữ</p> | |
<p style="margin: 0; font-size: 10px;">Gemini 2.0</p> | |
</div> | |
<div style="background: linear-gradient(135deg, #A855F7 0%, #EC4899 100%); padding: 15px; border-radius: 10px; color: white; text-align: center; min-width: 150px;"> | |
<h4>🎵 Tổng hợp</h4> | |
<p style="margin: 0; font-size: 12px;">Neural TTS</p> | |
<p style="margin: 0; font-size: 10px;">26 giọng</p> | |
</div> | |
</div> | |
""") | |
# Input section with colorful design | |
gr.HTML(""" | |
<div style=" | |
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); | |
color: white; | |
padding: 20px; | |
border-radius: 15px; | |
margin: 20px 0; | |
text-align: center; | |
box-shadow: 0 8px 32px rgba(0,0,0,0.2); | |
"> | |
<h3 style="margin: 0 0 10px 0;">🎤 Tải lên file audio hoặc ghi âm trực tiếp</h3> | |
<p style="margin: 0; opacity: 0.9; font-size: 0.95em;"> | |
Hỗ trợ file WAV, MP3 hoặc ghi âm real-time qua microphone | |
</p> | |
</div> | |
""") | |
# Enhanced microphone permission notice and controls | |
if not (os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID")): | |
gr.HTML(""" | |
<div id="microphone-section" style="margin: 15px 0;"> | |
<!-- Microphone Status Indicator --> | |
<div id="mic-status" style=" | |
background: linear-gradient(135deg, #e8f5e8 0%, #c8e6c9 100%); | |
color: #2e7d32; | |
padding: 12px; | |
border-radius: 8px; | |
margin-bottom: 10px; | |
text-align: center; | |
border: 1px solid #4caf50; | |
display: none; | |
"> | |
<strong>🎤 Microphone Ready</strong> - Bạn có thể ghi âm trực tiếp | |
</div> | |
<!-- Microphone Error/Permission Notice --> | |
<div id="microphone-notice" style=" | |
background: linear-gradient(135deg, #fff3cd 0%, #ffeaa7 100%); | |
color: #856404; | |
padding: 15px; | |
border-radius: 10px; | |
border: 1px solid #ffeaa7; | |
text-align: center; | |
display: none; | |
"> | |
<strong>🎤 Microphone Access Required</strong><br> | |
Để sử dụng ghi âm, vui lòng cho phép truy cập microphone.<br> | |
<button onclick="requestMicrophoneAccess()" style=" | |
background: #4caf50; | |
color: white; | |
padding: 8px 16px; | |
border: none; | |
border-radius: 6px; | |
cursor: pointer; | |
margin: 8px 4px; | |
">🎤 Kích hoạt Microphone</button> | |
<a href="#" onclick="window.open(window.location.href, '_blank')" style=" | |
background: #667eea; | |
color: white; | |
padding: 8px 16px; | |
text-decoration: none; | |
border-radius: 6px; | |
display: inline-block; | |
margin: 8px 4px; | |
">🔗 Mở cửa sổ mới</a> | |
</div> | |
<!-- Iframe Warning --> | |
<div id="iframe-warning" style=" | |
background: linear-gradient(135deg, #ffebee 0%, #ffcdd2 100%); | |
color: #c62828; | |
padding: 12px; | |
border-radius: 8px; | |
border: 1px solid #f44336; | |
text-align: center; | |
display: none; | |
"> | |
<strong>⚠️ Iframe Restriction</strong><br> | |
Microphone có thể bị hạn chế trong iframe. | |
<a href="#" onclick="window.open(window.location.href, '_blank')" style="color: #c62828; text-decoration: underline;"> | |
Mở trong cửa sổ mới | |
</a> để sử dụng đầy đủ tính năng. | |
</div> | |
</div> | |
<script> | |
// Enhanced microphone permission handling | |
let microphoneAccess = false; | |
function requestMicrophoneAccess() { | |
console.log('🎤 Audio Translation: Requesting microphone access...'); | |
// Use global microphone function if available | |
if (window.requestMicrophonePermission) { | |
window.requestMicrophonePermission(); | |
return; | |
} | |
// Fallback to local implementation | |
if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) { | |
navigator.mediaDevices.getUserMedia({ | |
audio: { | |
echoCancellation: true, | |
noiseSuppression: true, | |
autoGainControl: true, | |
sampleRate: 44100 | |
} | |
}) | |
.then(function(stream) { | |
console.log('✅ Audio Translation: Microphone access granted'); | |
microphoneAccess = true; | |
// Show success status | |
updateLocalMicrophoneUI('granted'); | |
// Stop the stream (we just wanted permission) | |
stream.getTracks().forEach(track => track.stop()); | |
// Trigger Gradio audio component refresh | |
setTimeout(() => { | |
const audioComponents = document.querySelectorAll('[data-testid*="audio"]'); | |
audioComponents.forEach(comp => { | |
// Try to trigger a refresh or re-initialization | |
if (comp.click) comp.click(); | |
}); | |
}, 500); | |
// Update global status if available | |
if (window.microphoneStatus) { | |
window.microphoneStatus.granted = true; | |
} | |
}) | |
.catch(function(err) { | |
console.log('❌ Audio Translation: Microphone access denied:', err); | |
updateLocalMicrophoneUI('denied', err.message); | |
}); | |
} else { | |
console.log('❌ getUserMedia not supported'); | |
updateLocalMicrophoneUI('unsupported'); | |
} | |
} | |
function updateLocalMicrophoneUI(status, errorMessage = '') { | |
const micStatus = document.getElementById('mic-status'); | |
const micNotice = document.getElementById('microphone-notice'); | |
switch(status) { | |
case 'granted': | |
if (micStatus) micStatus.style.display = 'block'; | |
if (micNotice) micNotice.style.display = 'none'; | |
microphoneAccess = true; | |
break; | |
case 'denied': | |
if (micNotice) { | |
micNotice.style.display = 'block'; | |
micNotice.innerHTML = ` | |
<strong>❌ Microphone Access Denied</strong><br> | |
Lỗi: ${errorMessage}<br> | |
Vui lòng kiểm tra cài đặt trình duyệt và cho phép microphone. | |
<br><br> | |
<button onclick="requestMicrophoneAccess()" style=" | |
background: #ff9800; | |
color: white; | |
padding: 8px 16px; | |
border: none; | |
border-radius: 6px; | |
cursor: pointer; | |
margin: 4px; | |
">🔄 Thử lại</button> | |
<button onclick="window.open(window.location.href, '_blank')" style=" | |
background: #2196f3; | |
color: white; | |
padding: 8px 16px; | |
border: none; | |
border-radius: 6px; | |
cursor: pointer; | |
margin: 4px; | |
">🔗 Mở cửa sổ mới</button> | |
`; | |
} | |
break; | |
case 'unsupported': | |
if (micNotice) { | |
micNotice.style.display = 'block'; | |
micNotice.innerHTML = ` | |
<strong>❌ Microphone Not Supported</strong><br> | |
Trình duyệt của bạn không hỗ trợ ghi âm.<br> | |
Vui lòng sử dụng Chrome, Firefox, Safari hoặc Edge phiên bản mới. | |
<br><br> | |
<a href="https://caniuse.com/stream" target="_blank" style=" | |
color: #856404; | |
text-decoration: underline; | |
">Kiểm tra tương thích trình duyệt</a> | |
`; | |
} | |
break; | |
default: | |
if (micNotice) { | |
micNotice.style.display = 'block'; | |
} | |
break; | |
} | |
} | |
// Listen for global microphone events | |
window.addEventListener('microphoneStatusChanged', function(event) { | |
console.log('🔄 Audio Translation: Received microphone status update:', event.detail); | |
updateLocalMicrophoneUI(event.detail.status, event.detail.errorMessage); | |
}); | |
window.addEventListener('microphoneGranted', function() { | |
console.log('✅ Audio Translation: Global microphone granted'); | |
updateLocalMicrophoneUI('granted'); | |
}); | |
// Check microphone availability on load | |
function checkMicrophoneAvailability() { | |
console.log('🔍 Audio Translation: Checking microphone availability...'); | |
// Check global status first | |
if (window.microphoneStatus) { | |
if (window.microphoneStatus.granted) { | |
updateLocalMicrophoneUI('granted'); | |
return; | |
} else if (!window.microphoneStatus.supported) { | |
updateLocalMicrophoneUI('unsupported'); | |
return; | |
} | |
} | |
// Check if we're in an iframe | |
if (window.location !== window.parent.location) { | |
console.log('Running in iframe'); | |
const iframeWarning = document.getElementById('iframe-warning'); | |
if (iframeWarning) { | |
setTimeout(() => { | |
iframeWarning.style.display = 'block'; | |
}, 1000); | |
} | |
} | |
// Try to get microphone permissions | |
if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) { | |
// Check if we already have permission | |
navigator.permissions.query({name: 'microphone'}).then(function(result) { | |
console.log('Microphone permission status:', result.state); | |
if (result.state === 'granted') { | |
const micStatus = document.getElementById('mic-status'); | |
if (micStatus) micStatus.style.display = 'block'; | |
microphoneAccess = true; | |
} else if (result.state === 'prompt' || result.state === 'denied') { | |
const micNotice = document.getElementById('microphone-notice'); | |
if (micNotice) { | |
setTimeout(() => { | |
micNotice.style.display = 'block'; | |
}, 1500); | |
} | |
} | |
// Listen for permission changes | |
result.onchange = function() { | |
console.log('Microphone permission changed to:', this.state); | |
if (this.state === 'granted') { | |
const micStatus = document.getElementById('mic-status'); | |
const micNotice = document.getElementById('microphone-notice'); | |
if (micStatus) micStatus.style.display = 'block'; | |
if (micNotice) micNotice.style.display = 'none'; | |
microphoneAccess = true; | |
} | |
}; | |
}).catch(function(err) { | |
console.log('Permission query failed:', err); | |
// Fallback to showing the notice | |
setTimeout(() => { | |
const micNotice = document.getElementById('microphone-notice'); | |
if (micNotice) micNotice.style.display = 'block'; | |
}, 2000); | |
}); | |
} else { | |
// Browser doesn't support getUserMedia | |
setTimeout(() => { | |
const micNotice = document.getElementById('microphone-notice'); | |
if (micNotice) { | |
micNotice.style.display = 'block'; | |
micNotice.innerHTML = ` | |
<strong>❌ Microphone Not Supported</strong><br> | |
Trình duyệt không hỗ trợ ghi âm. Vui lòng cập nhật trình duyệt. | |
`; | |
} | |
}, 1000); | |
} | |
} | |
// Initialize when DOM is ready | |
if (document.readyState === 'loading') { | |
document.addEventListener('DOMContentLoaded', checkMicrophoneAvailability); | |
} else { | |
checkMicrophoneAvailability(); | |
} | |
// Re-check periodically for dynamic content | |
setInterval(checkMicrophoneAvailability, 5000); | |
</script> | |
""") | |
else: | |
# Production mode - simple microphone notice | |
gr.HTML('<div style="text-align:center;color:#666;padding:10px;">📎 Upload audio file or use microphone</div>') | |
audio_input = gr.Audio( | |
label="📎 Tải lên file audio hoặc ghi âm trực tiếp", | |
type="numpy", # Use numpy to avoid temp file issues | |
sources=["upload", "microphone"], | |
show_label=True, | |
interactive=True, | |
elem_id="audio-input-translation" | |
) | |
# Audio Recording Control Buttons | |
with gr.Row(): | |
save_recording_btn = gr.Button( | |
"💾 Save Recording", | |
variant="secondary", | |
size="sm" | |
) | |
new_recording_btn = gr.Button( | |
"🎙️ New Record", | |
variant="primary", | |
size="sm" | |
) | |
# Button descriptions | |
gr.HTML(""" | |
<div style="display: flex; justify-content: space-between; margin: 5px 0 15px 0; font-size: 0.8em; color: #666;"> | |
<span>💾 Lưu file audio hiện tại vào record_data</span> | |
<span>🎙️ Xóa audio hiện tại để ghi âm mới</span> | |
</div> | |
""") | |
# Status for recording actions | |
recording_status = gr.HTML( | |
value="<p style='text-align: center; color: #666; font-style: italic;'>Sẵn sàng ghi âm hoặc tải lên file</p>" | |
) | |
# === RECORDED FILES FUNCTIONS === | |
def refresh_recorded_files():
    """Re-read the recorded files and return a dropdown with no selection."""
    available = get_recorded_files()
    print(f"🔄 Refreshing dropdown - found files: {available}")
    refreshed_dropdown = gr.Dropdown(choices=available, value=None)
    return refreshed_dropdown
def load_recorded_file(filename):
    """Read a saved recording for the numpy-typed playback component.

    Args:
        filename: Name chosen in the recorded-files dropdown; ``None`` or
            blank when nothing is selected.

    Returns:
        ``(sample_rate, audio_data)`` as read by ``soundfile``, or ``None``
        when no name was given, the file does not exist, or reading fails.
    """
    print(f"🎵 Loading recorded file: {filename}")

    # Nothing selected: report and bail out early.
    if not (filename and filename.strip()):
        print("❌ No filename provided")
        return None

    file_path = get_recorded_file_path(filename)
    print(f"📁 Full path: {file_path}")

    if not os.path.exists(file_path):
        print(f"❌ File not found: {file_path}")
        # List the parent directory to help diagnose a stale dropdown entry.
        parent_dir = os.path.dirname(file_path)
        listing = os.listdir(parent_dir) if os.path.exists(parent_dir) else 'Directory not found'
        print(f"📁 Directory contents: {listing}")
        return None

    file_size = os.path.getsize(file_path)
    print(f"✅ File exists, size: {file_size} bytes")
    try:
        # Imported lazily so soundfile is only loaded when a file is played.
        import soundfile as sf
        audio_data, sample_rate = sf.read(file_path)
        print(f"🎵 Loaded audio: shape={audio_data.shape}, sr={sample_rate}")
        # gr.Audio(type="numpy") expects (sample_rate, data), not (data, rate).
        return (sample_rate, audio_data)
    except Exception as e:
        print(f"❌ Error loading audio: {e}")
        return None
def use_recorded_for_translation(filename, country, voice, fmt):
    """Run the audio translation on a previously saved recording.

    Args:
        filename: Name chosen in the recorded-files dropdown; may be ``None``
            or blank.
        country: Target country label forwarded to ``translate_audio``.
        voice: Target voice label forwarded to ``translate_audio``.
        fmt: Format dropdown label, converted with ``get_format_from_dropdown``.

    Returns:
        The result of ``translate_audio`` when the file exists; otherwise an
        eight-element tuple of empty placeholders.
    """
    print(f"🔄 Using recorded file for translation: {filename}")

    if filename and filename.strip():
        file_path = get_recorded_file_path(filename)
        print(f"📁 Translation file path: {file_path}")
        if not os.path.exists(file_path):
            # Falls through to the shared "nothing to translate" result below.
            print(f"❌ File not found for translation: {file_path}")
        else:
            print(f"✅ Starting translation for: {filename}")
            # Same translation entry point as the live audio input, fed a path.
            return translate_audio(file_path, country, voice, get_format_from_dropdown(fmt))

    print("❌ No file selected for translation")
    # NOTE(review): eight placeholders are returned here, while the
    # audio_input.change handler in this file lists nine outputs - confirm the
    # count matches the outputs this callback is wired to.
    empty_result = ("", "", "", "", None, "", "", None)
    return empty_result
def prepare_recorded_file_download(filename):
    """Resolve a recorded file name to a path for the download component.

    Args:
        filename: Name chosen in the recorded-files dropdown; may be ``None``
            or blank.

    Returns:
        The file path when the file exists, otherwise ``None``.
    """
    print(f"📥 Preparing download for: {filename}")

    if filename and filename.strip():
        file_path = get_recorded_file_path(filename)
        print(f"📁 Download file path: {file_path}")
        if not os.path.exists(file_path):
            # Falls through to the shared "nothing to download" result below.
            print(f"❌ Download file not found: {file_path}")
        else:
            print(f"✅ File ready for download: {filename}")
            return file_path

    print("❌ No file selected for download")
    return None
def save_current_recording(audio_file):
    """Persist the current audio input and refresh the recorded-files dropdown.

    Args:
        audio_file: Value of the audio input component, or ``None`` when
            nothing has been recorded or uploaded.

    Returns:
        ``(status_html, dropdown)``: a coloured status paragraph and a
        ``gr.Dropdown`` listing the recorded files. On success the new file is
        pre-selected; on any failure the selection is cleared.
    """

    def _failure(message):
        # Every failure path re-lists the files and leaves the dropdown unselected.
        current_files = get_recorded_files()
        return (
            f"<p style='color: #e74c3c; text-align: center;'>{message}</p>",
            gr.Dropdown(choices=current_files, value=None),
        )

    if audio_file is None:
        return _failure("❌ Không có file audio để lưu")

    try:
        saved_path = save_recorded_audio(audio_file)
        if not saved_path:
            return _failure("❌ Lỗi khi lưu file")

        saved_filename = os.path.basename(saved_path)
        # Re-read the list after saving so the new file appears as a choice.
        updated_files = get_recorded_files()
        print(f"🔄 After save - updated files: {updated_files}")
        return (
            f"<p style='color: #27ae60; text-align: center;'>✅ Đã lưu: {saved_filename}</p>",
            gr.Dropdown(choices=updated_files, value=saved_filename),
        )
    except Exception as e:
        return _failure(f"❌ Lỗi: {str(e)}")
def clear_audio_for_new_recording():
    """Return ``None`` to clear the audio input, plus a "ready" status paragraph."""
    cleared_audio = None
    ready_message = "<p style='color: #3498db; text-align: center;'>🎙️ Sẵn sàng ghi âm mới</p>"
    return cleared_audio, ready_message
def delete_selected_file(filename):
    """Delete the selected recording and refresh the dropdown.

    Args:
        filename: Name chosen in the recorded-files dropdown; may be ``None``
            or blank.

    Returns:
        ``(status_html, dropdown, None)``: a coloured status paragraph, a
        ``gr.Dropdown`` with the current files and no selection, and ``None``
        to clear the playback component.
    """
    if not (filename and filename.strip()):
        current_files = get_recorded_files()
        return (
            "<p style='color: #e74c3c; text-align: center;'>❌ Vui lòng chọn file để xóa</p>",
            gr.Dropdown(choices=current_files, value=None),
            None,
        )

    delete_result = delete_recorded_file(filename)
    updated_files = get_recorded_files()

    # The helper's message carries a check mark on success: green then, red otherwise.
    status_color = "#27ae60" if "✅" in delete_result else "#e74c3c"
    status_html = f"<p style='color: {status_color}; text-align: center;'>{delete_result}</p>"

    return status_html, gr.Dropdown(choices=updated_files, value=None), None
# Recorded Files Management Section | |
with gr.Accordion("🎤 File đã ghi âm", open=False): | |
gr.HTML(""" | |
<div style=" | |
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); | |
color: white; | |
padding: 15px; | |
border-radius: 12px; | |
margin: 15px 0; | |
text-align: center; | |
"> | |
<h4 style="margin: 0 0 8px 0;">📁 Quản lý file đã ghi</h4> | |
<p style="margin: 0; opacity: 0.9; font-size: 0.9em;"> | |
Chọn file từ danh sách để phát lại hoặc dịch thuật | |
</p> | |
</div> | |
""") | |
# Refresh button for recorded files | |
refresh_files_btn = gr.Button( | |
"🔄 Làm mới danh sách", | |
variant="secondary", | |
size="sm" | |
) | |
# Status display for file operations | |
file_operation_status = gr.HTML( | |
value="<p style='text-align: center; color: #666; font-style: italic;'>Chọn file để thực hiện thao tác</p>" | |
) | |
# Dropdown for recorded files | |
initial_files = get_recorded_files() | |
print(f"🔍 Initial recorded files: {initial_files}") | |
recorded_files_dropdown = gr.Dropdown( | |
choices=initial_files, | |
label="📂 Chọn file đã ghi", | |
info="Các file audio đã được ghi âm trước đó" | |
) | |
# Preview and controls for selected file | |
with gr.Row(): | |
with gr.Column(): | |
# Audio player for selected file | |
recorded_audio_player = gr.Audio( | |
label="🎵 Phát lại file đã chọn", | |
interactive=False, | |
show_label=True, | |
type="numpy" # Use numpy for better compatibility | |
) | |
with gr.Column(): | |
# Action buttons | |
use_for_translation_btn = gr.Button( | |
"🔄 Sử dụng để dịch thuật", | |
variant="primary", | |
size="sm" | |
) | |
with gr.Row(): | |
download_recorded_btn = gr.Button( | |
"📥 Tải xuống", | |
variant="secondary", | |
size="sm" | |
) | |
delete_recorded_btn = gr.Button( | |
"🗑️ Xóa file", | |
variant="stop", | |
size="sm" | |
) | |
# Download link for recorded file | |
download_recorded_file = gr.File( | |
label="📥 File tải xuống", | |
visible=True, | |
file_count="single" | |
) | |
# Settings section with gradient header | |
gr.HTML(""" | |
<div style=" | |
background: linear-gradient(135deg, #FF6B6B 0%, #FF8E53 100%); | |
color: white; | |
padding: 18px; | |
border-radius: 12px; | |
margin: 25px 0 20px 0; | |
text-align: center; | |
box-shadow: 0 6px 24px rgba(255,107,107,0.3); | |
"> | |
<h3 style="margin: 0 0 8px 0;">🌍 Cài đặt dịch thuật</h3> | |
<p style="margin: 0; opacity: 0.9; font-size: 0.9em;"> | |
Chọn ngôn ngữ đích và giọng nói cho kết quả dịch thuật | |
</p> | |
</div> | |
""") | |
# Separate dropdowns without complex wrappers to avoid CSS conflicts | |
target_country_dropdown = gr.Dropdown( | |
choices=list(voice_choices_by_country.keys()), | |
value="🇻🇳 Việt Nam", | |
label="🌍 Chọn quốc gia đích" | |
) | |
target_voice_dropdown = gr.Dropdown( | |
choices=voice_choices_by_country["🇻🇳 Việt Nam"], | |
value="🇻🇳 HoaiMy - Nữ Việt Chuẩn", | |
label="🎭 Chọn giọng nói đích" | |
) | |
text_format_dropdown = gr.Dropdown( | |
choices=["Markdown (.md)", "TXT (.txt)", "Word (.docx)"] if DOCX_AVAILABLE else ["Markdown (.md)", "TXT (.txt)"], | |
value="Markdown (.md)", | |
label="📄 Định dạng file văn bản" | |
) | |
# Colorful action button | |
gr.HTML(""" | |
""") | |
# Auto-translate on audio upload - no manual button needed | |
# Results section with colorful headers | |
gr.HTML(""" | |
<div style=" | |
background: linear-gradient(135deg, #45B7D1 0%, #96C93D 100%); | |
color: white; | |
padding: 18px; | |
border-radius: 12px; | |
margin: 30px 0 20px 0; | |
text-align: center; | |
box-shadow: 0 6px 24px rgba(69,183,209,0.3); | |
"> | |
<h3 style="margin: 0 0 8px 0;">📊 Kết quả xử lý</h3> | |
<p style="margin: 0; opacity: 0.9; font-size: 0.9em;"> | |
Phiên âm, dịch thuật và tổng hợp giọng nói | |
</p> | |
</div> | |
""") | |
# Dynamic status indicator | |
status_text = gr.HTML("") | |
# Card-based layout for mobile | |
with gr.Column(elem_classes=["output-area"]): | |
# Original content card | |
gr.HTML(""" | |
<div style=" | |
background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%); | |
padding: 15px; | |
border-radius: 12px; | |
margin: 15px 0; | |
border-left: 4px solid #2196F3; | |
"> | |
<h4 style="margin: 0 0 10px 0; color: #1976D2;">📝 Nội dung gốc từ audio</h4> | |
</div> | |
""") | |
transcription_output = gr.Textbox( | |
label="🎯 Phiên âm từ audio", | |
lines=4, | |
interactive=False, | |
placeholder="Nội dung phiên âm từ file audio sẽ hiển thị ở đây...", | |
elem_classes=["mobile-textbox"] | |
) | |
detected_language = gr.Textbox( | |
label="🌐 Ngôn ngữ được phát hiện", | |
lines=1, | |
interactive=False, | |
placeholder="Tự động nhận diện...", | |
elem_classes=["mobile-textbox"] | |
) | |
# Translation result card | |
gr.HTML(""" | |
<div style=" | |
background: linear-gradient(135deg, #e8f5e8 0%, #c8e6c9 100%); | |
padding: 15px; | |
border-radius: 12px; | |
margin: 15px 0; | |
border-left: 4px solid #4CAF50; | |
"> | |
<h4 style="margin: 0 0 10px 0; color: #388E3C;">✨ Kết quả dịch thuật</h4> | |
</div> | |
""") | |
translation_output = gr.Textbox( | |
label="🔄 Nội dung đã dịch", | |
lines=4, | |
interactive=False, | |
placeholder="Bản dịch sẽ hiển thị ở đây...", | |
elem_classes=["mobile-textbox"] | |
) | |
target_language_display = gr.Textbox( | |
label="🎯 Ngôn ngữ đích", | |
lines=1, | |
interactive=False, | |
placeholder="Chưa chọn...", | |
elem_classes=["mobile-textbox"] | |
) | |
# Mobile-friendly comparison section | |
with gr.Accordion("🔍 So sánh nội dung", open=False): | |
gr.HTML(""" | |
<div style=" | |
text-align: center; | |
margin-bottom: 15px; | |
padding: 10px; | |
background: #f5f5f5; | |
border-radius: 8px; | |
"> | |
<p style="color: #666; font-style: italic; margin: 0;"> | |
Xem nội dung gốc và bản dịch để so sánh | |
</p> | |
</div> | |
""") | |
# Stack vertically on mobile for better readability | |
with gr.Column(): | |
gr.HTML(""" | |
<div style=" | |
background: #e3f2fd; | |
padding: 10px; | |
border-radius: 8px; | |
margin: 10px 0; | |
text-align: center; | |
font-weight: bold; | |
color: #1976D2; | |
">📝 Ngôn ngữ gốc</div> | |
""") | |
original_compare = gr.Textbox( | |
label="", | |
lines=4, | |
interactive=False, | |
show_label=False, | |
placeholder="Nội dung phiên âm từ audio sẽ hiển thị ở đây...", | |
elem_classes=["mobile-compare"] | |
) | |
gr.HTML(""" | |
<div style=" | |
background: #e8f5e8; | |
padding: 10px; | |
border-radius: 8px; | |
margin: 15px 0 5px 0; | |
text-align: center; | |
font-weight: bold; | |
color: #388E3C; | |
">✨ Sau khi dịch</div> | |
""") | |
translated_compare = gr.Textbox( | |
label="", | |
lines=4, | |
interactive=False, | |
show_label=False, | |
placeholder="Nội dung sau khi dịch sẽ hiển thị ở đây...", | |
elem_classes=["mobile-compare"] | |
) | |
# Mobile-optimized download section | |
with gr.Accordion("💾 Tải xuống kết quả", open=True): | |
gr.HTML(""" | |
<div style=" | |
background: linear-gradient(135deg, #fff3e0 0%, #ffcc80 100%); | |
padding: 15px; | |
border-radius: 12px; | |
margin: 15px 0; | |
border-left: 4px solid #FF9800; | |
text-align: center; | |
"> | |
<h4 style="margin: 0 0 10px 0; color: #E65100;">💾 Tải xuống kết quả</h4> | |
<p style="color: #BF360C; margin: 0; font-style: italic;"> | |
File audio và văn bản đã dịch | |
</p> | |
</div> | |
""") | |
# Stack downloads vertically for mobile | |
with gr.Column(): | |
gr.HTML(""" | |
<div style=" | |
background: #e3f2fd; | |
padding: 12px; | |
border-radius: 8px; | |
margin: 15px 0 10px 0; | |
text-align: center; | |
font-weight: bold; | |
color: #1976D2; | |
">🔊 Audio đã dịch</div> | |
""") | |
audio_output_at = gr.Audio( | |
label="🎵 Audio đã dịch", | |
type="filepath", | |
show_label=True, | |
elem_classes=["mobile-audio"], | |
format="wav" # Specify format explicitly | |
) | |
# Explicit download component for translated audio | |
audio_download_at = gr.File( | |
label="📥 Tải xuống audio đã dịch", | |
file_count="single", | |
file_types=[".wav"], | |
visible=True | |
) | |
gr.HTML(""" | |
<div style=" | |
background: #e8f5e8; | |
padding: 12px; | |
border-radius: 8px; | |
margin: 25px 0 10px 0; | |
text-align: center; | |
font-weight: bold; | |
color: #388E3C; | |
">📄 Văn bản đã dịch</div> | |
""") | |
# Download slot for the translated text file.
text_output = gr.File(
    label="",
    file_count="single",
    # ".md" is listed because the format dropdown defaults to "Markdown (.md)";
    # this matches the other text download components in this file.
    file_types=[".md", ".txt", ".docx"],
    show_label=False,
    elem_classes=["mobile-file"]
)
# Event handlers for Audio Translation with colorful status
def _render_status_banner(css_class, message):
    """Build the HTML for one status banner.

    Both status functions below share this template; only the CSS class of
    the outer ``<div>`` and the message text differ.

    Args:
        css_class: Class name placed on the outer ``<div>``.
        message: Text placed inside the bold ``<span>``.

    Returns:
        The banner markup as a string.
    """
    return f"""
<div class="{css_class}" style="
text-align: center;
margin: 20px 0;
padding: 15px;
border-radius: 12px;
color: white;
transition: all 0.3s ease;
">
<span style="font-weight: bold; font-size: 1.1em;">
{message}
</span>
</div>
"""


def update_status_processing():
    """Return the "processing" status banner (class ``status-processing``)."""
    return _render_status_banner("status-processing", "⚡ Đang tự động dịch thuật...")


def update_status_complete():
    """Return the "complete" status banner (class ``status-success``)."""
    return _render_status_banner("status-success", "✅ Dịch thuật hoàn thành!")
# When the target country changes, pass it to update_voices and write the
# result into the voice dropdown.
target_country_dropdown.change(
    fn=update_voices,
    inputs=[target_country_dropdown],
    outputs=[target_voice_dropdown],
)

# Update target language display when dropdown changes: the selected voice
# value is copied through unchanged.
target_voice_dropdown.change(
    fn=lambda voice: voice,
    inputs=[target_voice_dropdown],
    outputs=[target_language_display],
)
# Helper function to extract format
def get_format_from_dropdown(format_choice):
    """Map a text-format dropdown label to a file-format code.

    Args:
        format_choice: Label selected in the format dropdown. May be ``None``
            or empty when nothing is selected.

    Returns:
        ``"md"`` if the label contains "Markdown", ``"docx"`` if it contains
        "Word", otherwise ``"txt"`` (also used when there is no selection).
    """
    # A cleared dropdown yields None; `"Markdown" in None` would raise
    # TypeError, so fall back to plain text instead.
    if not format_choice:
        return "txt"
    if "Markdown" in format_choice:
        return "md"
    if "Word" in format_choice:
        return "docx"
    return "txt"
# Auto-translate when audio is uploaded or changed
def _no_audio_outputs(country):
    """Return placeholder values for the translation outputs when no audio is set.

    The tuple has one entry per component in the ``outputs`` list of the
    translation step below, in the same order. The previous inline fallback
    had only eight entries for nine outputs: the ``audio_download_at`` slot
    was missing, so the later values were shifted by one and the count did
    not match.

    Args:
        country: Current target-country value, echoed to the
            target-language display.

    Returns:
        A 9-tuple of placeholder values.
    """
    return (
        "",  # transcription_output
        "",  # detected_language
        "📎 Vui lòng tải lên file audio hoặc ghi âm",  # translation_output
        country,  # target_language_display
        None,  # audio_output_at
        None,  # audio_download_at
        "",  # original_compare
        "",  # translated_compare
        None,  # text_output
    )


audio_input.change(
    # Step 1: show the "processing" banner.
    fn=lambda: update_status_processing(),
    outputs=[status_text]
).then(
    # Step 2: translate when audio is present, otherwise emit placeholders.
    fn=lambda audio, country, voice, fmt: (
        translate_audio(audio, country, voice, get_format_from_dropdown(fmt))
        if audio is not None
        else _no_audio_outputs(country)
    ),
    inputs=[audio_input, target_country_dropdown, target_voice_dropdown, text_format_dropdown],
    outputs=[
        transcription_output,
        detected_language,
        translation_output,
        target_language_display,
        audio_output_at,
        audio_download_at,
        original_compare,
        translated_compare,
        text_output
    ]
).then(
    # Step 3: show the "complete" banner.
    fn=lambda: update_status_complete(),
    outputs=[status_text]
).then(
    # Step 4: refresh the recorded-files dropdown.
    fn=refresh_recorded_files,
    outputs=[recorded_files_dropdown]
)
# === RECORDED FILES EVENT HANDLERS ===

# Components filled by the translation step of the "use for translation"
# chain, in the order the handler's return values are assigned.
recorded_translation_outputs = [
    transcription_output,
    detected_language,
    translation_output,
    target_language_display,
    audio_output_at,
    audio_download_at,
    original_compare,
    translated_compare,
    text_output,
]

# Save current recording: the audio input value goes to
# save_current_recording; results go to the status field and file dropdown.
save_recording_btn.click(
    fn=save_current_recording,
    inputs=[audio_input],
    outputs=[recording_status, recorded_files_dropdown],
)

# New recording (clear audio).
new_recording_btn.click(
    fn=clear_audio_for_new_recording,
    outputs=[audio_input, recording_status],
)

# Manual refresh of the recorded-files dropdown.
refresh_files_btn.click(
    fn=refresh_recorded_files,
    outputs=[recorded_files_dropdown],
)

# Selecting an entry passes it to load_recorded_file; the result goes to
# the recorded-audio player.
recorded_files_dropdown.change(
    fn=load_recorded_file,
    inputs=[recorded_files_dropdown],
    outputs=[recorded_audio_player],
)

# Translate the selected recording: processing banner -> translation ->
# complete banner -> dropdown refresh.
use_for_translation_btn.click(
    fn=lambda: update_status_processing(),
    outputs=[status_text],
).then(
    fn=use_recorded_for_translation,
    inputs=[
        recorded_files_dropdown,
        target_country_dropdown,
        target_voice_dropdown,
        text_format_dropdown,
    ],
    outputs=recorded_translation_outputs,
).then(
    fn=lambda: update_status_complete(),
    outputs=[status_text],
).then(
    fn=refresh_recorded_files,
    outputs=[recorded_files_dropdown],
)

# The selected entry goes to prepare_recorded_file_download; the result is
# written to the download file component.
download_recorded_btn.click(
    fn=prepare_recorded_file_download,
    inputs=[recorded_files_dropdown],
    outputs=[download_recorded_file],
)

# The selected entry goes to delete_selected_file; results go to the
# status field, the dropdown and the player.
delete_recorded_btn.click(
    fn=delete_selected_file,
    inputs=[recorded_files_dropdown],
    outputs=[file_operation_status, recorded_files_dropdown, recorded_audio_player],
)
# Features section for Voice RAG.
# The static card/footer markup is named first; the layout follows.
voice_rag_card_html = """
<div style="background: linear-gradient(135deg, #FF6B6B 0%, #FF8E53 100%); padding: 20px; border-radius: 15px; color: white; text-align: center; margin: 10px;">
<h3>📚 Voice RAG</h3>
<p>Upload tài liệu và đặt câu hỏi. Nhận trả lời bằng giọng nói đa ngôn ngữ.</p>
<div style="margin-top: 15px;">
<div style="background: rgba(255,255,255,0.2); padding: 8px; border-radius: 8px; margin: 5px 0;">
✓ Hỗ trợ PDF, DOCX, TXT
</div>
<div style="background: rgba(255,255,255,0.2); padding: 8px; border-radius: 8px; margin: 5px 0;">
✓ AI Gemini 2.0 Flash
</div>
<div style="background: rgba(255,255,255,0.2); padding: 8px; border-radius: 8px; margin: 5px 0;">
✓ 24 giọng nói đa quốc gia
</div>
</div>
</div>
"""

audio_translation_card_html = """
<div style="background: linear-gradient(135deg, #4ECDC4 0%, #44A08D 100%); padding: 20px; border-radius: 15px; color: white; text-align: center; margin: 10px;">
<h3>🌍 Audio Translation</h3>
<p>Dịch thuật âm thanh sang nhiều ngôn ngữ với giọng nói tự nhiên.</p>
<div style="margin-top: 15px;">
<div style="background: rgba(255,255,255,0.2); padding: 8px; border-radius: 8px; margin: 5px 0;">
✓ Ghi âm real-time
</div>
<div style="background: rgba(255,255,255,0.2); padding: 8px; border-radius: 8px; margin: 5px 0;">
✓ 13 ngôn ngữ chính
</div>
<div style="background: rgba(255,255,255,0.2); padding: 8px; border-radius: 8px; margin: 5px 0;">
✓ Edge TTS Neural
</div>
</div>
</div>
"""

voice_studio_card_html = """
<div style="background: linear-gradient(135deg, #45B7D1 0%, #96C93D 100%); padding: 20px; border-radius: 15px; color: white; text-align: center; margin: 10px;">
<h3>🎤 Voice Studio</h3>
<p>Chuyển văn bản thành giọng nói với nhiều lựa chọn quốc gia và giọng nói.</p>
<div style="margin-top: 15px;">
<div style="background: rgba(255,255,255,0.2); padding: 8px; border-radius: 8px; margin: 5px 0;">
✓ 13 quốc gia
</div>
<div style="background: rgba(255,255,255,0.2); padding: 8px; border-radius: 8px; margin: 5px 0;">
✓ Tích hợp dịch thuật
</div>
<div style="background: rgba(255,255,255,0.2); padding: 8px; border-radius: 8px; margin: 5px 0;">
✓ Điều chỉnh tốc độ
</div>
</div>
</div>
"""

footer_html = """
<div class="custom-footer">
<div style="display: flex; justify-content: center; align-items: center; gap: 15px; flex-wrap: wrap;">
<div style="display: flex; align-items: center; gap: 8px;">
<div style="background: rgba(255,255,255,0.2); padding: 8px 15px; border-radius: 20px; font-size: 16px;">
🧠 DB
</div>
<span style="font-size: 18px; font-weight: bold;">Digitized Brains</span>
</div>
<div style="font-size: 14px; opacity: 0.9;">
Voice Studio - AI Powered
</div>
</div>
</div>
"""

gr.Markdown("### 📚 Tính năng chính")

# NOTE(review): row/column nesting is reconstructed from statement order
# (two cards in the first row, one card in the second); confirm against
# the original layout.
with gr.Row():
    with gr.Column():
        gr.HTML(voice_rag_card_html)
    with gr.Column():
        gr.HTML(audio_translation_card_html)

with gr.Row():
    with gr.Column():
        gr.HTML(voice_studio_card_html)

# Footer
gr.HTML(footer_html)

# Add JavaScript for button effects
gr.HTML(js_code)
if __name__ == "__main__":
    # Local imports kept exactly as in the original entry point.
    import sys
    import locale
    import os

    # True when either Hugging Face Spaces marker variable is set to a
    # non-empty value; evaluated once and reused by both branches below.
    on_hf_space = bool(os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID"))

    # Ensure UTF-8 encoding
    if sys.platform == 'win32':
        os.environ['PYTHONIOENCODING'] = 'utf-8'

    # Optimize startup for HF Spaces
    startup_stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"===== Application Startup at {startup_stamp} =====")

    # Only create the record_data directory eagerly outside of Spaces.
    if on_hf_space:
        print("🏭 Production mode - record_data will be created on first use")
    else:
        create_record_data_directory()
        print(f"📁 Record data directory ready: {RECORD_DATA_DIR}")

    # Set GRADIO_ALLOW_FLAGGING before launching.
    os.environ['GRADIO_ALLOW_FLAGGING'] = 'never'

    # GRADIO_TEMP_DIR is intentionally left unset (an earlier '/tmp'
    # override is disabled in the original).

    # Port selection: fixed 7860 on Spaces, otherwise GRADIO_SERVER_PORT
    # with 7880 as the default.
    if on_hf_space:
        port = 7860
        print("🏭 Using HF Spaces standard port 7860")
    else:
        port = int(os.environ.get("GRADIO_SERVER_PORT", 7880))
        print(f"🖥️ Using local development port {port}")

    launch_options = dict(
        server_name="0.0.0.0",
        server_port=port,
        share=False,
        show_error=True,
        ssr_mode=False,  # Disable SSR to prevent timeout issues on HF Spaces
        enable_monitoring=False,  # Disable monitoring for faster startup
    )
    demo.launch(**launch_options)