#!/usr/bin/env python3 # -*- coding: utf-8 -*- import os import sys # Set UTF-8 encoding for Windows if sys.platform == 'win32': import codecs sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach()) sys.stderr = codecs.getwriter('utf-8')(sys.stderr.detach()) # Load environment variables from .env file (optimized for HF Spaces) try: # Only load .env in local development, skip in production if not os.environ.get("SPACE_ID") and not os.environ.get("HF_SPACE_ID"): from dotenv import load_dotenv load_dotenv() print("✅ Environment variables loaded from .env file") else: print("🏭 Production environment - using system environment variables") except ImportError: print("⚠️ python-dotenv not installed. Using system environment variables only.") except Exception as e: print(f"⚠️ Error loading .env file: {e}") # Essential imports for HF Spaces import numpy as np import gradio as gr # Try to import google-generativeai with fallback try: import google.generativeai as genai GENAI_AVAILABLE = True except ImportError as e: print(f"⚠️ google-generativeai not available: {e}") GENAI_AVAILABLE = False genai = None try: from gtts import gTTS, lang GTTS_AVAILABLE = True except ImportError as e: print(f"⚠️ gtts not available: {e}") GTTS_AVAILABLE = False import tempfile # import soundfile as sf # Import locally to avoid startup overhead # Kokoro not used - removed for performance import time import base64 # Try to import optional dependencies try: import edge_tts EDGE_TTS_AVAILABLE = True except ImportError as e: print(f"⚠️ edge-tts not available: {e}") EDGE_TTS_AVAILABLE = False import asyncio import io try: import PyPDF2 PDF_AVAILABLE = True except ImportError: PDF_AVAILABLE = False try: import docx DOCX_AVAILABLE = True except ImportError: DOCX_AVAILABLE = False import shutil import atexit import glob import datetime # Librosa not used - removed for performance # === RECORD DATA MANAGEMENT === RECORD_DATA_DIR = "record_data" def create_record_data_directory(): """Create record_data directory if it doesn't exist""" if not os.path.exists(RECORD_DATA_DIR): os.makedirs(RECORD_DATA_DIR) print(f"✅ Created directory: {RECORD_DATA_DIR}") return RECORD_DATA_DIR def cleanup_record_data(): """Clean up record_data directory when app closes (disabled for production)""" try: # Disable cleanup for HF Spaces and production environments if os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID"): print(f"🏭 Production environment detected - keeping {RECORD_DATA_DIR} directory") return # Only cleanup in local development if os.path.exists(RECORD_DATA_DIR): shutil.rmtree(RECORD_DATA_DIR) print(f"🧹 Cleaned up {RECORD_DATA_DIR} directory") except Exception as e: print(f"⚠️ Error cleaning up {RECORD_DATA_DIR}: {e}") def save_recorded_audio(audio_data, original_filename=None): """Save audio data to record_data directory""" try: # Create directory if needed create_record_data_directory() # Generate filename with timestamp timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") if original_filename: name_part = os.path.splitext(os.path.basename(original_filename))[0] filename = f"recorded_{name_part}_{timestamp}.wav" else: filename = f"recorded_{timestamp}.wav" filepath = os.path.join(RECORD_DATA_DIR, filename) # Handle different audio data types if isinstance(audio_data, str) and os.path.exists(audio_data): # File path - copy the file shutil.copy2(audio_data, filepath) elif isinstance(audio_data, tuple) and len(audio_data) == 2: # Numpy array format (sample_rate, audio_array) sample_rate, audio_array = audio_data import soundfile as sf sf.write(filepath, audio_array, sample_rate) print(f"📊 Saved numpy audio: sr={sample_rate}, shape={audio_array.shape}") else: # Raw data with open(filepath, 'wb') as f: f.write(audio_data) print(f"✅ Saved recorded audio: {filepath}") return filepath except Exception as e: print(f"❌ Error saving recorded audio: {e}") import traceback traceback.print_exc() return None def get_recorded_files(): """Get list of recorded audio files""" try: if not os.path.exists(RECORD_DATA_DIR): print(f"📁 Record directory does not exist: {RECORD_DATA_DIR}") return [] # Get all audio files in record_data pattern = os.path.join(RECORD_DATA_DIR, "*.wav") files = glob.glob(pattern) print(f"🔍 Found {len(files)} files in {RECORD_DATA_DIR}") # Sort by modification time (newest first) files.sort(key=os.path.getmtime, reverse=True) # Return just filenames for display filenames = [os.path.basename(f) for f in files] print(f"📂 Returning filenames: {filenames}") return filenames except Exception as e: print(f"❌ Error getting recorded files: {e}") return [] def get_recorded_file_path(filename): """Get full path of recorded file""" return os.path.join(RECORD_DATA_DIR, filename) def delete_recorded_file(filename): """Delete recorded file from record_data directory""" try: if not filename or not filename.strip(): return "❌ Không có file nào được chọn để xóa" file_path = get_recorded_file_path(filename) print(f"🗑️ Attempting to delete: {file_path}") if os.path.exists(file_path): os.remove(file_path) print(f"✅ Successfully deleted: {filename}") return f"✅ Đã xóa file: {filename}" else: print(f"❌ File not found: {file_path}") return f"❌ Không tìm thấy file: {filename}" except Exception as e: print(f"❌ Error deleting file: {e}") return f"❌ Lỗi khi xóa file: {str(e)}" # Register cleanup function to run when app exits (disabled for stability) # atexit.register(cleanup_record_data) # Disabled to prevent data loss on deployment # DOCX support already checked above # Configure Gemini API - Delayed configuration for faster startup GEMINI_API_KEY = None def configure_gemini_api(): """Configure Gemini API on first use to speed up startup""" global GEMINI_API_KEY if not GENAI_AVAILABLE: print("❌ google-generativeai not available") return None if GEMINI_API_KEY is None: GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") if GEMINI_API_KEY: genai.configure(api_key=GEMINI_API_KEY) print("✅ Gemini API configured successfully") else: print("⚠️ GEMINI_API_KEY or GOOGLE_API_KEY not found in environment variables") return GEMINI_API_KEY # Language configurations for Audio Translation (simplified) if GTTS_AVAILABLE: GTTS_LANGUAGES = lang.tts_langs() GTTS_LANGUAGES['ja'] = 'Japanese' else: GTTS_LANGUAGES = {'en': 'English', 'vi': 'Vietnamese'} SUPPORTED_LANGUAGES = sorted(list(GTTS_LANGUAGES.values())) # Voice mapping for Edge TTS - defined once for performance VOICE_MAP = { "🇻🇳 HoaiMy - Nữ Việt Chuẩn": "vi-VN-HoaiMyNeural", "🇻🇳 NamMinh - Nam Việt Chuẩn": "vi-VN-NamMinhNeural", "🇺🇸 Aria - Nữ Mỹ": "en-US-AriaNeural", "🇺🇸 Guy - Nam Mỹ": "en-US-GuyNeural", "🇬🇧 Sonia - Nữ Anh": "en-GB-SoniaNeural", "🇬🇧 Ryan - Nam Anh": "en-GB-RyanNeural", "🇩🇪 Katja - Deutsche Frau": "de-DE-KatjaNeural", "🇩🇪 Conrad - Deutscher Mann": "de-DE-ConradNeural", "🇫🇷 Denise - Française": "fr-FR-DeniseNeural", "🇫🇷 Henri - Français": "fr-FR-HenriNeural", "🇪🇸 Elvira - Española": "es-ES-ElviraNeural", "🇪🇸 Alvaro - Español": "es-ES-AlvaroNeural", "🇮🇹 Elsa - Italiana": "it-IT-ElsaNeural", "🇮🇹 Diego - Italiano": "it-IT-DiegoNeural", "🇯🇵 Nanami - 日本女性": "ja-JP-NanamiNeural", "🇯🇵 Keita - 日本男性": "ja-JP-KeitaNeural", "🇰🇷 SunHi - 한국 여성": "ko-KR-SunHiNeural", "🇰🇷 BongJin - 한국 남성": "ko-KR-BongJinNeural", "🇨🇳 Xiaoxiao - 中文女声": "zh-CN-XiaoxiaoNeural", "🇨🇳 Yunxi - 中文男声": "zh-CN-YunxiNeural", "🇷🇺 Svetlana - Русская": "ru-RU-SvetlanaNeural", "🇷🇺 Dmitry - Русский": "ru-RU-DmitryNeural", "🇵🇹 Francisca - Portuguesa": "pt-BR-FranciscaNeural", "🇵🇹 Antonio - Português": "pt-BR-AntonioNeural", "🇸🇦 Zariyah - عربية": "ar-SA-ZariyahNeural", "🇸🇦 Hamed - عربي": "ar-SA-HamedNeural" } # Voice RAG Functions (Tích hợp từ hf_Voice_Audio_Translation) def read_pdf(file_path): """Extract text from PDF file""" try: with open(file_path, 'rb') as file: reader = PyPDF2.PdfReader(file) text = "" for page in reader.pages: text += page.extract_text() return text except Exception as e: return f"Error reading PDF: {str(e)}" def read_docx(file_path): """Extract text from Word document""" try: if not DOCX_AVAILABLE: return "❌ python-docx not available" doc = docx.Document(file_path) text = "" for paragraph in doc.paragraphs: text += paragraph.text + "\n" return text except Exception as e: return f"Error reading DOCX: {str(e)}" def read_txt(file_path): """Extract text from TXT file""" try: with open(file_path, 'r', encoding='utf-8') as file: return file.read() except Exception as e: return f"Error reading TXT: {str(e)}" def extract_text_from_file(file_path): """Extract text from various file formats""" if file_path is None: return "No file uploaded" file_extension = os.path.splitext(file_path)[1].lower() if file_extension == '.pdf': return read_pdf(file_path) elif file_extension == '.docx': return read_docx(file_path) elif file_extension == '.txt': return read_txt(file_path) else: return f"Unsupported file format: {file_extension}" def detect_language_from_text(text): """Detect language from text content""" # Vietnamese detection vietnamese_chars = 'àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ' if any(char in text.lower() for char in vietnamese_chars): return "Vietnamese" # Chinese detection chinese_chars = '中文汉字學習语言' if any(char in text for char in chinese_chars): return "Chinese" # Japanese detection japanese_chars = 'ひらがなカタカナ日本語' if any(char in text for char in japanese_chars): return "Japanese" # Korean detection korean_chars = '한국어문자' if any(char in text for char in korean_chars): return "Korean" # French detection french_words = ['le', 'la', 'les', 'de', 'et', 'à', 'un', 'une', 'ce', 'qui', 'que'] french_chars = 'àâäéèêëïîôöùûüÿç' if any(word in text.lower() for word in french_words) or any(char in text.lower() for char in french_chars): return "French" # German detection german_words = ['der', 'die', 'das', 'und', 'ist', 'ich', 'bin', 'haben', 'sein', 'werden'] german_chars = 'äöüß' if any(word in text.lower() for word in german_words) or any(char in text.lower() for char in german_chars): return "German" # Spanish detection spanish_words = ['el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'es', 'se', 'no', 'te', 'lo'] spanish_chars = 'ñáéíóúü' if any(word in text.lower() for word in spanish_words) or any(char in text.lower() for char in spanish_chars): return "Spanish" # English detection (default) english_words = ['the', 'and', 'is', 'are', 'have', 'has', 'will', 'would', 'can', 'could'] if any(word in text.lower() for word in english_words): return "English" return "English" # Default fallback def process_with_gemini(text, question, answer_language="Vietnamese"): """Process text and question using Gemini with multi-language support""" try: api_key = configure_gemini_api() if not api_key: return "❌ Lỗi: Chưa cấu hình GEMINI_API_KEY hoặc GOOGLE_API_KEY trong environment variables" model = genai.GenerativeModel("gemini-2.0-flash") # Detect document language detected_doc_language = detect_language_from_text(text) prompt = f""" Based on the following document content, please answer the question in {answer_language}: Document Content (detected language: {detected_doc_language}): {text} Question: {question} Please provide a comprehensive and accurate answer in {answer_language}. If the document is in a different language than the question, please still answer in {answer_language}. Maintain the factual accuracy while adapting cultural context appropriately. """ response = model.generate_content(prompt) return response.text except Exception as e: return f"Error processing with Gemini: {str(e)}" def text_to_speech_rag(text, voice_selection): """Convert text to speech using Edge TTS for RAG results""" try: if not text or text.startswith("Error"): return None # Use global VOICE_MAP for performance voice_name = VOICE_MAP.get(voice_selection, "vi-VN-HoaiMyNeural") text_limited = text[:2000] if len(text) > 2000 else text # Generate speech using Edge TTS audio_data = asyncio.run(generate_speech(text_limited, voice_name, 0.0)) # Save to temporary file fd, temp_output_path = tempfile.mkstemp(suffix=".wav", prefix="voice_rag_audio_") os.close(fd) # Write raw audio data to temporary file with open(temp_output_path, 'wb') as f: f.write(audio_data) return temp_output_path except Exception as e: print(f"TTS Error: {str(e)}") return None def voice_rag_pipeline(uploaded_file, question, answer_language="Vietnamese", voice_selection="🇻🇳 HoaiMy - Nữ Việt Chuẩn", text_format="txt"): """Complete Voice RAG pipeline with multi-language support and downloadable text""" if uploaded_file is None: return "Please upload a document first.", "N/A", None, None if not question.strip(): return "Please enter a question.", "N/A", None, None # Extract text from uploaded file extracted_text = extract_text_from_file(uploaded_file) if extracted_text.startswith("Error"): return extracted_text, "Error", None, None # Detect document language detected_doc_language = detect_language_from_text(extracted_text) # Process with Gemini using selected answer language answer = process_with_gemini(extracted_text, question, answer_language) # Generate speech using selected voice audio_file = text_to_speech_rag(answer, voice_selection) # Create formatted content for download if text_format.lower() == "md": # Create beautiful Markdown format formatted_content = format_voice_rag_response( question, answer, detected_doc_language, voice_selection ) text_file_path = create_text_file(formatted_content, "md", "voice_rag_response") else: # Create standard text file text_file_path = create_text_file(answer, text_format, "voice_rag_answer") return answer, detected_doc_language, audio_file, text_file_path def detect_language(text): """Detect language of input text with improved accuracy""" if not text.strip(): return "unknown" text_lower = text.lower() # Vietnamese detection (more comprehensive) vietnamese_chars = 'àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ' vietnamese_words = ['và', 'của', 'là', 'có', 'này', 'được', 'cho', 'từ', 'một', 'những', 'tôi', 'bạn'] vietnamese_score = sum(1 for char in text if char in vietnamese_chars) + sum(2 for word in vietnamese_words if word in text_lower) # English detection (more comprehensive) english_words = ['the', 'and', 'is', 'are', 'have', 'has', 'will', 'would', 'can', 'could', 'that', 'this', 'with', 'for', 'you', 'he', 'she', 'it', 'they', 'we'] english_score = sum(1 for word in english_words if word in text_lower) # German detection german_words = ['der', 'die', 'das', 'und', 'ist', 'ich', 'bin', 'haben', 'sein', 'werden', 'mit', 'auf', 'für', 'von'] german_chars = 'äöüß' german_score = sum(1 for word in german_words if word in text_lower) + sum(1 for char in text if char in german_chars) # French detection french_words = ['le', 'la', 'les', 'de', 'et', 'à', 'un', 'une', 'ce', 'qui', 'que', 'avec', 'pour', 'dans'] french_chars = 'àâäéèêëïîôöùûüÿç' french_score = sum(1 for word in french_words if word in text_lower) + sum(0.5 for char in text if char in french_chars) # Spanish detection spanish_words = ['el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'es', 'se', 'no', 'te', 'lo', 'con', 'para'] spanish_chars = 'ñáéíóúü' spanish_score = sum(1 for word in spanish_words if word in text_lower) + sum(0.5 for char in text if char in spanish_chars) # Score-based detection scores = { 'Vietnamese': vietnamese_score, 'English': english_score, 'German': german_score, 'French': french_score, 'Spanish': spanish_score } # Find the language with highest score max_score = max(scores.values()) if max_score > 0: detected = max(scores, key=scores.get) print(f"🔍 Language detection scores: {scores}") print(f"🎯 Detected language: {detected} (score: {max_score})") return detected # Default fallback print(f"⚠️ Could not detect language, defaulting to English") return "English" async def generate_speech(text, voice_name, rate): """Generate speech using Edge TTS""" communicate = edge_tts.Communicate(text, voice_name, rate=f"{rate:+.0%}") # Create in-memory buffer audio_buffer = io.BytesIO() async for chunk in communicate.stream(): if chunk["type"] == "audio": audio_buffer.write(chunk["data"]) audio_buffer.seek(0) return audio_buffer.getvalue() def create_text_file(content, file_format="txt", filename_prefix="translated_text"): """ Create a downloadable text file from content in TXT, DOCX, or MD format """ if not content or content.startswith("Lỗi:") or content.startswith("❌"): return None try: if file_format.lower() == "docx" and DOCX_AVAILABLE: # Create Word document fd, temp_file_path = tempfile.mkstemp(suffix=".docx", prefix=f"{filename_prefix}_") os.close(fd) if not DOCX_AVAILABLE: return None from docx import Document doc = Document() doc.add_heading('Nội dung đã dịch', 0) doc.add_paragraph(content) doc.save(temp_file_path) return temp_file_path elif file_format.lower() == "md": # Create Markdown file fd, temp_file_path = tempfile.mkstemp(suffix=".md", prefix=f"{filename_prefix}_") os.close(fd) with open(temp_file_path, 'w', encoding='utf-8') as f: f.write(content) return temp_file_path else: # Create TXT file (default) fd, temp_file_path = tempfile.mkstemp(suffix=".txt", prefix=f"{filename_prefix}_") os.close(fd) with open(temp_file_path, 'w', encoding='utf-8') as f: f.write(content) return temp_file_path except Exception as e: return None def format_voice_rag_response(question, answer, detected_language, voice_selection, timestamp=None): """ Format Voice RAG response as beautiful Markdown """ if timestamp is None: timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # Clean and format the answer formatted_answer = answer.strip() # Create beautiful Markdown document markdown_content = f"""# 📚 Voice RAG - Intelligent Document Q&A --- ## 📄 **Session Information** | **Field** | **Details** | |-----------|-------------| | 🕒 **Timestamp** | {timestamp} | | 🌍 **Document Language** | {detected_language} | | 🎭 **Voice Selection** | {voice_selection} | | 🤖 **AI Model** | Google Gemini 2.0 Flash | --- ## ❓ **Question** > {question} --- ## 💬 **AI Response** {formatted_answer} --- --- ## 📱 **Generated by** **🎙️ Voice AI Platform** - Digitized Brains *Powered by Claude Code & Google Gemini 2.0 Flash* > 🌐 **Voice RAG Technology** - Combining document intelligence with premium voice synthesis --- *Generated on {timestamp} | Voice: {voice_selection} | Language: {detected_language}* """ return markdown_content def format_voice_studio_response(text, voice_selection, speed, detected_language="Auto-detected", timestamp=None): """ Format Voice Studio response as simple Markdown """ if timestamp is None: timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # Clean and format the text formatted_text = text.strip() # Create simple Markdown document markdown_content = f"""# Voice Studio Result ## Input Text ({detected_language}) {formatted_text} --- *Generated on {timestamp} | Voice: {voice_selection} | Speed: {speed:.1f}x* """ return markdown_content def format_audio_translation_response(original_text, translated_text, source_language, target_language, voice_selection, timestamp=None): """ Format Audio Translation response as simple Markdown """ if timestamp is None: timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # Clean and format the texts formatted_original = original_text.strip() formatted_translated = translated_text.strip() # Create simple Markdown document markdown_content = f"""# Audio Translation Result ## Original Text ({source_language}) {formatted_original} ## Translated Text ({target_language}) {formatted_translated} --- *Generated on {timestamp} | {source_language} → {target_language} | Voice: {voice_selection}* """ return markdown_content def create_audio_voice_studio(text, voice_selection, speed, text_format="txt"): """Voice Studio functionality with text file generation""" if not text.strip(): return "❌ Vui lòng nhập văn bản / Please enter text / Bitte Text eingeben", None try: # Use global VOICE_MAP for performance (avoiding recreation on each call) voice_name = VOICE_MAP.get(voice_selection, "vi-VN-HoaiMyNeural") text_limited = text[:1000] if len(text) > 1000 else text # Convert speed (0.5-2.0) to rate percentage (-50% to +100%) rate_percent = (speed - 1.0) # Generate speech using Edge TTS audio_data = asyncio.run(generate_speech(text_limited, voice_name, rate_percent)) # Convert to base64 audio_base64 = base64.b64encode(audio_data).decode('utf-8') timestamp = int(time.time()) filename = f"voice_{voice_name}_{speed}x_{timestamp}.mp3" # Detect language detected_lang = detect_language(text_limited) # Mobile-optimized HTML player html_player = f'''

🎵 Âm thanh hoàn thành!

🎭 Giọng: {voice_selection}
⚡ Tốc độ: {speed:.1f}x | 🌍 Ngôn ngữ: {detected_lang.title()}
📝 Độ dài: {len(text_limited)} ký tự
📥 TẢI XUỐNG MP3
''' # Create text file based on format text_file_path = None if text_format == "md": # Use Markdown formatting function detected_language = detect_language(text_limited) markdown_content = format_voice_studio_response(text_limited, voice_selection, speed, detected_language) text_file_path = create_text_file(markdown_content, "md", "voice_studio") elif text_format == "docx": # Create Word document with Voice Studio formatting detected_language = detect_language(text_limited) markdown_content = format_voice_studio_response(text_limited, voice_selection, speed, detected_language) text_file_path = create_text_file(markdown_content, "docx", "voice_studio") elif text_format == "txt": # Create simple text file text_file_path = create_text_file(text_limited, "txt", "voice_studio") return html_player, text_file_path except Exception as e: return f"❌ Error: {str(e)}", None # Language mapping for voices - defined once for performance VOICE_TO_LANGUAGE = { # Vietnamese "🇻🇳 HoaiMy - Nữ Việt Chuẩn": "Vietnamese", "🇻🇳 NamMinh - Nam Việt Chuẩn": "Vietnamese", # English "🇺🇸 Aria - Nữ Mỹ": "English", "🇺🇸 Guy - Nam Mỹ": "English", "🇬🇧 Sonia - Nữ Anh": "English", "🇬🇧 Ryan - Nam Anh": "English", # German "🇩🇪 Katja - Deutsche Frau": "German", "🇩🇪 Conrad - Deutscher Mann": "German", # French "🇫🇷 Denise - Française": "French", "🇫🇷 Henri - Français": "French", # Spanish "🇪🇸 Elvira - Española": "Spanish", "🇪🇸 Alvaro - Español": "Spanish", # Italian "🇮🇹 Elsa - Italiana": "Italian", "🇮🇹 Diego - Italiano": "Italian", # Japanese "🇯🇵 Nanami - 日本女性": "Japanese", "🇯🇵 Keita - 日本男性": "Japanese", # Korean "🇰🇷 SunHi - 한국 여성": "Korean", "🇰🇷 BongJin - 한국 남성": "Korean", # Chinese "🇨🇳 Xiaoxiao - 中文女声": "Chinese", "🇨🇳 Yunxi - 中文男声": "Chinese", # Russian "🇷🇺 Svetlana - Русская": "Russian", "🇷🇺 Dmitry - Русский": "Russian", # Portuguese "🇵🇹 Francisca - Portuguesa": "Portuguese", "🇵🇹 Antonio - Português": "Portuguese", # Arabic "🇸🇦 Zariyah - عربية": "Arabic", "🇸🇦 Hamed - عربي": "Arabic" } def get_target_language_from_voice(voice_selection): """Map voice selection to target language for translation""" return VOICE_TO_LANGUAGE.get(voice_selection, "Vietnamese") def translate_text_with_gemini(text, target_language): """Translate text using Gemini API""" try: api_key = configure_gemini_api() if not api_key: return f"❌ Lỗi: Chưa cấu hình GEMINI_API_KEY hoặc GOOGLE_API_KEY trong environment variables" if not text.strip(): return "" model = genai.GenerativeModel("gemini-2.0-flash") prompt = f"""Translate the following text to {target_language}. Return ONLY the translated text, nothing else: {text}""" response = model.generate_content(prompt) translated_text = response.text.strip() # Clean up any unwanted text that might be included if translated_text.lower().startswith("translation:"): translated_text = translated_text[12:].strip() if translated_text.lower().startswith("here is"): lines = translated_text.split('\n') if len(lines) > 1: translated_text = '\n'.join(lines[1:]).strip() return translated_text except Exception as e: return f"Lỗi dịch thuật: {str(e)}" def translate_audio(audio_file, target_country, voice_selection, text_format="txt"): """ Transcribe, translate and synthesize audio to target language with Voice Studio integration """ try: api_key = configure_gemini_api() if not api_key: return "❌ Lỗi: Chưa cấu hình GEMINI_API_KEY hoặc GOOGLE_API_KEY trong environment variables", "Không xác định", "", target_country, None, None, "", "", None if audio_file is None: return "Lỗi: Vui lòng tải lên file audio", "Không xác định", "", target_country, None, None, "", "", None # Save recorded audio to record_data directory print(f"🔍 Processing audio file type: {type(audio_file)}") saved_audio_path = save_recorded_audio(audio_file) if saved_audio_path: print(f"🎤 Audio saved to record_data: {os.path.basename(saved_audio_path)}") # Debug: check if file really exists if os.path.exists(saved_audio_path): file_size = os.path.getsize(saved_audio_path) print(f"✅ File confirmed: {saved_audio_path} ({file_size} bytes)") else: print(f"❌ File not found after save: {saved_audio_path}") return "❌ Lỗi: Không thể lưu file audio", "Không xác định", "", target_country, None, None, "", "", None else: print("❌ Failed to save audio file") return "❌ Lỗi: Không thể lưu file audio", "Không xác định", "", target_country, None, None, "", "", None # Get target language from voice selection target_language = get_target_language_from_voice(voice_selection) # Transcribe audio using Gemini model = genai.GenerativeModel("gemini-2.0-flash") # Read audio file using saved path with open(saved_audio_path, 'rb') as f: audio_data = f.read() # Create audio blob audio_blob = { 'mime_type': 'audio/wav', 'data': audio_data } # Step 1: Transcribe audio only first transcribe_prompt = """Transcribe this audio accurately in its original language. Return only the transcribed text, nothing else.""" response = model.generate_content([transcribe_prompt, audio_blob]) transcription = response.text.strip() # Step 2: Detect language of transcription detected_lang = detect_language(transcription) # Step 3: Translate if needed (only if source is different from target) if detected_lang.lower() != target_language.lower(): print(f"🔄 Translating from {detected_lang} to {target_language}") translated_text = translate_text_with_gemini(transcription, target_language) # Check if translation was successful if translated_text.startswith("❌") or translated_text.startswith("Lỗi"): print(f"❌ Translation failed: {translated_text}") # Use original transcription if translation fails translated_text = transcription else: print(f"✅ Translation successful") else: print(f"ℹ️ No translation needed - same language ({detected_lang})") translated_text = transcription # Generate audio using Edge TTS (use global VOICE_MAP for performance) edge_voice = VOICE_MAP.get(voice_selection, "vi-VN-HoaiMyNeural") print(f"🎙️ Generating audio with voice: {edge_voice}") audio_data = asyncio.run(generate_speech(translated_text, edge_voice, 0.0)) print(f"🎵 Generated audio data: {len(audio_data)} bytes") # Save audio file fd, temp_output_path = tempfile.mkstemp(suffix=".wav", prefix="translated_audio_") os.close(fd) print(f"📁 Created temp audio file: {temp_output_path}") # Write raw audio data to temporary file with open(temp_output_path, 'wb') as f: f.write(audio_data) # Verify file was created if os.path.exists(temp_output_path): file_size = os.path.getsize(temp_output_path) print(f"✅ Audio file created successfully: {file_size} bytes") else: print(f"❌ Failed to create audio file: {temp_output_path}") # Create text file for download with proper formatting text_file_path = None if text_format == "md": # Use Markdown formatting function for Audio Translation markdown_content = format_audio_translation_response( transcription, translated_text, detected_lang, target_language, voice_selection ) text_file_path = create_text_file(markdown_content, "md", "audio_translation") elif text_format == "docx": # Create Word document with Audio Translation formatting markdown_content = format_audio_translation_response( transcription, translated_text, detected_lang, target_language, voice_selection ) text_file_path = create_text_file(markdown_content, "docx", "audio_translation") else: # Create simple text file text_file_path = create_text_file(translated_text, "txt", "audio_translation") return transcription, detected_lang, translated_text, target_language, temp_output_path, temp_output_path, transcription, translated_text, text_file_path except Exception as e: # Get target language for error response target_language = get_target_language_from_voice(voice_selection) if 'voice_selection' in locals() else "Vietnamese" return f"Lỗi: {str(e)}", "Lỗi", "", target_language, None, None, "", "", None # Voice choices organized by country - ONLY OFFICIAL VOICES voice_choices_by_country = { "🇻🇳 Việt Nam": [ "🇻🇳 HoaiMy - Nữ Việt Chuẩn", "🇻🇳 NamMinh - Nam Việt Chuẩn" ], "🇺🇸 Hoa Kỳ": [ "🇺🇸 Aria - Nữ Mỹ", "🇺🇸 Guy - Nam Mỹ" ], "🇬🇧 Anh": [ "🇬🇧 Sonia - Nữ Anh", "🇬🇧 Ryan - Nam Anh" ], "🇩🇪 Đức": [ "🇩🇪 Katja - Deutsche Frau", "🇩🇪 Conrad - Deutscher Mann" ], "🇫🇷 Pháp": [ "🇫🇷 Denise - Française", "🇫🇷 Henri - Français" ], "🇪🇸 Tây Ban Nha": [ "🇪🇸 Elvira - Española", "🇪🇸 Alvaro - Español" ], "🇮🇹 Ý": [ "🇮🇹 Elsa - Italiana", "🇮🇹 Diego - Italiano" ], "🇯🇵 Nhật Bản": [ "🇯🇵 Nanami - 日本女性", "🇯🇵 Keita - 日本男性" ], "🇰🇷 Hàn Quốc": [ "🇰🇷 SunHi - 한국 여성", "🇰🇷 BongJin - 한국 남성" ], "🇨🇳 Trung Quốc": [ "🇨🇳 Xiaoxiao - 中文女声", "🇨🇳 Yunxi - 中文男声" ], "🇷🇺 Nga": [ "🇷🇺 Svetlana - Русская", "🇷🇺 Dmitry - Русский" ], "🇵🇹 Bồ Đào Nha": [ "🇵🇹 Francisca - Portuguesa", "🇵🇹 Antonio - Português" ], "🇸🇦 Ả Rập": [ "🇸🇦 Zariyah - عربية", "🇸🇦 Hamed - عربي" ] } def update_voices(country): """Update voice choices based on selected country""" if country in voice_choices_by_country: voices = voice_choices_by_country[country] return gr.Dropdown(choices=voices, value=voices[0]) else: # Default to Vietnamese voices default_voices = voice_choices_by_country["🇻🇳 Việt Nam"] return gr.Dropdown(choices=default_voices, value=default_voices[0]) # Lightweight CSS - optimized for performance css = """ * { font-family: system-ui, -apple-system, 'Segoe UI', Arial, sans-serif; } .gradio-container { max-width: 1200px; margin: 0 auto; position: relative; } /* Critical fix for dropdown interaction */ .gradio-container * { pointer-events: auto; } /* Hide Gradio footer */ .footer { display: none !important; } /* Pulsing animation for processing status */ @keyframes pulse-processing { 0% { opacity: 1; transform: scale(1); box-shadow: 0 4px 15px rgba(255, 193, 7, 0.3); } 50% { opacity: 0.8; transform: scale(1.02); box-shadow: 0 6px 25px rgba(255, 193, 7, 0.6); } 100% { opacity: 1; transform: scale(1); box-shadow: 0 4px 15px rgba(255, 193, 7, 0.3); } } .status-processing { animation: pulse-processing 1.5s ease-in-out infinite; background: linear-gradient(135deg, #FFC107 0%, #FF9800 100%) !important; } /* Success status animation */ @keyframes pulse-success { 0% { opacity: 1; transform: scale(1); } 50% { opacity: 0.9; transform: scale(1.01); } 100% { opacity: 1; transform: scale(1); } } .status-success { animation: pulse-success 2s ease-in-out 3; background: linear-gradient(135deg, #4CAF50 0%, #2E7D32 100%) !important; } /* Custom footer to cover Gradio attribution */ .custom-footer { position: fixed; bottom: 0; left: 0; right: 0; background: linear-gradient(135deg, #4A90E2 0%, #2E86AB 70%, #FF8A65 85%, #FF6B9D 100%); color: white; padding: 15px; text-align: center; font-weight: bold; z-index: 1000; box-shadow: 0 -2px 10px rgba(0,0,0,0.1); } /* Add padding to body to account for fixed footer */ body { padding-bottom: 60px; } /* Mobile-first responsive design */ .input-card { background: rgba(255,255,255,0.95); border-radius: 16px; padding: 16px; margin: 10px 0; box-shadow: 0 4px 20px rgba(0,0,0,0.1); backdrop-filter: blur(10px); } .output-area { background: rgba(255,255,255,0.95); border-radius: 16px; padding: 16px; margin: 15px 0; min-height: 200px; box-shadow: 0 4px 20px rgba(0,0,0,0.1); } .examples-section { background: rgba(255,255,255,0.9); border-radius: 16px; padding: 16px; margin: 20px 0; } .main-header { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin-bottom: 20px; text-align: center; } .feature-box { background: #f8f9fa; padding: 15px; border-radius: 8px; margin: 10px 0; border-left: 4px solid #667eea; } .status-indicator { display: inline-block; padding: 5px 10px; border-radius: 15px; font-size: 12px; font-weight: bold; margin: 5px; } .status-success { background-color: #d4edda; color: #155724; } .status-processing { background-color: #fff3cd; color: #856404; } .comparison-section { border: 1px solid #e0e0e0; border-radius: 8px; padding: 15px; margin: 10px 0; background: #fafafa; } .language-label { font-weight: bold; color: #667eea; padding: 5px 10px; background: #f0f2ff; border-radius: 15px; display: inline-block; margin-bottom: 10px; font-size: 14px; } .content-compare { background: white; border: 1px solid #ddd; border-radius: 6px; padding: 12px; min-height: 120px; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; line-height: 1.5; } /* Reset any problematic dropdown styles */ .gradio-container * { pointer-events: auto; } /* Remove any potential blocking overlays */ .gradio-container::before, .gradio-container::after { display: none; } /* Ensure all interactive elements work */ button, select, input, textarea, .gr-dropdown { pointer-events: auto !important; position: relative !important; } /* Simple dropdown fix without complex selectors */ [class*="dropdown"] { position: relative !important; z-index: 999 !important; } [class*="dropdown"] * { pointer-events: auto !important; } /* Make sure no overlay blocks clicks */ .gradio-container .gr-form { position: relative; z-index: 1; } .gradio-container .gr-block { position: relative; z-index: 1; } .mobile-button { width: 100% !important; padding: 15px !important; font-size: 1.1em !important; margin: 20px 0 !important; border-radius: 12px !important; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important; border: none !important; color: white !important; font-weight: bold !important; box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3) !important; transition: all 0.3s ease !important; cursor: pointer !important; position: relative !important; overflow: hidden !important; } .mobile-button:hover { transform: translateY(-2px) !important; box-shadow: 0 8px 25px rgba(102, 126, 234, 0.4) !important; background: linear-gradient(135deg, #5a6fd8 0%, #6b4190 100%) !important; } .mobile-button:active { transform: translateY(0px) !important; box-shadow: 0 2px 10px rgba(102, 126, 234, 0.3) !important; } /* Ripple effect for button */ .mobile-button::before { content: ''; position: absolute; top: 50%; left: 50%; width: 0; height: 0; border-radius: 50%; background: rgba(255, 255, 255, 0.3); transform: translate(-50%, -50%); transition: width 0.6s, height 0.6s; } .mobile-button:active::before { width: 300px; height: 300px; } /* Loading spinner animation */ @keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } } .loading-spinner { display: inline-block; width: 20px; height: 20px; border: 3px solid rgba(255,255,255,0.3); border-radius: 50%; border-top-color: white; animation: spin 1s ease-in-out infinite; margin-right: 10px; } /* Button pulse effect when processing */ @keyframes pulse { 0% { box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3); } 50% { box-shadow: 0 8px 25px rgba(102, 126, 234, 0.6); } 100% { box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3); } } .button-processing { animation: pulse 2s ease-in-out infinite; background: linear-gradient(135deg, #FF8E53 0%, #FF6B6B 100%) !important; } .mobile-textbox textarea { border-radius: 10px !important; border: 2px solid #e0e0e0 !important; padding: 12px !important; font-size: 1em !important; line-height: 1.5 !important; } .mobile-compare textarea { border-radius: 8px !important; border: 1px solid #ddd !important; padding: 10px !important; background: #fafafa !important; font-size: 0.95em !important; } .mobile-audio { margin: 10px 0 !important; border-radius: 10px !important; } .mobile-file { margin: 10px 0 !important; border-radius: 10px !important; } /* Beautiful Markdown styling for Voice RAG responses */ .markdown-response { background: linear-gradient(135deg, #ffffff 0%, #f8fffe 100%); border-radius: 12px; padding: 20px; margin: 15px 0; box-shadow: 0 4px 20px rgba(0,0,0,0.1); border-left: 4px solid #4CAF50; } .markdown-response h1 { color: #2e7d32; border-bottom: 2px solid #4CAF50; padding-bottom: 10px; margin-bottom: 20px; font-size: 1.8em; } .markdown-response h2 { color: #388E3C; margin-top: 25px; margin-bottom: 15px; font-size: 1.4em; border-left: 3px solid #4CAF50; padding-left: 15px; } .markdown-response h3 { color: #43A047; margin-top: 20px; margin-bottom: 12px; font-size: 1.2em; } .markdown-response p { line-height: 1.6; margin-bottom: 12px; color: #333; } .markdown-response blockquote { background: linear-gradient(135deg, #e8f5e8 0%, #c8e6c9 100%); border-left: 4px solid #4CAF50; padding: 15px 20px; margin: 15px 0; border-radius: 8px; font-style: italic; color: #2e7d32; } .markdown-response table { width: 100%; border-collapse: collapse; margin: 15px 0; box-shadow: 0 2px 10px rgba(0,0,0,0.1); border-radius: 8px; overflow: hidden; } .markdown-response table th { background: linear-gradient(135deg, #4CAF50 0%, #45a049 100%); color: white; padding: 12px 15px; text-align: left; font-weight: bold; } .markdown-response table td { padding: 12px 15px; border-bottom: 1px solid #e0e0e0; background: white; } .markdown-response table tr:nth-child(even) td { background: #f8fffe; } .markdown-response table tr:hover td { background: #e8f5e8; transition: background 0.3s ease; } .markdown-response ul, .markdown-response ol { margin: 15px 0; padding-left: 25px; } .markdown-response li { margin-bottom: 8px; line-height: 1.5; } .markdown-response code { background: #f5f5f5; border: 1px solid #e0e0e0; border-radius: 4px; padding: 2px 6px; font-family: 'Courier New', monospace; color: #d32f2f; } .markdown-response pre { background: #f5f5f5; border: 1px solid #e0e0e0; border-radius: 8px; padding: 15px; overflow-x: auto; margin: 15px 0; } .markdown-response pre code { background: none; border: none; padding: 0; color: #333; } .markdown-response hr { border: none; height: 2px; background: linear-gradient(90deg, transparent, #4CAF50, transparent); margin: 25px 0; } .markdown-response strong { color: #2e7d32; font-weight: bold; } .markdown-response em { color: #388E3C; font-style: italic; } /* Responsive design for markdown */ @media (max-width: 768px) { .markdown-response { padding: 15px; margin: 10px 0; } .markdown-response table { font-size: 0.9em; } .markdown-response h1 { font-size: 1.6em; } .markdown-response h2 { font-size: 1.3em; } } /* Mobile responsive breakpoints */ @media (max-width: 768px) { .gradio-container { padding: 10px !important; } .input-card { padding: 12px !important; margin: 8px 0 !important; } .output-area { padding: 12px !important; margin: 10px 0 !important; } .examples-section { padding: 12px !important; } .main-header h2 { font-size: 1.5em !important; } .main-header p { font-size: 1em !important; } /* Mobile layout adjustments - less aggressive */ .gr-row { flex-direction: column; } .gr-column { width: 100%; margin-bottom: 15px; } } @media (max-width: 480px) { .gradio-container { padding: 5px !important; } .input-card { padding: 10px !important; margin: 5px 0 !important; } .main-header { padding: 15px !important; } .main-header h2 { font-size: 1.3em !important; } .mobile-button { padding: 12px !important; font-size: 1em !important; } } /* JavaScript for button interactions */ """ # Add JavaScript for button effects js_code = """ """ # Create interface with tabs with gr.Blocks(css=css, title="🎙️ Voice AI Platform - Voice RAG & Audio Translation") as demo: # Simplified header for faster loading on HF Spaces if not (os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID")): # Only load complex microphone permissions in local development gr.HTML("""

🎙️ Voice AI Platform

Voice RAG, Audio Translation và Voice Studio - Nền tảng AI giọng nói toàn diện

✨ Tính năng mới: Voice RAG với 24 giọng nói đa ngôn ngữ
🧠 Digitized Brains
""") else: # Production mode - minimal header gr.HTML('

🎙️ Voice AI Platform

') with gr.Tabs(): # Tab 1: Voice RAG with gr.TabItem("📚 Voice RAG"): # Header section with hf_voice style gr.HTML("""

📚 Voice RAG

Hỏi đáp tài liệu thông minh

🌍 Multi-Language

13 ngôn ngữ trả lời

🎤 Voice Output

24 giọng nói đa dạng

🔄 AI Gemini

Gemini 2.0 Flash

""") gr.Markdown("### 📝 Upload tài liệu và đặt câu hỏi") # Input section - Mobile optimized with gr.Column(): # Document upload with gr.Row(): file_upload_rag = gr.File( label="📎 Tải lên tài liệu (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".txt"] ) # Question input with gr.Row(): question_input_rag = gr.Textbox( label="❓ Câu hỏi của bạn", placeholder="Hãy đặt câu hỏi về nội dung tài liệu...", lines=3 ) # Language selection for answer with gr.Row(): answer_language_dropdown_rag = gr.Dropdown( choices=SUPPORTED_LANGUAGES, value="Vietnamese", label="🌍 Ngôn ngữ trả lời" ) # Voice selection từ Voice Studio with gr.Row(): with gr.Column(scale=1): rag_country_dropdown = gr.Dropdown( choices=list(voice_choices_by_country.keys()), value="🇻🇳 Việt Nam", label="🌍 Chọn quốc gia giọng nói" ) with gr.Column(scale=1): rag_voice_dropdown = gr.Dropdown( choices=voice_choices_by_country["🇻🇳 Việt Nam"], value="🇻🇳 HoaiMy - Nữ Việt Chuẩn", label="🎭 Chọn giọng nói" ) # Format selection for download with gr.Row(): rag_text_format_dropdown = gr.Dropdown( choices=["Markdown (.md)", "TXT (.txt)", "Word (.docx)"] if DOCX_AVAILABLE else ["Markdown (.md)", "TXT (.txt)"], value="Markdown (.md)", label="📄 Định dạng file trả lời" ) # Process button with gr.Row(): submit_btn_rag = gr.Button( "🚀 Xử lý tài liệu và trả lời", variant="primary", size="lg" ) # Results section - Mobile optimized with gr.Column(): # Document info section with gr.Accordion("📄 Thông tin tài liệu", open=True): detected_doc_language_rag = gr.Textbox( label="🌐 Ngôn ngữ tài liệu được phát hiện", lines=1, interactive=False, placeholder="Tự động nhận diện ngôn ngữ tài liệu..." ) # Text answer section with gr.Accordion("💬 Câu trả lời", open=True): gr.HTML("""

💬 AI Response with Markdown Formatting

Formatted response with tables, headers, and beautiful layout

""") answer_output_rag = gr.Markdown( value="**Câu trả lời sẽ xuất hiện ở đây sau khi xử lý...**\n\n*Hỗ trợ format Markdown với tables, headers, lists và nhiều style khác*", label="", show_label=False, elem_classes=["markdown-response"] ) # Downloads section - Mobile optimized with gr.Accordion("💾 Tải xuống kết quả", open=True): gr.HTML("""

Tải xuống câu trả lời dưới dạng file và audio

""") # Stack vertically on mobile with gr.Column(): # Audio download section with gr.Row(): audio_output_rag = gr.Audio( label="🔊 Audio câu trả lời", type="filepath" ) # Text download section with gr.Row(): text_output_rag = gr.File( label="📄 Văn bản câu trả lời", file_count="single", file_types=[".md", ".txt", ".docx"] ) # Status indicator for RAG rag_status_text = gr.HTML("""
✅ Sẵn sàng xử lý tài liệu
""") # Helper function for RAG format def get_rag_format_from_dropdown(format_choice): if "Word" in format_choice or "docx" in format_choice: return "docx" elif "Markdown" in format_choice or "md" in format_choice: return "md" return "txt" # RAG processing function def update_rag_status_processing(): return """
⏳ Đang xử lý tài liệu...
""" def update_rag_status_complete(): return """
✅ Xử lý hoàn thành!
""" # Event handlers for Voice RAG rag_country_dropdown.change( fn=update_voices, inputs=[rag_country_dropdown], outputs=[rag_voice_dropdown] ) submit_btn_rag.click( fn=lambda: update_rag_status_processing(), outputs=[rag_status_text] ).then( fn=lambda file, question, lang, voice, fmt: voice_rag_pipeline(file, question, lang, voice, get_rag_format_from_dropdown(fmt)), inputs=[file_upload_rag, question_input_rag, answer_language_dropdown_rag, rag_voice_dropdown, rag_text_format_dropdown], outputs=[answer_output_rag, detected_doc_language_rag, audio_output_rag, text_output_rag] ).then( fn=lambda: update_rag_status_complete(), outputs=[rag_status_text] ) # Voice Studio Tab with gr.TabItem("🎤 Voice Studio"): gr.HTML("""

🇻🇳 Tiếng Việt

2 giọng chuẩn

HoaiMy • NamMinh

🇺🇸🇬🇧 English

4 giọng chuẩn

US • UK

🌍 Đa ngôn ngữ

20 giọng chuẩn

10 ngôn ngữ

""") gr.Markdown("### 📝 Nhập nội dung và chọn giọng nói") with gr.Row(): text_input = gr.Textbox( placeholder="Nhập văn bản cần chuyển thành giọng nói...", lines=4, label="Văn bản", scale=2 ) with gr.Row(): with gr.Column(scale=1): country_dropdown = gr.Dropdown( choices=list(voice_choices_by_country.keys()), value="🇻🇳 Việt Nam", label="🌍 Chọn quốc gia" ) with gr.Column(scale=1): voice_dropdown = gr.Dropdown( choices=voice_choices_by_country["🇻🇳 Việt Nam"], value="🇻🇳 HoaiMy - Nữ Việt Chuẩn", label="🎭 Chọn giọng nói" ) with gr.Row(): with gr.Column(scale=2): speed_slider = gr.Slider( minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="⚡ Tốc độ phát" ) with gr.Column(scale=1): voice_studio_format_dropdown = gr.Dropdown( choices=["Markdown (.md)", "TXT (.txt)", "Word (.docx)"] if DOCX_AVAILABLE else ["Markdown (.md)", "TXT (.txt)"], value="Markdown (.md)", label="📄 Định dạng file tải xuống" ) # Translation feature with gr.Row(): with gr.Column(scale=1): translate_checkbox = gr.Checkbox( label="🌍 Dịch văn bản trước khi tạo giọng nói", value=False ) with gr.Column(scale=2): translate_btn = gr.Button("🔄 DỊCH VĂN BẢN", variant="secondary", size="lg", visible=False) # Show translated text when translation is enabled translated_text_output = gr.Textbox( label="📝 Văn bản đã dịch", lines=3, interactive=True, visible=False, placeholder="Văn bản sau khi dịch sẽ hiển thị ở đây..." ) generate_btn = gr.Button("🎵 TẠO GIỌNG NÓI", variant="primary", size="lg") # Status indicator for Voice Studio studio_status_text = gr.HTML("""
⚡ Sẵn sàng tạo giọng nói
""") gr.Markdown("### 🎧 Kết quả âm thanh") audio_output_vs = gr.HTML( value="

Nhấn 'TẠO GIỌNG NÓI' để bắt đầu 🎤

" ) # Download section for Voice Studio with gr.Accordion("💾 Tải xuống kết quả", open=False): gr.HTML("""

📄 Tải xuống văn bản với Markdown formatting

File chứa thông tin session, cấu hình giọng nói và technical details

""") voice_studio_text_output = gr.File( label="📄 Văn bản với thông tin chi tiết", file_count="single", file_types=[".md", ".txt", ".docx"] ) # Examples section gr.Markdown("### 📚 Ví dụ nhanh") with gr.Row(): example_vn = gr.Button("🇻🇳 Tiếng Việt", size="sm") example_en = gr.Button("🇺🇸 English", size="sm") example_de = gr.Button("🇩🇪 Deutsch", size="sm") example_translate = gr.Button("🌍 Dịch thuật", size="sm") # Example button functions def load_vn_example(): return "Xin chào! Chào mừng bạn đến với studio giọng nói.", "🇻🇳 Việt Nam" def load_en_example(): return "Hello! Welcome to our voice studio.", "🇺🇸 Hoa Kỳ" def load_de_example(): return "Hallo! Willkommen in unserem Sprachstudio.", "🇩🇪 Đức" def load_translate_example(): return "Hello! This is an example text for translation.", "🇺🇸 Hoa Kỳ", True # Translation functions def toggle_translation_ui(translate_enabled): """Show/hide translation UI elements""" return ( gr.update(visible=translate_enabled), # translate_btn gr.update(visible=translate_enabled) # translated_text_output ) def translate_text_interface(text, voice_selection): """Translate text for Voice Studio""" if not text.strip(): return "Vui lòng nhập văn bản trước khi dịch" target_language = get_target_language_from_voice(voice_selection) translated = translate_text_with_gemini(text, target_language) return translated def create_voice_with_translation(original_text, translated_text, translate_enabled, voice_selection, speed, text_format="txt"): """Create voice using original or translated text""" if translate_enabled and translated_text.strip() and not translated_text.startswith("Lỗi"): # Use translated text return create_audio_voice_studio(translated_text, voice_selection, speed, text_format) else: # Use original text return create_audio_voice_studio(original_text, voice_selection, speed, text_format) # Event handlers for Voice Studio country_dropdown.change( fn=update_voices, inputs=[country_dropdown], outputs=[voice_dropdown] ) example_vn.click( fn=load_vn_example, outputs=[text_input, country_dropdown] ) example_en.click( fn=load_en_example, outputs=[text_input, country_dropdown] ) example_de.click( fn=load_de_example, outputs=[text_input, country_dropdown] ) example_translate.click( fn=load_translate_example, outputs=[text_input, country_dropdown, translate_checkbox] ) # Translation UI toggle translate_checkbox.change( fn=toggle_translation_ui, inputs=[translate_checkbox], outputs=[translate_btn, translated_text_output] ) # Translation button translate_btn.click( fn=translate_text_interface, inputs=[text_input, voice_dropdown], outputs=[translated_text_output] ) # Helper function to extract format and process Voice Studio def process_voice_studio(original_text, translated_text, translate_enabled, voice_selection, speed, format_choice): """Process Voice Studio with format support""" # Extract format from dropdown if "Markdown" in format_choice: text_format = "md" elif "Word" in format_choice: text_format = "docx" else: text_format = "txt" return create_voice_with_translation(original_text, translated_text, translate_enabled, voice_selection, speed, text_format) # Generate voice with translation support generate_btn.click( fn=process_voice_studio, inputs=[text_input, translated_text_output, translate_checkbox, voice_dropdown, speed_slider, voice_studio_format_dropdown], outputs=[audio_output_vs, voice_studio_text_output] ) # Audio Translation Tab with gr.TabItem("🎙️ Audio Translation"): # Colorful feature cards like Voice Studio gr.HTML("""

🎤 Ghi âm

Microphone

Real-time

📁 Upload

Audio Files

WAV • MP3

🔄 AI Dịch

13 ngôn ngữ

Gemini 2.0

🎵 Tổng hợp

Neural TTS

26 giọng

""") # Input section with colorful design gr.HTML("""

🎤 Tải lên file audio hoặc ghi âm trực tiếp

Hỗ trợ file WAV, MP3 hoặc ghi âm real-time qua microphone

""") # Enhanced microphone permission notice and controls if not (os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID")): gr.HTML("""
""") else: # Production mode - simple microphone notice gr.HTML('
📎 Upload audio file or use microphone
') audio_input = gr.Audio( label="📎 Tải lên file audio hoặc ghi âm trực tiếp", type="numpy", # Use numpy to avoid temp file issues sources=["upload", "microphone"], show_label=True, interactive=True, elem_id="audio-input-translation" ) # Audio Recording Control Buttons with gr.Row(): save_recording_btn = gr.Button( "💾 Save Recording", variant="secondary", size="sm" ) new_recording_btn = gr.Button( "🎙️ New Record", variant="primary", size="sm" ) # Button descriptions gr.HTML("""
💾 Lưu file audio hiện tại vào record_data 🎙️ Xóa audio hiện tại để ghi âm mới
""") # Status for recording actions recording_status = gr.HTML( value="

Sẵn sàng ghi âm hoặc tải lên file

" ) # === RECORDED FILES FUNCTIONS === def refresh_recorded_files(): """Refresh the list of recorded files""" files = get_recorded_files() print(f"🔄 Refreshing dropdown - found files: {files}") return gr.Dropdown(choices=files, value=None) def load_recorded_file(filename): """Load selected recorded file for playback""" print(f"🎵 Loading recorded file: {filename}") if filename and filename.strip(): file_path = get_recorded_file_path(filename) print(f"📁 Full path: {file_path}") if os.path.exists(file_path): file_size = os.path.getsize(file_path) print(f"✅ File exists, size: {file_size} bytes") try: # Load audio as numpy array for Gradio compatibility import soundfile as sf audio_data, sample_rate = sf.read(file_path) print(f"🎵 Loaded audio: shape={audio_data.shape}, sr={sample_rate}") # Return tuple (sample_rate, audio_data) for Gradio numpy type return (sample_rate, audio_data) except Exception as e: print(f"❌ Error loading audio: {e}") return None else: print(f"❌ File not found: {file_path}") print(f"📁 Directory contents: {os.listdir(os.path.dirname(file_path)) if os.path.exists(os.path.dirname(file_path)) else 'Directory not found'}") else: print("❌ No filename provided") return None def use_recorded_for_translation(filename, country, voice, fmt): """Use selected recorded file for translation""" print(f"🔄 Using recorded file for translation: {filename}") if filename and filename.strip(): file_path = get_recorded_file_path(filename) print(f"📁 Translation file path: {file_path}") if os.path.exists(file_path): print(f"✅ Starting translation for: {filename}") # Use the same translation function return translate_audio(file_path, country, voice, get_format_from_dropdown(fmt)) else: print(f"❌ File not found for translation: {file_path}") # Return empty results if no file selected print("❌ No file selected for translation") return "", "", "", "", None, "", "", None def prepare_recorded_file_download(filename): """Prepare recorded file for download""" print(f"📥 Preparing download for: {filename}") if filename and filename.strip(): file_path = get_recorded_file_path(filename) print(f"📁 Download file path: {file_path}") if os.path.exists(file_path): print(f"✅ File ready for download: {filename}") return file_path else: print(f"❌ Download file not found: {file_path}") print("❌ No file selected for download") return None def save_current_recording(audio_file): """Save current audio recording to record_data""" if audio_file is None: current_files = get_recorded_files() return ( "

❌ Không có file audio để lưu

", gr.Dropdown(choices=current_files, value=None) ) try: saved_path = save_recorded_audio(audio_file) if saved_path: saved_filename = os.path.basename(saved_path) # Get updated file list after saving updated_files = get_recorded_files() print(f"🔄 After save - updated files: {updated_files}") return ( f"

✅ Đã lưu: {saved_filename}

", gr.Dropdown(choices=updated_files, value=saved_filename) ) else: current_files = get_recorded_files() return ( "

❌ Lỗi khi lưu file

", gr.Dropdown(choices=current_files, value=None) ) except Exception as e: current_files = get_recorded_files() return ( f"

❌ Lỗi: {str(e)}

", gr.Dropdown(choices=current_files, value=None) ) def clear_audio_for_new_recording(): """Clear audio input for new recording""" return ( None, # Clear audio input "

🎙️ Sẵn sàng ghi âm mới

" ) def delete_selected_file(filename): """Delete selected file and refresh dropdown""" if not filename or not filename.strip(): current_files = get_recorded_files() return ( "

❌ Vui lòng chọn file để xóa

", gr.Dropdown(choices=current_files, value=None), None # Clear audio player ) # Delete the file delete_result = delete_recorded_file(filename) # Refresh file list updated_files = get_recorded_files() # Determine status color based on result if "✅" in delete_result: status_html = f"

{delete_result}

" else: status_html = f"

{delete_result}

" return ( status_html, gr.Dropdown(choices=updated_files, value=None), None # Clear audio player ) # Recorded Files Management Section with gr.Accordion("🎤 File đã ghi âm", open=False): gr.HTML("""

📁 Quản lý file đã ghi

Chọn file từ danh sách để phát lại hoặc dịch thuật

""") # Refresh button for recorded files refresh_files_btn = gr.Button( "🔄 Làm mới danh sách", variant="secondary", size="sm" ) # Status display for file operations file_operation_status = gr.HTML( value="

Chọn file để thực hiện thao tác

" ) # Dropdown for recorded files initial_files = get_recorded_files() print(f"🔍 Initial recorded files: {initial_files}") recorded_files_dropdown = gr.Dropdown( choices=initial_files, label="📂 Chọn file đã ghi", info="Các file audio đã được ghi âm trước đó" ) # Preview and controls for selected file with gr.Row(): with gr.Column(): # Audio player for selected file recorded_audio_player = gr.Audio( label="🎵 Phát lại file đã chọn", interactive=False, show_label=True, type="numpy" # Use numpy for better compatibility ) with gr.Column(): # Action buttons use_for_translation_btn = gr.Button( "🔄 Sử dụng để dịch thuật", variant="primary", size="sm" ) with gr.Row(): download_recorded_btn = gr.Button( "📥 Tải xuống", variant="secondary", size="sm" ) delete_recorded_btn = gr.Button( "🗑️ Xóa file", variant="stop", size="sm" ) # Download link for recorded file download_recorded_file = gr.File( label="📥 File tải xuống", visible=True, file_count="single" ) # Settings section with gradient header gr.HTML("""

🌍 Cài đặt dịch thuật

Chọn ngôn ngữ đích và giọng nói cho kết quả dịch thuật

""") # Separate dropdowns without complex wrappers to avoid CSS conflicts target_country_dropdown = gr.Dropdown( choices=list(voice_choices_by_country.keys()), value="🇻🇳 Việt Nam", label="🌍 Chọn quốc gia đích" ) target_voice_dropdown = gr.Dropdown( choices=voice_choices_by_country["🇻🇳 Việt Nam"], value="🇻🇳 HoaiMy - Nữ Việt Chuẩn", label="🎭 Chọn giọng nói đích" ) text_format_dropdown = gr.Dropdown( choices=["Markdown (.md)", "TXT (.txt)", "Word (.docx)"] if DOCX_AVAILABLE else ["Markdown (.md)", "TXT (.txt)"], value="Markdown (.md)", label="📄 Định dạng file văn bản" ) # Colorful action button gr.HTML(""" """) # Auto-translate on audio upload - no manual button needed # Results section with colorful headers gr.HTML("""

📊 Kết quả xử lý

Phiên âm, dịch thuật và tổng hợp giọng nói

""") # Dynamic status indicator status_text = gr.HTML("") # Card-based layout for mobile with gr.Column(elem_classes=["output-area"]): # Original content card gr.HTML("""

📝 Nội dung gốc từ audio

""") transcription_output = gr.Textbox( label="🎯 Phiên âm từ audio", lines=4, interactive=False, placeholder="Nội dung phiên âm từ file audio sẽ hiển thị ở đây...", elem_classes=["mobile-textbox"] ) detected_language = gr.Textbox( label="🌐 Ngôn ngữ được phát hiện", lines=1, interactive=False, placeholder="Tự động nhận diện...", elem_classes=["mobile-textbox"] ) # Translation result card gr.HTML("""

✨ Kết quả dịch thuật

""") translation_output = gr.Textbox( label="🔄 Nội dung đã dịch", lines=4, interactive=False, placeholder="Bản dịch sẽ hiển thị ở đây...", elem_classes=["mobile-textbox"] ) target_language_display = gr.Textbox( label="🎯 Ngôn ngữ đích", lines=1, interactive=False, placeholder="Chưa chọn...", elem_classes=["mobile-textbox"] ) # Mobile-friendly comparison section with gr.Accordion("🔍 So sánh nội dung", open=False): gr.HTML("""

Xem nội dung gốc và bản dịch để so sánh

""") # Stack vertically on mobile for better readability with gr.Column(): gr.HTML("""
📝 Ngôn ngữ gốc
""") original_compare = gr.Textbox( label="", lines=4, interactive=False, show_label=False, placeholder="Nội dung phiên âm từ audio sẽ hiển thị ở đây...", elem_classes=["mobile-compare"] ) gr.HTML("""
✨ Sau khi dịch
""") translated_compare = gr.Textbox( label="", lines=4, interactive=False, show_label=False, placeholder="Nội dung sau khi dịch sẽ hiển thị ở đây...", elem_classes=["mobile-compare"] ) # Mobile-optimized download section with gr.Accordion("💾 Tải xuống kết quả", open=True): gr.HTML("""

💾 Tải xuống kết quả

File audio và văn bản đã dịch

""") # Stack downloads vertically for mobile with gr.Column(): gr.HTML("""
🔊 Audio đã dịch
""") audio_output_at = gr.Audio( label="🎵 Audio đã dịch", type="filepath", show_label=True, elem_classes=["mobile-audio"], format="wav" # Specify format explicitly ) # Explicit download component for translated audio audio_download_at = gr.File( label="📥 Tải xuống audio đã dịch", file_count="single", file_types=[".wav"], visible=True ) gr.HTML("""
📄 Văn bản đã dịch
""") text_output = gr.File( label="", file_count="single", file_types=[".txt", ".docx"], show_label=False, elem_classes=["mobile-file"] ) # Event handlers for Audio Translation with colorful status def update_status_processing(): return """
⚡ Đang tự động dịch thuật...
""" def update_status_complete(): return """
✅ Dịch thuật hoàn thành!
""" target_country_dropdown.change( fn=update_voices, inputs=[target_country_dropdown], outputs=[target_voice_dropdown] ) # Update target language display when dropdown changes target_voice_dropdown.change( fn=lambda voice: voice, inputs=[target_voice_dropdown], outputs=[target_language_display] ) # Helper function to extract format def get_format_from_dropdown(format_choice): if "Markdown" in format_choice: return "md" elif "Word" in format_choice: return "docx" return "txt" # Auto-translate when audio is uploaded or changed audio_input.change( fn=lambda: update_status_processing(), outputs=[status_text] ).then( fn=lambda audio, country, voice, fmt: translate_audio(audio, country, voice, get_format_from_dropdown(fmt)) if audio is not None else ("", "", "📎 Vui lòng tải lên file audio hoặc ghi âm", country, None, "", "", None), inputs=[audio_input, target_country_dropdown, target_voice_dropdown, text_format_dropdown], outputs=[ transcription_output, detected_language, translation_output, target_language_display, audio_output_at, audio_download_at, original_compare, translated_compare, text_output ] ).then( fn=lambda: update_status_complete(), outputs=[status_text] ).then( fn=refresh_recorded_files, outputs=[recorded_files_dropdown] ) # === RECORDED FILES EVENT HANDLERS === # Save current recording save_recording_btn.click( fn=save_current_recording, inputs=[audio_input], outputs=[recording_status, recorded_files_dropdown] ) # New recording (clear audio) new_recording_btn.click( fn=clear_audio_for_new_recording, outputs=[audio_input, recording_status] ) refresh_files_btn.click( fn=refresh_recorded_files, outputs=[recorded_files_dropdown] ) recorded_files_dropdown.change( fn=load_recorded_file, inputs=[recorded_files_dropdown], outputs=[recorded_audio_player] ) use_for_translation_btn.click( fn=lambda: update_status_processing(), outputs=[status_text] ).then( fn=use_recorded_for_translation, inputs=[recorded_files_dropdown, target_country_dropdown, target_voice_dropdown, text_format_dropdown], outputs=[ transcription_output, detected_language, translation_output, target_language_display, audio_output_at, audio_download_at, original_compare, translated_compare, text_output ] ).then( fn=lambda: update_status_complete(), outputs=[status_text] ).then( fn=refresh_recorded_files, outputs=[recorded_files_dropdown] ) download_recorded_btn.click( fn=prepare_recorded_file_download, inputs=[recorded_files_dropdown], outputs=[download_recorded_file] ) delete_recorded_btn.click( fn=delete_selected_file, inputs=[recorded_files_dropdown], outputs=[file_operation_status, recorded_files_dropdown, recorded_audio_player] ) # Features section cho Voice RAG gr.Markdown("### 📚 Tính năng chính") with gr.Row(): with gr.Column(): gr.HTML("""

📚 Voice RAG

Upload tài liệu và đặt câu hỏi. Nhận trả lời bằng giọng nói đa ngôn ngữ.

✓ Hỗ trợ PDF, DOCX, TXT
✓ AI Gemini 2.0 Flash
✓ 24 giọng nói đa quốc gia
""") with gr.Column(): gr.HTML("""

🌍 Audio Translation

Dịch thuật âm thanh sang nhiều ngôn ngữ với giọng nói tự nhiên.

✓ Ghi âm real-time
✓ 13 ngôn ngữ chính
✓ Edge TTS Neural
""") with gr.Row(): with gr.Column(): gr.HTML("""

🎤 Voice Studio

Chuyển văn bản thành giọng nói với nhiều lựa chọn quốc gia và giọng nói.

✓ 13 quốc gia
✓ Tích hợp dịch thuật
✓ Điều chỉnh tốc độ
""") # Footer gr.HTML(""" """) # Add JavaScript for button effects gr.HTML(js_code) if __name__ == "__main__": import sys import locale import os # Ensure UTF-8 encoding if sys.platform == 'win32': os.environ['PYTHONIOENCODING'] = 'utf-8' # Optimize startup for HF Spaces print(f"===== Application Startup at {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====") # Only create record_data directory when actually needed to speed up startup if not os.environ.get("SPACE_ID") and not os.environ.get("HF_SPACE_ID"): create_record_data_directory() print(f"📁 Record data directory ready: {RECORD_DATA_DIR}") else: print(f"🏭 Production mode - record_data will be created on first use") # Set environment variables for iframe support os.environ['GRADIO_ALLOW_FLAGGING'] = 'never' # Disable Gradio temp directory to prevent file serving issues # os.environ['GRADIO_TEMP_DIR'] = '/tmp' # Hugging Face Spaces configuration - Use standard port 7860 for HF if os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID"): # HF Spaces standard configuration port = 7860 print("🏭 Using HF Spaces standard port 7860") else: # Local development port = int(os.environ.get("GRADIO_SERVER_PORT", 7880)) print(f"🖥️ Using local development port {port}") demo.launch( server_name="0.0.0.0", server_port=port, share=False, show_error=True, ssr_mode=False, # Disable SSR to prevent timeout issues on HF Spaces enable_monitoring=False # Disable monitoring for faster startup )