Michael Hu
remove all tts providers
6825e46
"""
Main entry point for the Audio Translation Web Application
Handles file upload, processing pipeline, and UI rendering using DDD architecture with Gradio
"""
import logging
# Configure the root logger once, at import time: INFO level and above, one
# shared line format, and two handlers so every record goes to both a file
# and the console.
# NOTE(review): this runs before the gradio and src.* imports further down,
# presumably so that records emitted while those modules are imported already
# use this configuration - confirm the ordering is intentional.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
# Relative path: "app.log" is created in the current working directory.
logging.FileHandler("app.log"),
# No stream argument, so the handler's default stream is used.
logging.StreamHandler()
]
)
# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)
import gradio as gr
import os
import json
from typing import Optional, Tuple, Dict, Any
# Import application services and DTOs
from src.application.services.audio_processing_service import AudioProcessingApplicationService
from src.application.services.configuration_service import ConfigurationApplicationService
from src.application.dtos.audio_upload_dto import AudioUploadDto
from src.application.dtos.processing_request_dto import ProcessingRequestDto
from src.application.dtos.processing_result_dto import ProcessingResultDto
# Import infrastructure setup
from src.infrastructure.config.container_setup import initialize_global_container, get_global_container
# Make sure the working directories for uploads and outputs exist as soon as
# this module is imported. The paths are relative, so they are created under
# the current working directory; exist_ok=True makes the calls a no-op when
# the directories are already there.
# NOTE(review): nothing else in this file reads these directories - presumably
# they are used by the processing services; confirm.
os.makedirs("temp/uploads", exist_ok=True)
os.makedirs("temp/outputs", exist_ok=True)
# Module-level flag set to True by initialize_application() after the
# dependency injection container has been set up, so that setup runs only once.
container_initialized = False
def initialize_application():
    """Initialize the dependency injection container once per process.

    The first successful call runs ``initialize_global_container()`` and sets
    the module-level ``container_initialized`` flag; later calls return
    immediately. If initialization fails the flag stays ``False``, so a
    subsequent call will try again.

    Raises:
        RuntimeError: If the container could not be initialized. The original
            exception is attached as ``__cause__``.
    """
    global container_initialized

    # Guard clause: nothing to do once the container is in place.
    if container_initialized:
        return

    try:
        logger.info("Initializing application container")
        initialize_global_container()
    except Exception as e:
        # Log the traceback as well, matching the other handlers in this file,
        # and chain the cause explicitly so callers can inspect it.
        logger.error(f"Failed to initialize application: {e}", exc_info=True)
        raise RuntimeError(f"Application initialization failed: {str(e)}") from e
    else:
        container_initialized = True
        logger.info("Application container initialized successfully")
def create_audio_upload_dto(audio_file_path: str) -> AudioUploadDto:
    """
    Build an AudioUploadDto by reading an audio file from disk.

    The whole file is read into memory. The content type is looked up from the
    lower-cased file extension; unrecognised extensions fall back to
    'audio/wav'.

    Args:
        audio_file_path: Path to the uploaded audio file

    Returns:
        AudioUploadDto: DTO carrying the filename, raw bytes, content type
            and size in bytes

    Raises:
        ValueError: If the path is empty or does not exist, or if reading the
            file or constructing the DTO fails for any reason (the underlying
            error text is included in the message).
    """
    # Extension -> MIME type lookup table
    mime_by_extension = {
        '.wav': 'audio/wav',
        '.mp3': 'audio/mpeg',
        '.m4a': 'audio/mp4',
        '.flac': 'audio/flac',
        '.ogg': 'audio/ogg'
    }

    try:
        path_is_usable = bool(audio_file_path) and os.path.exists(audio_file_path)
        if not path_is_usable:
            raise ValueError("No audio file provided or file does not exist")

        name = os.path.basename(audio_file_path)

        with open(audio_file_path, 'rb') as audio_stream:
            payload = audio_stream.read()

        # Extension of the lower-cased name, so the lookup is case-insensitive
        extension = os.path.splitext(name.lower())[1]
        mime_type = mime_by_extension.get(extension, 'audio/wav')

        return AudioUploadDto(
            filename=name,
            content=payload,
            content_type=mime_type,
            size=len(payload)
        )
    except Exception as err:
        # Every failure, including the ValueError raised above, is reported
        # to the caller as a ValueError with a common prefix.
        logger.error(f"Failed to create AudioUploadDto: {err}")
        raise ValueError(f"Invalid audio file: {str(err)}")
def get_supported_configurations() -> dict:
    """
    Fetch the supported configuration options from the application service.

    Resolves AudioProcessingApplicationService from the global container and
    asks it for its supported configurations. Any exception raised along the
    way is logged with a traceback and a hard-coded fallback dictionary is
    returned instead, so those errors are not propagated to the caller.

    Returns:
        dict: The configurations reported by the service, or the fallback
            with the keys 'asr_models', 'voices', 'languages',
            'audio_formats', 'max_file_size_mb' and 'speed_range'.
    """
    try:
        logger.info("Getting global container...")
        di_container = get_global_container()

        logger.info("Resolving AudioProcessingApplicationService...")
        service = di_container.resolve(AudioProcessingApplicationService)

        logger.info("Getting supported configurations from service...")
        supported = service.get_supported_configurations()

        logger.info(f"Retrieved configurations: {supported}")
    except Exception as err:
        logger.error(f"Failed to get configurations: {err}", exc_info=True)
        # Defaults used when the service cannot be reached
        return dict(
            asr_models=['whisper-small', 'parakeet'],
            voices=['kokoro', 'dia', 'cosyvoice2', 'dummy'],
            languages=['en', 'zh', 'es', 'fr', 'de'],
            audio_formats=['wav', 'mp3'],
            max_file_size_mb=100,
            speed_range={'min': 0.5, 'max': 2.0},
        )
    else:
        return supported
def process_audio_pipeline(
    audio_file,
    asr_model: str,
    target_language: str,
    voice: str,
    speed: float,
    source_language: str = "en"
) -> Tuple[str, str, str, Optional[str], str]:
    """
    Execute the complete processing pipeline using application services.

    The uploaded file is wrapped in an AudioUploadDto, combined with the
    chosen options into a ProcessingRequestDto and handed to the
    AudioProcessingApplicationService resolved from the global container.
    This function does not raise: every exception is logged with a traceback
    and reported through the returned status and details strings.

    Args:
        audio_file: Gradio audio file input (a file path); a falsy value is
            reported as "no audio file provided"
        asr_model: ASR model to use
        target_language: Target language for translation
        voice: Voice for TTS
        speed: Speech speed
        source_language: Source language

    Returns:
        Tuple: (status_message, original_text, translated_text,
        audio_output_path, processing_details). audio_output_path is None
        when there is no audio output or when processing failed;
        processing_details is a JSON document on success and a plain error
        string on failure.
    """
    try:
        if not audio_file:
            return "❌ No audio file provided", "", "", None, ""

        logger.info(f"Starting processing for: {audio_file} using {asr_model} model")

        # Create audio upload DTO
        audio_upload = create_audio_upload_dto(audio_file)

        # Get application service from container
        container = get_global_container()
        audio_service = container.resolve(AudioProcessingApplicationService)

        # Create processing request
        request = ProcessingRequestDto(
            audio=audio_upload,
            asr_model=asr_model,
            target_language=target_language,
            voice=voice,
            speed=speed,
            source_language=source_language
        )

        # Process through application service
        result = audio_service.process_audio_pipeline(request)

        if not result.success:
            error_msg = f"❌ Processing Failed: {result.error_message}"
            logger.error(f"Processing failed: {result.error_message}")
            return error_msg, "", "", None, f"Error: {result.error_message}"

        status_message = f"✅ Processing Complete! ({result.processing_time:.2f}s)"
        logger.info(f"Processing completed successfully in {result.processing_time:.2f}s")

        # Prepare processing details; service metadata is merged last and may
        # therefore override the request-derived keys.
        details = {
            "processing_time": f"{result.processing_time:.2f}s",
            "asr_model": asr_model,
            "target_language": target_language,
            "voice": voice,
            "speed": speed
        }
        if result.metadata:
            details.update(result.metadata)

        # The details are display-only, so stringify values json cannot encode
        # instead of letting a TypeError turn a successful run into a failure.
        processing_details = json.dumps(details, indent=2, default=str)

        return (
            status_message,
            result.original_text or "",
            result.translated_text or "",
            result.audio_path if result.has_audio_output else None,
            processing_details
        )
    except Exception as e:
        logger.error(f"Processing failed: {str(e)}", exc_info=True)
        error_msg = f"❌ Processing Failed: {str(e)}"
        return error_msg, "", "", None, f"System Error: {str(e)}"
def create_interface():
    """
    Create and configure the Gradio interface using gr.Interface for better compatibility.

    Initializes the application container, reads the supported configurations
    and builds a gr.Interface with five inputs (audio file, ASR model, target
    language, voice, speed) and five outputs (status, recognized text,
    translated text, audio, JSON processing details).

    Returns:
        gr.Interface: The configured, not yet launched, interface.

    Raises:
        RuntimeError: Propagated from initialize_application() when the
            container cannot be initialized.
    """
    # Initialize application
    initialize_application()

    # Get supported configurations
    config = get_supported_configurations()

    # Language options mapping (display name -> language code)
    language_options = {
        "Chinese (Mandarin)": "zh",
        "Spanish": "es",
        "French": "fr",
        "German": "de",
        "English": "en"
    }

    # Pick the initial voice. "chatterbox" stays the preferred default, but a
    # dropdown's initial value has to be one of its choices, so fall back to
    # the first available voice (or no selection) when it is not offered.
    voice_choices = config['voices']
    # Choices may be plain values or (label, value) pairs; compare on the value.
    voice_values = [
        choice[1] if isinstance(choice, (tuple, list)) and len(choice) == 2 else choice
        for choice in voice_choices
    ]
    if "chatterbox" in voice_values:
        default_voice = "chatterbox"
    elif voice_values:
        default_voice = voice_values[0]
    else:
        default_voice = None

    def process_wrapper(audio_file, asr_model_val, target_lang_val, voice_val, speed_val):
        """Translate the UI selections into a process_audio_pipeline() call."""
        # Map display language to code; unknown labels fall back to "zh"
        target_lang_code = language_options.get(target_lang_val, "zh")
        return process_audio_pipeline(
            audio_file=audio_file,
            asr_model=asr_model_val,
            target_language=target_lang_code,
            voice=voice_val,
            speed=speed_val,
            # The UI offers no source-language control; it is fixed to English
            source_language="en"
        )

    # Create the interface using gr.Interface for better compatibility
    interface = gr.Interface(
        fn=process_wrapper,
        inputs=[
            gr.Audio(label="Upload Audio File", type="filepath"),
            gr.Dropdown(
                choices=config['asr_models'],
                value=config['asr_models'][0] if config['asr_models'] else "parakeet",
                label="Speech Recognition Model"
            ),
            gr.Dropdown(
                choices=list(language_options.keys()),
                value="Chinese (Mandarin)",
                label="Target Language"
            ),
            gr.Dropdown(
                choices=voice_choices,
                value=default_voice,
                label="Voice"
            ),
            gr.Slider(
                minimum=config['speed_range']['min'],
                maximum=config['speed_range']['max'],
                value=1.0,
                step=0.1,
                label="Speech Speed"
            )
        ],
        # Order matches the 5-tuple returned by process_audio_pipeline()
        outputs=[
            gr.Textbox(label="Status"),
            gr.Textbox(label="Recognition Results"),
            gr.Textbox(label="Translation Results"),
            gr.Audio(label="Audio Output"),
            gr.Code(label="Processing Details", language="json")
        ],
        title="🎧 High-Quality Audio Translation System",
        description="Upload English Audio → Get Chinese Speech Output",
        examples=[
            # Add example configurations if needed
        ]
    )

    return interface
def main():
    """
    Main application entry point.

    Builds the Gradio interface and launches it on 0.0.0.0:7860 without a
    public share link. Any exception raised while building or launching is
    logged with a traceback and then re-raised.
    """
    logger.info("Starting Gradio application")

    # Server settings passed to launch()
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=False,
        show_error=True,
        quiet=False,
    )

    try:
        app = create_interface()
        app.launch(**launch_options)
    except Exception as e:
        logger.error(f"Failed to start application: {str(e)}", exc_info=True)
        raise


if __name__ == "__main__":
    main()