"""SoundScape Studio: generate sound effects from text descriptions.

Uses AudioLDM when PyTorch/diffusers are installed, otherwise falls back to
procedurally generated placeholder audio so the app still runs in development.
"""

import gc
import logging
import os
import tempfile
from typing import List, Tuple

import gradio as gr
import numpy as np
import scipy.io.wavfile

# Provides MODEL_NAME, SAMPLE_RATE, DEFAULT_DURATION, DEFAULT_INFERENCE_STEPS,
# MAX_SOUNDS_PER_REQUEST, and AUDIO_ENHANCERS (see the sketch below).
from config import *


# Mock classes for development that always work (no PyTorch required)
class MockPipeline:
    def __init__(self, *args, **kwargs):
        pass

    def to(self, device):
        return self

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        # Classmethod so it can be called on the class, mirroring diffusers
        return cls()

    def __call__(self, prompt, **kwargs):
        # Generate plausible placeholder audio instead of a silent demo mode
        duration = kwargs.get('audio_length_in_s', 5.0)
        sample_rate = 16000
        t = np.linspace(0, duration, int(sample_rate * duration))

        # Shape the audio to loosely match the prompt
        if "thunder" in prompt.lower():
            audio = np.random.normal(0, 0.3, len(t)) * np.exp(-t / 2)
        elif "rain" in prompt.lower():
            audio = np.random.normal(0, 0.1, len(t))
        elif "fire" in prompt.lower():
            base = np.sin(2 * np.pi * 200 * t) * 0.2
            crackle = np.random.normal(0, 0.15, len(t))
            audio = base + crackle
        elif "ocean" in prompt.lower():
            wave = np.sin(2 * np.pi * 0.5 * t) * 0.3
            noise = np.random.normal(0, 0.1, len(t))
            audio = wave + noise
        else:
            # Complex multi-frequency tone with light noise
            freq1 = 220 + np.random.randint(-50, 50)
            freq2 = 440 + np.random.randint(-100, 100)
            audio = (np.sin(2 * np.pi * freq1 * t) * 0.2 +
                     np.sin(2 * np.pi * freq2 * t) * 0.1 +
                     np.random.normal(0, 0.05, len(t)))

        # Apply a short fade-in/fade-out envelope to avoid clicks
        fade_samples = int(0.1 * sample_rate)
        if len(audio) > 2 * fade_samples:
            audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
            audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)

        class MockResult:
            def __init__(self, audio):
                self.audios = [audio]

        return MockResult(audio)


class MockCuda:
    def is_available(self):
        return False

    def empty_cache(self):
        pass


class MockTorch:
    def __init__(self):
        self.cuda = MockCuda()
        self.float16 = 'float16'
        self.float32 = 'float32'


# Try to import PyTorch, falling back to the mocks if it is not available
try:
    import torch
    from diffusers import AudioLDMPipeline
    PYTORCH_AVAILABLE = True
    print("✅ PyTorch and diffusers loaded successfully!")
except ImportError:
    print("📦 Using fallback audio generation (PyTorch not available)")
    torch = MockTorch()
    AudioLDMPipeline = MockPipeline
    PYTORCH_AVAILABLE = False

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
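# The wildcard import above expects a local `config.py` defining the constants
# used throughout this module. A minimal sketch of that module, with
# illustrative values only (the real config is not part of this file):
#
#     # config.py (hypothetical)
#     MODEL_NAME = "cvssp/audioldm-s-full-v2"   # assumed checkpoint id
#     SAMPLE_RATE = 16000
#     DEFAULT_DURATION = 5.0
#     DEFAULT_INFERENCE_STEPS = 10
#     MAX_SOUNDS_PER_REQUEST = 5
#     AUDIO_ENHANCERS = {                        # prompt keyword -> richer phrase
#         "rain": "gentle rain falling on leaves",
#         "thunder": "deep rolling thunder",
#     }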
class SoundScapeGenerator:
    def __init__(self):
        """Initialize the AudioLDM pipeline."""
        self.device = (
            "cuda"
            if hasattr(torch, 'cuda')
            and hasattr(torch.cuda, 'is_available')
            and torch.cuda.is_available()
            else "cpu"
        )
        logger.info(f"Using device: {self.device}")

        # Load the model
        logger.info("Loading AudioLDM model...")
        try:
            if PYTORCH_AVAILABLE:
                self.pipe = AudioLDMPipeline.from_pretrained(
                    MODEL_NAME,
                    torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                ).to(self.device)
                logger.info("✅ Real AudioLDM model loaded successfully!")
            else:
                self.pipe = MockPipeline()
                logger.info("✅ Mock audio generator loaded successfully!")
        except Exception as e:
            logger.warning(f"Using fallback audio generation: {e}")
            self.pipe = MockPipeline()

        # Audio settings from config
        self.sample_rate = SAMPLE_RATE
        self.default_duration = DEFAULT_DURATION
        self.inference_steps = DEFAULT_INFERENCE_STEPS

        # Create a temp directory for generated audio files
        self.temp_dir = tempfile.mkdtemp()
        logger.info(f"Temp directory: {self.temp_dir}")
        logger.info("🎵 SoundScape Generator initialized successfully!")

    def enhance_prompt(self, description: str) -> str:
        """Enhance the description for better audio generation."""
        enhanced = description.lower()
        for key, enhancement in AUDIO_ENHANCERS.items():
            if key in enhanced:
                enhanced = enhanced.replace(key, enhancement)
        # Add a general audio-quality hint
        if "sound" not in enhanced:
            enhanced += " sound effect"
        return enhanced

    def generate_audio(self, description: str, duration: float = None,
                       steps: int = None) -> Tuple[str, str]:
        """Generate an audio file from a description."""
        try:
            # Use the provided parameters or fall back to the defaults
            audio_duration = duration if duration is not None else self.default_duration
            inference_steps = steps if steps is not None else self.inference_steps

            # Enhance the prompt
            enhanced_prompt = self.enhance_prompt(description)
            logger.info(f"Generating audio for: {enhanced_prompt}")

            # Generate audio
            audio = self.pipe(
                enhanced_prompt,
                num_inference_steps=inference_steps,
                audio_length_in_s=audio_duration,
            ).audios[0]

            # Build a filesystem-safe filename from the description
            safe_filename = "".join(
                c for c in description if c.isalnum() or c in (' ', '-', '_')
            ).rstrip()
            filename = f"{safe_filename.replace(' ', '_')[:30]}.wav"
            filepath = os.path.join(self.temp_dir, filename)

            # Clip to [-1, 1], then convert to int16 for the WAV file
            audio_int16 = np.int16(np.clip(audio, -1.0, 1.0) * 32767)
            scipy.io.wavfile.write(filepath, self.sample_rate, audio_int16)

            logger.info(f"Audio saved to: {filepath}")
            return filepath, description

        except Exception as e:
            logger.error(f"Error generating audio: {str(e)}")
            raise gr.Error(f"Failed to generate audio: {str(e)}")

    def generate_multiple(self, descriptions: str, duration: float,
                          steps: int) -> List[Tuple[str, str]]:
        """Generate multiple audio files from comma-separated descriptions."""
        # Parse the descriptions
        desc_list = [d.strip() for d in descriptions.split(",") if d.strip()]

        if not desc_list:
            raise gr.Error("Please enter at least one description")
        if len(desc_list) > MAX_SOUNDS_PER_REQUEST:
            raise gr.Error(
                f"Maximum {MAX_SOUNDS_PER_REQUEST} sounds at once to prevent timeouts"
            )

        results = []
        for i, desc in enumerate(desc_list):
            logger.info(f"Generating {i + 1}/{len(desc_list)}: {desc}")
            filepath, label = self.generate_audio(desc, duration, steps)
            results.append((filepath, label))

            # Release GPU memory between generations when running on CUDA
            if self.device == "cuda" and hasattr(torch, 'cuda') and hasattr(torch.cuda, 'empty_cache'):
                torch.cuda.empty_cache()
                gc.collect()

        return results
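# Usage sketch (illustrative; assumes the config values sketched above).
# enhance_prompt rewrites keywords and appends a quality hint, so with
# AUDIO_ENHANCERS = {"rain": "gentle rain falling on leaves"}, the input
# "Rain at night" becomes "gentle rain falling on leaves at night sound effect":
#
#     gen = SoundScapeGenerator()
#     for path, label in gen.generate_multiple("rain, distant thunder",
#                                              duration=5.0, steps=10):
#         print(f"{label}: {path}")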
# Initialize the generator, degrading gracefully if anything fails
generator = None
try:
    generator = SoundScapeGenerator()
    logger.info("🚀 Generator successfully initialized!")
except Exception as e:
    logger.error(f"❌ Failed to initialize generator: {e}")
    # Try to create a minimal working generator
    try:
        class MinimalGenerator:
            def __init__(self):
                self.temp_dir = tempfile.mkdtemp()
                self.sample_rate = 16000
                self.pipe = MockPipeline()

            def enhance_prompt(self, description):
                return description

            def generate_audio(self, description, duration=5.0, steps=10):
                # Simple sine-tone audio generation
                t = np.linspace(0, duration, int(self.sample_rate * duration))
                audio = np.sin(2 * np.pi * 440 * t) * 0.3

                # Apply a fade-in/fade-out envelope
                fade_samples = int(0.1 * self.sample_rate)
                if len(audio) > 2 * fade_samples:
                    audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
                    audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)

                safe_filename = "".join(
                    c for c in description if c.isalnum() or c in (' ', '-', '_')
                ).rstrip()
                filename = f"{safe_filename.replace(' ', '_')[:30]}.wav"
                filepath = os.path.join(self.temp_dir, filename)

                audio_int16 = np.int16(np.clip(audio, -1.0, 1.0) * 32767)
                scipy.io.wavfile.write(filepath, self.sample_rate, audio_int16)
                return filepath, description

            def generate_multiple(self, descriptions, duration, steps):
                desc_list = [d.strip() for d in descriptions.split(",") if d.strip()]
                results = []
                for desc in desc_list:
                    filepath, label = self.generate_audio(desc, duration, steps)
                    results.append((filepath, label))
                return results

        generator = MinimalGenerator()
        logger.info("🔧 Minimal generator fallback created successfully!")
    except Exception as e2:
        logger.error(f"❌ Even the minimal generator failed: {e2}")
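# Note on the int16 conversion used by both generators: the pipelines return
# float audio in [-1, 1], so samples are clipped before scaling to the 16-bit
# range to avoid integer wrap-around on out-of-range values. Illustrative check:
#
#     >>> np.int16(np.clip(np.array([1.2, -0.5]), -1.0, 1.0) * 32767)
#     array([ 32767, -16383], dtype=int16)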
# Cinematic CSS styling
CINEMATIC_CSS = """
/* Hans Zimmer inspired dark cinematic theme */
.gradio-container {
    background: linear-gradient(135deg, #0a0a0a 0%, #1a1a2e 50%, #16213e 100%) !important;
    font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
    color: #e0e6ed !important;
    max-width: 1400px !important;
    margin: 0 auto !important;
    padding: 0 !important;
    min-height: 100vh !important;
}

.dark {
    background: linear-gradient(135deg, #0a0a0a 0%, #1a1a2e 50%, #16213e 100%) !important;
}

/* Main title styling */
.main-title {
    background: linear-gradient(45deg, #ffd700, #ff6b35, #f7931e, #ffd700);
    background-size: 400% 400%;
    animation: gradientShift 4s ease-in-out infinite;
    -webkit-background-clip: text;
    background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 4rem !important;
    font-weight: 800 !important;
    text-align: center !important;
    margin: 2rem 0 !important;
    text-shadow: 0 0 30px rgba(255, 215, 0, 0.3);
    letter-spacing: -0.02em;
}

@keyframes gradientShift {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}

/* Subtitle */
.main-subtitle {
    color: #a0a8b0 !important;
    font-size: 1.3rem !important;
    text-align: center !important;
    margin-bottom: 3rem !important;
    font-weight: 300 !important;
    letter-spacing: 0.05em;
}

/* Card styling */
.input-card, .output-card {
    background: rgba(255, 255, 255, 0.03) !important;
    backdrop-filter: blur(20px) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
    border-radius: 20px !important;
    padding: 2rem !important;
    margin: 1rem !important;
    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3) !important;
}

/* Input elements */
.gr-textbox textarea {
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.2) !important;
    border-radius: 12px !important;
    color: #e0e6ed !important;
    padding: 1rem !important;
    font-size: 1rem !important;
    transition: all 0.3s ease !important;
}

.gr-textbox textarea:focus {
    border-color: #ffd700 !important;
    box-shadow: 0 0 20px rgba(255, 215, 0, 0.2) !important;
}

/* Button styling */
.generate-btn {
    background: linear-gradient(45deg, #ff6b35, #f7931e) !important;
    border: none !important;
    border-radius: 12px !important;
    padding: 1rem 2rem !important;
    font-size: 1.1rem !important;
    font-weight: 600 !important;
    color: white !important;
    transition: all 0.3s ease !important;
    box-shadow: 0 4px 20px rgba(255, 107, 53, 0.3) !important;
    text-transform: uppercase !important;
    letter-spacing: 0.1em !important;
}

.generate-btn:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 8px 30px rgba(255, 107, 53, 0.5) !important;
}

/* Demo mode notification */
.demo-banner {
    background: linear-gradient(45deg, #ff6b35, #ffd700) !important;
    color: #000 !important;
    padding: 1rem !important;
    border-radius: 10px !important;
    text-align: center !important;
    font-weight: 600 !important;
    margin-bottom: 2rem !important;
    animation: pulse 2s infinite !important;
}

@keyframes pulse {
    0%, 100% { opacity: 1; }
    50% { opacity: 0.8; }
}

/* Preset buttons */
.gr-radio label {
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
    border-radius: 10px !important;
    margin: 0.3rem 0 !important;
    padding: 0.8rem !important;
    color: #e0e6ed !important;
    transition: all 0.3s ease !important;
    display: block !important;
}

.gr-radio label:hover {
    background: rgba(255, 215, 0, 0.1) !important;
    border-color: rgba(255, 215, 0, 0.3) !important;
}

.gr-radio input:checked + label {
    background: rgba(255, 215, 0, 0.2) !important;
    border-color: #ffd700 !important;
}

/* Sliders */
.gr-slider input[type="range"] {
    background: rgba(255, 255, 255, 0.1) !important;
    height: 6px !important;
    border-radius: 3px !important;
}

.gr-slider input[type="range"]::-webkit-slider-thumb {
    background: #ffd700 !important;
    border: none !important;
    border-radius: 50% !important;
    width: 18px !important;
    height: 18px !important;
    box-shadow: 0 0 10px rgba(255, 215, 0, 0.5) !important;
}

/* Audio player */
.gr-audio {
    background: rgba(255, 255, 255, 0.05) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* File gallery */
.gr-file {
    background: rgba(255, 255, 255, 0.05) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* Section headers */
.section-header {
    color: #ffd700 !important;
    font-size: 1.4rem !important;
    font-weight: 600 !important;
    margin: 1.5rem 0 1rem 0 !important;
    text-transform: uppercase !important;
    letter-spacing: 0.1em !important;
}

/* Examples */
.gr-examples {
    background: rgba(255, 255, 255, 0.02) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.05) !important;
    padding: 1rem !important;
}

/* Accordion */
.gr-accordion {
    background: rgba(255, 255, 255, 0.03) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* Status text */
.status-text {
    font-size: 1.1rem !important;
    padding: 1rem !important;
    border-radius: 10px !important;
    text-align: center !important;
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* Responsive */
@media (max-width: 768px) {
    .main-title { font-size: 2.5rem !important; }
    .input-card, .output-card {
        margin: 0.5rem !important;
        padding: 1rem !important;
    }
}
"""
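# How the styles hook in (illustrative): components in the interface below
# attach these rules via `elem_classes`, e.g.
#
#     gr.Button("Generate", elem_classes=["generate-btn"])
#
# picks up the .generate-btn rules defined above.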
""") with gr.Row(): with gr.Column(scale=1, elem_classes=["input-card"]): # Input section gr.HTML('Powered by AudioLDM ⢠Built for creators, filmmakers, and audio enthusiasts