# Hugging Face Space: I-1shaan/SoundScape (status: Sleeping) · File size: 23,895 Bytes · revision 055b693

import gradio as gr
import numpy as np
import scipy.io.wavfile
import tempfile
import os
import logging
from typing import List, Tuple
import gc
from config import *

# Create mock classes for development that always work
class MockPipeline:
    """Dependency-free stand-in for ``AudioLDMPipeline``.

    Mirrors the small part of the pipeline API this file uses
    (``from_pretrained``, ``to`` and calling the instance with a prompt) and
    synthesises noise/tone based audio with NumPy instead of running a model.
    """

    class MockResult:
        """Minimal result object exposing the ``audios`` list callers index."""

        def __init__(self, audio):
            self.audios = [audio]

    def __init__(self, *args, **kwargs):
        """Accept and ignore any constructor arguments."""

    def to(self, device):
        """Return ``self``; the mock has nothing to move between devices."""
        return self

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """Return a fresh mock pipeline, ignoring all arguments.

        Declared as a classmethod so it can be called on the class itself
        (the way the real pipeline factory is used) as well as on an instance.
        """
        return cls()

    def __call__(self, prompt, **kwargs):
        """Synthesise placeholder audio for *prompt*.

        Args:
            prompt: Text description; a few keywords ("thunder", "rain",
                "fire", "ocean") select a different synthesis recipe.
            **kwargs: Only ``audio_length_in_s`` (default 5.0) is read; any
                other pipeline argument is ignored.

        Returns:
            A ``MockResult`` whose ``audios[0]`` is a 1-D float array sampled
            at 16 kHz with every value inside [-1.0, 1.0].
        """
        duration = kwargs.get('audio_length_in_s', 5.0)
        sample_rate = 16000
        t = np.linspace(0, duration, int(sample_rate * duration))

        # Lower-case once instead of once per keyword test.
        text = prompt.lower()
        if "thunder" in text:
            # Decaying burst of noise.
            audio = np.random.normal(0, 0.3, len(t)) * np.exp(-t/2)
        elif "rain" in text:
            # Steady low-level noise.
            audio = np.random.normal(0, 0.1, len(t))
        elif "fire" in text:
            # 200 Hz tone with noise layered on top.
            base = np.sin(2 * np.pi * 200 * t) * 0.2
            crackle = np.random.normal(0, 0.15, len(t))
            audio = base + crackle
        elif "ocean" in text:
            # Slow 0.5 Hz swell plus noise.
            wave = np.sin(2 * np.pi * 0.5 * t) * 0.3
            noise = np.random.normal(0, 0.1, len(t))
            audio = wave + noise
        else:
            # Two randomly detuned sine tones plus a little noise.
            freq1 = 220 + np.random.randint(-50, 50)
            freq2 = 440 + np.random.randint(-100, 100)
            audio = (np.sin(2 * np.pi * freq1 * t) * 0.2 +
                     np.sin(2 * np.pi * freq2 * t) * 0.1 +
                     np.random.normal(0, 0.05, len(t)))

        # Apply a 0.1 s linear fade-in/fade-out when the clip is long enough.
        fade_samples = int(0.1 * sample_rate)
        if len(audio) > 2 * fade_samples:
            audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
            audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)

        # Gaussian noise is unbounded; keep samples in [-1, 1] so a later
        # scale-to-int16 conversion cannot wrap around.
        audio = np.clip(audio, -1.0, 1.0)

        return self.MockResult(audio)

class MockTorch:
    """Placeholder for the ``torch`` module exposing only ``cuda`` and two dtype names."""

    def __init__(self):
        # The dtype attributes are plain strings standing in for torch dtypes.
        self.float16, self.float32 = 'float16', 'float32'
        self.cuda = MockCuda()

class MockCuda:
    """Placeholder for ``torch.cuda`` that always reports no GPU."""

    def is_available(self):
        """Report that CUDA is not available."""
        return False

    def empty_cache(self):
        """Do nothing; there is no cache to release."""
        return None

# Prefer the real PyTorch/diffusers stack. When either import fails, bind the
# same module-level names (torch, AudioLDMPipeline) to the mock stand-ins so
# the rest of the file can run unchanged.
try:
    import torch
    from diffusers import AudioLDMPipeline
except ImportError:
    print("📦 Using fallback audio generation (PyTorch not available)")
    torch, AudioLDMPipeline = MockTorch(), MockPipeline
    PYTORCH_AVAILABLE = False
else:
    # Both imports succeeded.
    PYTORCH_AVAILABLE = True
    print("✅ PyTorch and diffusers loaded successfully!")

# Setup logging
# Configure the root logger at INFO level and create the module-level logger
# used by the generator classes and the UI callbacks below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class SoundScapeGenerator:
    """Turn text descriptions into WAV files using AudioLDM or the mock fallback.

    The pipeline is loaded once in ``__init__``. Each generated clip is written
    to a per-instance temporary directory and returned as a file path.
    """

    def __init__(self):
        """Pick a device, load the pipeline and create the output directory.

        If loading the real pipeline raises, the mock pipeline is used instead
        so construction still succeeds.
        """
        self.device = (
            "cuda"
            if hasattr(torch, 'cuda')
            and hasattr(torch.cuda, 'is_available')
            and torch.cuda.is_available()
            else "cpu"
        )
        logger.info("Using device: %s", self.device)

        # Load the model
        logger.info("Loading AudioLDM model...")
        try:
            if PYTORCH_AVAILABLE:
                self.pipe = AudioLDMPipeline.from_pretrained(
                    MODEL_NAME,
                    # Half precision only on GPU.
                    torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                ).to(self.device)
                logger.info("✅ Real AudioLDM model loaded successfully!")
            else:
                self.pipe = MockPipeline()
                logger.info("✅ Mock audio generator loaded successfully!")
        except Exception as e:
            logger.warning("Using fallback audio generation: %s", e)
            self.pipe = MockPipeline()

        # Audio settings from config
        self.sample_rate = SAMPLE_RATE
        self.default_duration = DEFAULT_DURATION
        self.inference_steps = DEFAULT_INFERENCE_STEPS

        # Create temp directory for audio files
        self.temp_dir = tempfile.mkdtemp()
        logger.info("Temp directory: %s", self.temp_dir)
        logger.info("🎵 SoundScape Generator initialized successfully!")

    def enhance_prompt(self, description: str) -> str:
        """Return a lower-cased prompt with configured keyword enhancements applied.

        Every key of ``AUDIO_ENHANCERS`` found in the text is replaced by its
        enhancement, and " sound effect" is appended when the result does not
        already contain the word "sound".
        """
        enhanced = description.lower()
        for key, enhancement in AUDIO_ENHANCERS.items():
            if key in enhanced:
                enhanced = enhanced.replace(key, enhancement)

        # Add general audio quality enhancers
        if "sound" not in enhanced:
            enhanced += " sound effect"

        return enhanced

    @staticmethod
    def _safe_stem(description: str, max_length: int = 30) -> str:
        """Build a filesystem-safe file stem from *description*.

        Keeps alphanumerics, spaces, '-' and '_', turns spaces into
        underscores and truncates to *max_length*. Falls back to "sound" when
        nothing usable remains (e.g. a punctuation-only description).
        """
        kept = "".join(c for c in description if c.isalnum() or c in (' ', '-', '_'))
        stem = kept.strip().replace(' ', '_')[:max_length]
        return stem or "sound"

    @staticmethod
    def _to_int16(audio) -> np.ndarray:
        """Convert float samples to 16-bit PCM.

        NaN becomes 0 and values are clipped to [-1, 1] before scaling;
        without this, out-of-range samples wrap around when cast to int16.
        """
        samples = np.nan_to_num(np.asarray(audio, dtype=np.float64))
        return (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

    def generate_audio(self, description: str, duration: float = None, steps: int = None) -> Tuple[str, str]:
        """Generate one WAV file for *description*.

        Args:
            description: Text describing the sound.
            duration: Clip length in seconds; ``None`` uses the configured default.
            steps: Inference step count; ``None`` uses the configured default.

        Returns:
            ``(filepath, description)`` where *filepath* is the written WAV file.

        Raises:
            gr.Error: If anything fails during generation or writing.
        """
        try:
            # Use provided parameters or defaults
            audio_duration = duration if duration is not None else self.default_duration
            # UI sliders may deliver a float; the step count must be an int.
            inference_steps = int(steps if steps is not None else self.inference_steps)

            # Enhance the prompt
            enhanced_prompt = self.enhance_prompt(description)
            logger.info("Generating audio for: %s", enhanced_prompt)

            # Generate audio
            audio = self.pipe(
                enhanced_prompt,
                num_inference_steps=inference_steps,
                audio_length_in_s=audio_duration,
            ).audios[0]

            # Reserve a unique file so identical (or identically truncated)
            # descriptions never overwrite a file produced by another request.
            fd, filepath = tempfile.mkstemp(
                prefix=f"{self._safe_stem(description)}_",
                suffix=".wav",
                dir=self.temp_dir,
            )
            os.close(fd)  # scipy reopens the path itself

            scipy.io.wavfile.write(filepath, self.sample_rate, self._to_int16(audio))

            logger.info("Audio saved to: %s", filepath)
            return filepath, description

        except Exception as e:
            # Keep the traceback in the log and the cause on the raised error.
            logger.exception("Error generating audio: %s", e)
            raise gr.Error(f"Failed to generate audio: {str(e)}") from e

    def generate_multiple(self, descriptions: str, duration: float, steps: int) -> List[Tuple[str, str]]:
        """Generate one audio file per comma-separated description.

        Args:
            descriptions: Comma-separated descriptions; blank entries are ignored.
            duration: Clip length in seconds passed to ``generate_audio``.
            steps: Inference step count passed to ``generate_audio``.

        Returns:
            A list of ``(filepath, description)`` tuples in input order.

        Raises:
            gr.Error: If no description is given, more than
                ``MAX_SOUNDS_PER_REQUEST`` are given, or generation fails.
        """
        # Parse descriptions
        desc_list = [d.strip() for d in descriptions.split(",") if d.strip()]

        if not desc_list:
            raise gr.Error("Please enter at least one description")

        if len(desc_list) > MAX_SOUNDS_PER_REQUEST:
            raise gr.Error(f"Maximum {MAX_SOUNDS_PER_REQUEST} sounds at once to prevent timeout")

        results = []
        for i, desc in enumerate(desc_list, start=1):
            logger.info("Generating %d/%d: %s", i, len(desc_list), desc)
            results.append(self.generate_audio(desc, duration, steps))

            # Clean up GPU memory if using CUDA
            if self.device == "cuda" and hasattr(torch, 'cuda') and hasattr(torch.cuda, 'empty_cache'):
                # Collect Python garbage first so the tensors it frees can be
                # handed back by empty_cache().
                gc.collect()
                torch.cuda.empty_cache()

        return results

# Build the module-level generator. If the full generator cannot be
# constructed, fall back to a tiny sine-tone generator with the same methods;
# if that fails too, `generator` stays None.
generator = None
try:
    generator = SoundScapeGenerator()
    logger.info("🎉 Generator successfully initialized!")
except Exception as e:
    logger.error(f"❌ Failed to initialize generator: {e}")
    # Try to create a minimal working generator
    try:
        class MinimalGenerator:
            """Last-resort generator that writes a faded 440 Hz tone for any description."""

            def __init__(self):
                self.pipe = MockPipeline()
                self.sample_rate = 16000
                self.temp_dir = tempfile.mkdtemp()

            def enhance_prompt(self, description):
                """Return the description unchanged."""
                return description

            def generate_audio(self, description, duration=5.0, steps=10):
                """Write a sine tone of *duration* seconds and return (path, description)."""
                rate = self.sample_rate

                # Simple audio generation
                timeline = np.linspace(0, duration, int(rate * duration))
                tone = np.sin(2 * np.pi * 440 * timeline) * 0.3

                # Linear fade-in/out over 0.1 s when the clip is long enough
                ramp_len = int(0.1 * rate)
                if len(tone) > 2 * ramp_len:
                    tone[:ramp_len] *= np.linspace(0, 1, ramp_len)
                    tone[-ramp_len:] *= np.linspace(1, 0, ramp_len)

                # File name: keep alphanumerics, spaces, '-' and '_' only
                kept_chars = [ch for ch in description if ch.isalnum() or ch in (' ', '-', '_')]
                stem = "".join(kept_chars).rstrip().replace(' ', '_')[:30]
                wav_path = os.path.join(self.temp_dir, f"{stem}.wav")

                pcm = np.int16(tone * 32767)
                scipy.io.wavfile.write(wav_path, rate, pcm)

                return wav_path, description

            def generate_multiple(self, descriptions, duration, steps):
                """Generate one tone file per non-blank comma-separated description."""
                cleaned = (part.strip() for part in descriptions.split(","))
                return [
                    self.generate_audio(item, duration, steps)
                    for item in cleaned
                    if item
                ]

        generator = MinimalGenerator()
        logger.info("🔧 Minimal generator fallback created successfully!")
    except Exception as e2:
        logger.error(f"❌ Even minimal generator failed: {e2}")

# Cinematic CSS styling
# Custom stylesheet handed to gr.Blocks(css=...) in create_interface().
# The .main-title, .main-subtitle, .section-header, .input-card, .output-card,
# .generate-btn and .status-text rules match classes this file attaches through
# gr.HTML markup or elem_classes. .demo-banner is not referenced anywhere else
# in the visible code.
# NOTE(review): the .gr-* selectors presumably target Gradio's own generated
# markup — confirm they still match the installed Gradio version.
CINEMATIC_CSS = """
/* Hans Zimmer inspired dark cinematic theme */
.gradio-container {
    background: linear-gradient(135deg, #0a0a0a 0%, #1a1a2e 50%, #16213e 100%) !important;
    font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
    color: #e0e6ed !important;
    max-width: 1400px !important;
    margin: 0 auto !important;
    padding: 0 !important;
    min-height: 100vh !important;
}

.dark {
    background: linear-gradient(135deg, #0a0a0a 0%, #1a1a2e 50%, #16213e 100%) !important;
}

/* Main title styling */
.main-title {
    background: linear-gradient(45deg, #ffd700, #ff6b35, #f7931e, #ffd700);
    background-size: 400% 400%;
    animation: gradientShift 4s ease-in-out infinite;
    -webkit-background-clip: text;
    background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 4rem !important;
    font-weight: 800 !important;
    text-align: center !important;
    margin: 2rem 0 !important;
    text-shadow: 0 0 30px rgba(255, 215, 0, 0.3);
    letter-spacing: -0.02em;
}

@keyframes gradientShift {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}

/* Subtitle */
.main-subtitle {
    color: #a0a8b0 !important;
    font-size: 1.3rem !important;
    text-align: center !important;
    margin-bottom: 3rem !important;
    font-weight: 300 !important;
    letter-spacing: 0.05em;
}

/* Card styling */
.input-card, .output-card {
    background: rgba(255, 255, 255, 0.03) !important;
    backdrop-filter: blur(20px) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
    border-radius: 20px !important;
    padding: 2rem !important;
    margin: 1rem !important;
    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3) !important;
}

/* Input elements */
.gr-textbox textarea {
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.2) !important;
    border-radius: 12px !important;
    color: #e0e6ed !important;
    padding: 1rem !important;
    font-size: 1rem !important;
    transition: all 0.3s ease !important;
}

.gr-textbox textarea:focus {
    border-color: #ffd700 !important;
    box-shadow: 0 0 20px rgba(255, 215, 0, 0.2) !important;
}

/* Button styling */
.generate-btn {
    background: linear-gradient(45deg, #ff6b35, #f7931e) !important;
    border: none !important;
    border-radius: 12px !important;
    padding: 1rem 2rem !important;
    font-size: 1.1rem !important;
    font-weight: 600 !important;
    color: white !important;
    transition: all 0.3s ease !important;
    box-shadow: 0 4px 20px rgba(255, 107, 53, 0.3) !important;
    text-transform: uppercase !important;
    letter-spacing: 0.1em !important;
}

.generate-btn:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 8px 30px rgba(255, 107, 53, 0.5) !important;
}

/* Demo mode notification */
.demo-banner {
    background: linear-gradient(45deg, #ff6b35, #ffd700) !important;
    color: #000 !important;
    padding: 1rem !important;
    border-radius: 10px !important;
    text-align: center !important;
    font-weight: 600 !important;
    margin-bottom: 2rem !important;
    animation: pulse 2s infinite !important;
}

@keyframes pulse {
    0%, 100% { opacity: 1; }
    50% { opacity: 0.8; }
}

/* Preset buttons */
.gr-radio label {
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
    border-radius: 10px !important;
    margin: 0.3rem 0 !important;
    padding: 0.8rem !important;
    color: #e0e6ed !important;
    transition: all 0.3s ease !important;
    display: block !important;
}

.gr-radio label:hover {
    background: rgba(255, 215, 0, 0.1) !important;
    border-color: rgba(255, 215, 0, 0.3) !important;
}

.gr-radio input:checked + label {
    background: rgba(255, 215, 0, 0.2) !important;
    border-color: #ffd700 !important;
}

/* Sliders */
.gr-slider input[type="range"] {
    background: rgba(255, 255, 255, 0.1) !important;
    height: 6px !important;
    border-radius: 3px !important;
}

.gr-slider input[type="range"]::-webkit-slider-thumb {
    background: #ffd700 !important;
    border: none !important;
    border-radius: 50% !important;
    width: 18px !important;
    height: 18px !important;
    box-shadow: 0 0 10px rgba(255, 215, 0, 0.5) !important;
}

/* Audio player */
.gr-audio {
    background: rgba(255, 255, 255, 0.05) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* File gallery */
.gr-file {
    background: rgba(255, 255, 255, 0.05) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* Section headers */
.section-header {
    color: #ffd700 !important;
    font-size: 1.4rem !important;
    font-weight: 600 !important;
    margin: 1.5rem 0 1rem 0 !important;
    text-transform: uppercase !important;
    letter-spacing: 0.1em !important;
}

/* Examples */
.gr-examples {
    background: rgba(255, 255, 255, 0.02) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.05) !important;
    padding: 1rem !important;
}

/* Accordion */
.gr-accordion {
    background: rgba(255, 255, 255, 0.03) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* Status text */
.status-text {
    font-size: 1.1rem !important;
    padding: 1rem !important;
    border-radius: 10px !important;
    text-align: center !important;
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* Responsive */
@media (max-width: 768px) {
    .main-title {
        font-size: 2.5rem !important;
    }
    
    .input-card, .output-card {
        margin: 0.5rem !important;
        padding: 1rem !important;
    }
}
"""

# Create Gradio interface
def create_interface():
    """Build and return the Gradio Blocks app for SoundScape Studio.

    The layout has an input column (description box, preset radio, advanced
    sliders, generate button), an output column (file list, audio preview,
    status line), three example groups and a footer. Event handlers are wired
    to the module-level ``generator``.

    Returns:
        The assembled ``gr.Blocks`` instance; the caller launches it.
    """
    with gr.Blocks(
        title="SoundScape Studio",
        theme=gr.themes.Base(),
        css=CINEMATIC_CSS
    ) as demo:

        # Header with cinematic styling (plain string: nothing is interpolated)
        gr.HTML("""
        <div style="position: relative; overflow: hidden;">
            <div style="text-align: center; padding: 3rem 0; position: relative; z-index: 1;">
                <h1 class="main-title">SOUNDSCAPE STUDIO</h1>
                <p class="main-subtitle">AI Sound Design • Powered by AudioLDM</p>
                <div style="width: 100px; height: 2px; background: linear-gradient(45deg, #ffd700, #ff6b35); margin: 0 auto; border-radius: 1px;"></div>
            </div>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=1, elem_classes=["input-card"]):
                # Input section
                gr.HTML('<h3 class="section-header">🎬 Describe Your Scene</h3>')
                text_input = gr.Textbox(
                    label="",
                    placeholder="Describe the sounds you want to create...\n\nExamples:\n• Epic thunderstorm with heavy rain and lightning\n• Mysterious forest at night with owls and wind\n• Intense battle scene with explosions and chaos\n• Peaceful ocean waves on a moonlit beach",
                    lines=6,
                    max_lines=8,
                    elem_classes=["cinematic-input"]
                )

                # Presets
                gr.HTML('<h3 class="section-header">🎭 Cinematic Presets</h3>')
                preset_buttons = gr.Radio(
                    choices=list(PRESET_SCENES.keys()),
                    label="",
                    value=None,
                    elem_classes=["preset-radio"]
                )

                # Advanced settings
                with gr.Accordion("⚙️ Advanced Controls", open=False):
                    duration_slider = gr.Slider(
                        minimum=MIN_DURATION,
                        maximum=MAX_DURATION,
                        value=DEFAULT_DURATION,
                        step=1,
                        label="Duration (seconds)",
                        info="Length of each audio sequence"
                    )

                    quality_slider = gr.Slider(
                        minimum=MIN_QUALITY_STEPS,
                        maximum=MAX_QUALITY_STEPS,
                        value=DEFAULT_INFERENCE_STEPS,
                        step=5,
                        label="Quality Steps",
                        info="Higher values = better quality, longer generation time"
                    )

                generate_btn = gr.Button(
                    "🎵 CREATE SOUNDSCAPE",
                    variant="primary",
                    size="lg",
                    elem_classes=["generate-btn"]
                )

            with gr.Column(scale=1, elem_classes=["output-card"]):
                # Output section
                gr.HTML('<h3 class="section-header">📂 Generated Audio</h3>')
                output_gallery = gr.File(
                    label="",
                    file_count="multiple",
                    type="filepath",
                    interactive=False,
                    elem_classes=["output-files"]
                )

                # Audio player for preview
                gr.HTML('<h3 class="section-header">🔊 Audio Preview</h3>')
                audio_preview = gr.Audio(
                    label="",
                    type="filepath",
                    interactive=False,
                    elem_classes=["audio-player"]
                )

                # Status
                status_text = gr.Markdown(
                    "*Ready to create your soundscape...*",
                    elem_classes=["status-text"]
                )

        # Examples section with cinematic flair
        gr.HTML('<div style="margin-top: 3rem;"><h3 class="section-header">💡 Inspiration Gallery</h3></div>')

        with gr.Row():
            with gr.Column():
                gr.Examples(
                    examples=[
                        ["Epic thunderstorm with lightning strikes"],
                        ["Mysterious forest with owl calls and rustling leaves"],
                        ["Intense battlefield with explosions and gunfire"],
                    ],
                    inputs=text_input,
                    label="🎬 Cinematic"
                )
            with gr.Column():
                gr.Examples(
                    examples=[
                        ["Peaceful ocean waves on a quiet beach"],
                        ["Cozy fireplace with crackling wood"],
                        ["Gentle rain on a window with distant thunder"],
                    ],
                    inputs=text_input,
                    label="🌅 Ambient"
                )
            with gr.Column():
                gr.Examples(
                    examples=[
                        ["Busy city street with traffic and sirens"],
                        ["Industrial factory with machinery sounds"],
                        ["Haunted house with creaking doors and chains"],
                    ],
                    inputs=text_input,
                    label="🏙️ Urban/Horror"
                )

        # Footer
        gr.HTML("""
        <div style="text-align: center; margin-top: 3rem; padding: 2rem; border-top: 1px solid rgba(255,255,255,0.1);">
            <p style="color: #666; font-size: 0.9rem;">
                Powered by AudioLDM • Built for creators, filmmakers, and audio enthusiasts
            </p>
        </div>
        """)

        # Event handlers
        def load_preset(preset):
            """Return the description text for the chosen preset, or "" if unknown."""
            if preset and preset in PRESET_SCENES:
                return PRESET_SCENES[preset]
            return ""

        def generate_sounds(descriptions, duration, quality):
            """Generate audio and return (file paths, preview path, status markdown).

            Failures are reported through the status text instead of being
            raised, so the UI always receives three outputs.
            """
            if generator is None:
                return [], None, "❌ **Error**: Generator not initialized. Please restart the application."

            # The textbox value can be None as well as blank; guard before strip().
            if not descriptions or not descriptions.strip():
                return [], None, "❌ **Please describe the sounds you want to create.**"

            try:
                # Generate audio files
                results = generator.generate_multiple(descriptions, duration, quality)

                # Return files and set first as preview
                file_paths = [r[0] for r in results]
                preview_path = file_paths[0] if file_paths else None

                status = f"✅ **Successfully generated {len(file_paths)} AI audio file(s)!**"

                return file_paths, preview_path, status

            except Exception as e:
                # logger.exception keeps the traceback for debugging.
                logger.exception("Generation error: %s", e)
                return [], None, f"❌ **Error**: {str(e)}"

        def preview_audio(files):
            """Return the first file of the list for the preview player, if any."""
            return files[0] if files else None

        # Selecting a preset copies its description into the text box.
        preset_buttons.input(
            fn=load_preset,
            inputs=[preset_buttons],
            outputs=[text_input]
        )

        generate_btn.click(
            fn=generate_sounds,
            inputs=[text_input, duration_slider, quality_slider],
            outputs=[output_gallery, audio_preview, status_text]
        )

        # Keep the preview in sync whenever the file list changes.
        output_gallery.change(
            fn=preview_audio,
            inputs=[output_gallery],
            outputs=[audio_preview]
        )

    return demo

# Script entry point: build the UI and start the server
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # Important for HF Spaces
        "server_port": 7860,       # Standard port for HF Spaces
        "share": False,            # Don't need sharing link in HF Spaces
        "show_error": True,        # Show errors for debugging
        "quiet": False,            # Show startup logs
    }
    interface = create_interface()
    interface.launch(**launch_options)