"""SoundScape (app.py): a Gradio app that turns text descriptions into audio with AudioLDM."""
import gradio as gr
import numpy as np
import scipy.io.wavfile
import tempfile
import os
import logging
from typing import List, Tuple
import gc
from config import *
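
# config.py is expected to define the names used below: MODEL_NAME, SAMPLE_RATE,
# DEFAULT_DURATION, DEFAULT_INFERENCE_STEPS, MIN_DURATION, MAX_DURATION,
# MIN_QUALITY_STEPS, MAX_QUALITY_STEPS, MAX_SOUNDS_PER_REQUEST, AUDIO_ENHANCERS
# (a substring -> richer-phrase dict), and PRESET_SCENES (a name -> prompt dict).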
# Mock fallbacks so the app still runs when PyTorch/diffusers are unavailable
class MockPipeline:
    def __init__(self, *args, **kwargs):
        pass

    def to(self, device):
        return self

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        # Mirrors the diffusers classmethod signature; arguments are ignored
        return cls()

    def __call__(self, prompt, **kwargs):
        # Synthesize placeholder audio shaped by keywords in the prompt
        duration = kwargs.get('audio_length_in_s', 5.0)
        sample_rate = 16000
        t = np.linspace(0, duration, int(sample_rate * duration))

        if "thunder" in prompt.lower():
            # Decaying burst of broadband noise
            audio = np.random.normal(0, 0.3, len(t)) * np.exp(-t / 2)
        elif "rain" in prompt.lower():
            # Steady broadband noise
            audio = np.random.normal(0, 0.1, len(t))
        elif "fire" in prompt.lower():
            # Low hum plus crackle noise
            base = np.sin(2 * np.pi * 200 * t) * 0.2
            crackle = np.random.normal(0, 0.15, len(t))
            audio = base + crackle
        elif "ocean" in prompt.lower():
            # Slow 0.5 Hz swell plus surf noise
            wave = np.sin(2 * np.pi * 0.5 * t) * 0.3
            noise = np.random.normal(0, 0.1, len(t))
            audio = wave + noise
        else:
            # Two randomly detuned tones plus light noise
            freq1 = 220 + np.random.randint(-50, 50)
            freq2 = 440 + np.random.randint(-100, 100)
            audio = (np.sin(2 * np.pi * freq1 * t) * 0.2 +
                     np.sin(2 * np.pi * freq2 * t) * 0.1 +
                     np.random.normal(0, 0.05, len(t)))

        # Apply a short fade-in/fade-out envelope to avoid clicks
        fade_samples = int(0.1 * sample_rate)
        if len(audio) > 2 * fade_samples:
            audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
            audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)

        # Match the diffusers pipeline's return shape (result.audios[0])
        class MockResult:
            def __init__(self, audio):
                self.audios = [audio]

        return MockResult(audio)
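
# Illustrative call, mirroring the diffusers pipeline interface the mock stands in for:
#   result = MockPipeline()("rolling thunder", audio_length_in_s=5.0)
#   waveform = result.audios[0]  # float numpy array at 16 kHz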
class MockCuda:
    def is_available(self):
        return False

    def empty_cache(self):
        pass


class MockTorch:
    def __init__(self):
        self.cuda = MockCuda()
        self.float16 = 'float16'
        self.float32 = 'float32'
# Try to import PyTorch; fall back to the mocks if it is not available
try:
    import torch
    from diffusers import AudioLDMPipeline
    PYTORCH_AVAILABLE = True
    print("βœ… PyTorch and diffusers loaded successfully!")
except ImportError:
    print("πŸ“¦ Using fallback audio generation (PyTorch not available)")
    torch = MockTorch()
    AudioLDMPipeline = MockPipeline
    PYTORCH_AVAILABLE = False

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SoundScapeGenerator:
    def __init__(self):
        """Initialize the AudioLDM pipeline"""
        # Works for both paths: MockCuda.is_available() always returns False
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

        # Load the model
        logger.info("Loading AudioLDM model...")
        try:
            if PYTORCH_AVAILABLE:
                self.pipe = AudioLDMPipeline.from_pretrained(
                    MODEL_NAME,
                    torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                ).to(self.device)
                logger.info("βœ… Real AudioLDM model loaded successfully!")
            else:
                self.pipe = MockPipeline()
                logger.info("βœ… Mock audio generator loaded successfully!")
        except Exception as e:
            logger.warning(f"Using fallback audio generation: {e}")
            self.pipe = MockPipeline()

        # Audio settings from config
        self.sample_rate = SAMPLE_RATE
        self.default_duration = DEFAULT_DURATION
        self.inference_steps = DEFAULT_INFERENCE_STEPS

        # Create a temp directory for generated audio files
        self.temp_dir = tempfile.mkdtemp()
        logger.info(f"Temp directory: {self.temp_dir}")
        logger.info("🎡 SoundScape Generator initialized successfully!")
    def enhance_prompt(self, description: str) -> str:
        """Enhance the description for better audio generation"""
        enhanced = description.lower()
        for key, enhancement in AUDIO_ENHANCERS.items():
            if key in enhanced:
                enhanced = enhanced.replace(key, enhancement)
        # Append a general audio-quality hint if none is present
        if "sound" not in enhanced:
            enhanced += " sound effect"
        return enhanced
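
    # Illustrative example (assumes AUDIO_ENHANCERS contains, say, "rain" -> "heavy rain falling"):
    #   enhance_prompt("Rain at night")  ->  "heavy rain falling at night sound effect"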
    def generate_audio(self, description: str, duration: float = None, steps: int = None) -> Tuple[str, str]:
        """Generate an audio file from a description"""
        try:
            # Use provided parameters or fall back to defaults
            audio_duration = duration if duration is not None else self.default_duration
            inference_steps = steps if steps is not None else self.inference_steps

            # Enhance the prompt
            enhanced_prompt = self.enhance_prompt(description)
            logger.info(f"Generating audio for: {enhanced_prompt}")

            # Generate audio
            audio = self.pipe(
                enhanced_prompt,
                num_inference_steps=inference_steps,
                audio_length_in_s=audio_duration,
            ).audios[0]

            # Build a filesystem-safe filename from the description
            safe_filename = "".join(c for c in description if c.isalnum() or c in (' ', '-', '_')).rstrip()
            filename = f"{safe_filename.replace(' ', '_')[:30] or 'sound'}.wav"
            filepath = os.path.join(self.temp_dir, filename)

            # Clip to [-1, 1], then scale to int16 for 16-bit PCM WAV output
            audio_int16 = np.int16(np.clip(audio, -1.0, 1.0) * 32767)
            scipy.io.wavfile.write(filepath, self.sample_rate, audio_int16)

            logger.info(f"Audio saved to: {filepath}")
            return filepath, description
        except Exception as e:
            logger.error(f"Error generating audio: {str(e)}")
            raise gr.Error(f"Failed to generate audio: {str(e)}")
    def generate_multiple(self, descriptions: str, duration: float, steps: int) -> List[Tuple[str, str]]:
        """Generate multiple audio files from comma-separated descriptions"""
        # Parse the comma-separated descriptions
        desc_list = [d.strip() for d in descriptions.split(",") if d.strip()]

        if not desc_list:
            raise gr.Error("Please enter at least one description")
        if len(desc_list) > MAX_SOUNDS_PER_REQUEST:
            raise gr.Error(f"Maximum {MAX_SOUNDS_PER_REQUEST} sounds at once to prevent timeout")

        results = []
        for i, desc in enumerate(desc_list):
            logger.info(f"Generating {i+1}/{len(desc_list)}: {desc}")
            filepath, label = self.generate_audio(desc, duration, steps)
            results.append((filepath, label))

        # Free GPU memory when running on CUDA (only reachable with real PyTorch)
        if self.device == "cuda":
            torch.cuda.empty_cache()
            gc.collect()

        return results
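
    # Usage sketch (with an instance like the module-level `generator` created below):
    #   generator.generate_multiple("rolling thunder, gentle rain", duration=5.0, steps=10)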
# Initialize the generator, with a minimal fallback if full initialization fails
generator = None
try:
    generator = SoundScapeGenerator()
    logger.info("πŸŽ‰ Generator successfully initialized!")
except Exception as e:
    logger.error(f"❌ Failed to initialize generator: {e}")
    # Fall back to a minimal generator that only produces a simple test tone
    try:
        class MinimalGenerator:
            def __init__(self):
                self.temp_dir = tempfile.mkdtemp()
                self.sample_rate = 16000
                self.pipe = MockPipeline()

            def enhance_prompt(self, description):
                return description

            def generate_audio(self, description, duration=5.0, steps=10):
                # Plain 440 Hz tone as a placeholder (steps is accepted for
                # interface compatibility but unused here)
                t = np.linspace(0, duration, int(self.sample_rate * duration))
                audio = np.sin(2 * np.pi * 440 * t) * 0.3

                # Apply a short fade-in/fade-out envelope
                fade_samples = int(0.1 * self.sample_rate)
                if len(audio) > 2 * fade_samples:
                    audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
                    audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)

                safe_filename = "".join(c for c in description if c.isalnum() or c in (' ', '-', '_')).rstrip()
                filename = f"{safe_filename.replace(' ', '_')[:30] or 'sound'}.wav"
                filepath = os.path.join(self.temp_dir, filename)

                audio_int16 = np.int16(audio * 32767)
                scipy.io.wavfile.write(filepath, self.sample_rate, audio_int16)
                return filepath, description

            def generate_multiple(self, descriptions, duration, steps):
                desc_list = [d.strip() for d in descriptions.split(",") if d.strip()]
                results = []
                for desc in desc_list:
                    filepath, label = self.generate_audio(desc, duration, steps)
                    results.append((filepath, label))
                return results

        generator = MinimalGenerator()
        logger.info("πŸ”§ Minimal generator fallback created successfully!")
    except Exception as e2:
        logger.error(f"❌ Even minimal generator failed: {e2}")
# Cinematic CSS styling
CINEMATIC_CSS = """
/* Hans Zimmer inspired dark cinematic theme */
.gradio-container {
    background: linear-gradient(135deg, #0a0a0a 0%, #1a1a2e 50%, #16213e 100%) !important;
    font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
    color: #e0e6ed !important;
    max-width: 1400px !important;
    margin: 0 auto !important;
    padding: 0 !important;
    min-height: 100vh !important;
}

.dark {
    background: linear-gradient(135deg, #0a0a0a 0%, #1a1a2e 50%, #16213e 100%) !important;
}

/* Main title styling */
.main-title {
    background: linear-gradient(45deg, #ffd700, #ff6b35, #f7931e, #ffd700);
    background-size: 400% 400%;
    animation: gradientShift 4s ease-in-out infinite;
    -webkit-background-clip: text;
    background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 4rem !important;
    font-weight: 800 !important;
    text-align: center !important;
    margin: 2rem 0 !important;
    text-shadow: 0 0 30px rgba(255, 215, 0, 0.3);
    letter-spacing: -0.02em;
}

@keyframes gradientShift {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}

/* Subtitle */
.main-subtitle {
    color: #a0a8b0 !important;
    font-size: 1.3rem !important;
    text-align: center !important;
    margin-bottom: 3rem !important;
    font-weight: 300 !important;
    letter-spacing: 0.05em;
}

/* Card styling */
.input-card, .output-card {
    background: rgba(255, 255, 255, 0.03) !important;
    backdrop-filter: blur(20px) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
    border-radius: 20px !important;
    padding: 2rem !important;
    margin: 1rem !important;
    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3) !important;
}

/* Input elements */
.gr-textbox textarea {
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.2) !important;
    border-radius: 12px !important;
    color: #e0e6ed !important;
    padding: 1rem !important;
    font-size: 1rem !important;
    transition: all 0.3s ease !important;
}

.gr-textbox textarea:focus {
    border-color: #ffd700 !important;
    box-shadow: 0 0 20px rgba(255, 215, 0, 0.2) !important;
}

/* Button styling */
.generate-btn {
    background: linear-gradient(45deg, #ff6b35, #f7931e) !important;
    border: none !important;
    border-radius: 12px !important;
    padding: 1rem 2rem !important;
    font-size: 1.1rem !important;
    font-weight: 600 !important;
    color: white !important;
    transition: all 0.3s ease !important;
    box-shadow: 0 4px 20px rgba(255, 107, 53, 0.3) !important;
    text-transform: uppercase !important;
    letter-spacing: 0.1em !important;
}

.generate-btn:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 8px 30px rgba(255, 107, 53, 0.5) !important;
}

/* Demo mode notification */
.demo-banner {
    background: linear-gradient(45deg, #ff6b35, #ffd700) !important;
    color: #000 !important;
    padding: 1rem !important;
    border-radius: 10px !important;
    text-align: center !important;
    font-weight: 600 !important;
    margin-bottom: 2rem !important;
    animation: pulse 2s infinite !important;
}

@keyframes pulse {
    0%, 100% { opacity: 1; }
    50% { opacity: 0.8; }
}

/* Preset buttons */
.gr-radio label {
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
    border-radius: 10px !important;
    margin: 0.3rem 0 !important;
    padding: 0.8rem !important;
    color: #e0e6ed !important;
    transition: all 0.3s ease !important;
    display: block !important;
}

.gr-radio label:hover {
    background: rgba(255, 215, 0, 0.1) !important;
    border-color: rgba(255, 215, 0, 0.3) !important;
}

.gr-radio input:checked + label {
    background: rgba(255, 215, 0, 0.2) !important;
    border-color: #ffd700 !important;
}

/* Sliders */
.gr-slider input[type="range"] {
    background: rgba(255, 255, 255, 0.1) !important;
    height: 6px !important;
    border-radius: 3px !important;
}

.gr-slider input[type="range"]::-webkit-slider-thumb {
    background: #ffd700 !important;
    border: none !important;
    border-radius: 50% !important;
    width: 18px !important;
    height: 18px !important;
    box-shadow: 0 0 10px rgba(255, 215, 0, 0.5) !important;
}

/* Audio player */
.gr-audio {
    background: rgba(255, 255, 255, 0.05) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* File gallery */
.gr-file {
    background: rgba(255, 255, 255, 0.05) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* Section headers */
.section-header {
    color: #ffd700 !important;
    font-size: 1.4rem !important;
    font-weight: 600 !important;
    margin: 1.5rem 0 1rem 0 !important;
    text-transform: uppercase !important;
    letter-spacing: 0.1em !important;
}

/* Examples */
.gr-examples {
    background: rgba(255, 255, 255, 0.02) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.05) !important;
    padding: 1rem !important;
}

/* Accordion */
.gr-accordion {
    background: rgba(255, 255, 255, 0.03) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* Status text */
.status-text {
    font-size: 1.1rem !important;
    padding: 1rem !important;
    border-radius: 10px !important;
    text-align: center !important;
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* Responsive */
@media (max-width: 768px) {
    .main-title {
        font-size: 2.5rem !important;
    }
    .input-card, .output-card {
        margin: 0.5rem !important;
        padding: 1rem !important;
    }
}
"""
# Create Gradio interface
def create_interface():
    with gr.Blocks(
        title="SoundScape Studio",
        theme=gr.themes.Base(),
        css=CINEMATIC_CSS
    ) as demo:
        # Header with cinematic styling
        gr.HTML("""
        <div style="position: relative; overflow: hidden;">
            <div style="text-align: center; padding: 3rem 0; position: relative; z-index: 1;">
                <h1 class="main-title">SOUNDSCAPE STUDIO</h1>
                <p class="main-subtitle">AI Sound Design β€’ Powered by AudioLDM</p>
                <div style="width: 100px; height: 2px; background: linear-gradient(45deg, #ffd700, #ff6b35); margin: 0 auto; border-radius: 1px;"></div>
            </div>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=1, elem_classes=["input-card"]):
                # Input section
                gr.HTML('<h3 class="section-header">🎬 Describe Your Scene</h3>')
                text_input = gr.Textbox(
                    label="",
                    placeholder="Describe the sounds you want to create...\n\nExamples:\nβ€’ Epic thunderstorm with heavy rain and lightning\nβ€’ Mysterious forest at night with owls and wind\nβ€’ Intense battle scene with explosions and chaos\nβ€’ Peaceful ocean waves on a moonlit beach",
                    lines=6,
                    max_lines=8,
                    elem_classes=["cinematic-input"]
                )

                # Presets
                gr.HTML('<h3 class="section-header">🎭 Cinematic Presets</h3>')
                preset_buttons = gr.Radio(
                    choices=list(PRESET_SCENES.keys()),
                    label="",
                    value=None,
                    elem_classes=["preset-radio"]
                )

                # Advanced settings
                with gr.Accordion("βš™οΈ Advanced Controls", open=False):
                    duration_slider = gr.Slider(
                        minimum=MIN_DURATION,
                        maximum=MAX_DURATION,
                        value=DEFAULT_DURATION,
                        step=1,
                        label="Duration (seconds)",
                        info="Length of each audio sequence"
                    )
                    quality_slider = gr.Slider(
                        minimum=MIN_QUALITY_STEPS,
                        maximum=MAX_QUALITY_STEPS,
                        value=DEFAULT_INFERENCE_STEPS,
                        step=5,
                        label="Quality Steps",
                        info="Higher values = better quality, longer generation time"
                    )

                generate_btn = gr.Button(
                    "🎡 CREATE SOUNDSCAPE",
                    variant="primary",
                    size="lg",
                    elem_classes=["generate-btn"]
                )

            with gr.Column(scale=1, elem_classes=["output-card"]):
                # Output section
                gr.HTML('<h3 class="section-header">πŸ“‚ Generated Audio</h3>')
                output_gallery = gr.File(
                    label="",
                    file_count="multiple",
                    type="filepath",
                    interactive=False,
                    elem_classes=["output-files"]
                )

                # Audio player for preview
                gr.HTML('<h3 class="section-header">πŸ”Š Audio Preview</h3>')
                audio_preview = gr.Audio(
                    label="",
                    type="filepath",
                    interactive=False,
                    elem_classes=["audio-player"]
                )

                # Status
                status_text = gr.Markdown(
                    "*Ready to create your soundscape...*",
                    elem_classes=["status-text"]
                )

        # Examples section with cinematic flair
        gr.HTML('<div style="margin-top: 3rem;"><h3 class="section-header">πŸ’‘ Inspiration Gallery</h3></div>')
        with gr.Row():
            with gr.Column():
                gr.Examples(
                    examples=[
                        ["Epic thunderstorm with lightning strikes"],
                        ["Mysterious forest with owl calls and rustling leaves"],
                        ["Intense battlefield with explosions and gunfire"],
                    ],
                    inputs=text_input,
                    label="🎬 Cinematic"
                )
            with gr.Column():
                gr.Examples(
                    examples=[
                        ["Peaceful ocean waves on a quiet beach"],
                        ["Cozy fireplace with crackling wood"],
                        ["Gentle rain on a window with distant thunder"],
                    ],
                    inputs=text_input,
                    label="πŸŒ… Ambient"
                )
            with gr.Column():
                gr.Examples(
                    examples=[
                        ["Busy city street with traffic and sirens"],
                        ["Industrial factory with machinery sounds"],
                        ["Haunted house with creaking doors and chains"],
                    ],
                    inputs=text_input,
                    label="πŸ™οΈ Urban/Horror"
                )

        # Footer
        gr.HTML("""
        <div style="text-align: center; margin-top: 3rem; padding: 2rem; border-top: 1px solid rgba(255,255,255,0.1);">
            <p style="color: #666; font-size: 0.9rem;">
                Powered by AudioLDM β€’ Built for creators, filmmakers, and audio enthusiasts
            </p>
        </div>
        """)
        # Event handlers
        def load_preset(preset):
            if preset and preset in PRESET_SCENES:
                return PRESET_SCENES[preset]
            return ""

        def generate_sounds(descriptions, duration, quality):
            if generator is None:
                return [], None, "❌ **Error**: Generator not initialized. Please restart the application."
            if not descriptions.strip():
                return [], None, "❌ **Please describe the sounds you want to create.**"
            try:
                # Generate the audio files
                results = generator.generate_multiple(descriptions, duration, quality)

                # Return the files and use the first one as the preview
                file_paths = [r[0] for r in results]
                preview_path = file_paths[0] if file_paths else None
                status = f"βœ… **Successfully generated {len(file_paths)} AI audio file(s)!**"
                return file_paths, preview_path, status
            except Exception as e:
                logger.error(f"Generation error: {e}")
                return [], None, f"❌ **Error**: {str(e)}"

        def preview_audio(files):
            if files and len(files) > 0:
                return files[0]
            return None

        # Wire up the events
        preset_buttons.input(
            fn=load_preset,
            inputs=[preset_buttons],
            outputs=[text_input]
        )

        generate_btn.click(
            fn=generate_sounds,
            inputs=[text_input, duration_slider, quality_slider],
            outputs=[output_gallery, audio_preview, status_text]
        )

        output_gallery.change(
            fn=preview_audio,
            inputs=[output_gallery],
            outputs=[audio_preview]
        )

    return demo
# Create and launch the app
if __name__ == "__main__":
    interface = create_interface()
    interface.launch(
        server_name="0.0.0.0",  # Bind to all interfaces (required for HF Spaces)
        server_port=7860,       # Standard port for HF Spaces
        share=False,            # No share link needed on HF Spaces
        show_error=True,        # Surface errors in the UI for debugging
        quiet=False             # Show startup logs
    )