import gradio as gr
import numpy as np
import scipy.io.wavfile
import tempfile
import os
import logging
from typing import List, Optional, Tuple
import gc

from config import *
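# Names expected from config.py (all used below): MODEL_NAME, SAMPLE_RATE,
# DEFAULT_DURATION, DEFAULT_INFERENCE_STEPS, MIN_DURATION, MAX_DURATION,
# MIN_QUALITY_STEPS, MAX_QUALITY_STEPS, MAX_SOUNDS_PER_REQUEST,
# AUDIO_ENHANCERS, PRESET_SCENES.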
# Mock classes so the app still runs when PyTorch/diffusers are unavailable
class MockPipeline:
    def __init__(self, *args, **kwargs):
        pass

    def to(self, device):
        return self

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        # Mirror the diffusers classmethod signature so call sites need no changes
        return cls()

    def __call__(self, prompt, **kwargs):
        # Synthesize placeholder audio shaped by keywords in the prompt
        duration = kwargs.get('audio_length_in_s', 5.0)
        sample_rate = 16000
        t = np.linspace(0, duration, int(sample_rate * duration))

        if "thunder" in prompt.lower():
            audio = np.random.normal(0, 0.3, len(t)) * np.exp(-t / 2)
        elif "rain" in prompt.lower():
            audio = np.random.normal(0, 0.1, len(t))
        elif "fire" in prompt.lower():
            base = np.sin(2 * np.pi * 200 * t) * 0.2
            crackle = np.random.normal(0, 0.15, len(t))
            audio = base + crackle
        elif "ocean" in prompt.lower():
            wave = np.sin(2 * np.pi * 0.5 * t) * 0.3
            noise = np.random.normal(0, 0.1, len(t))
            audio = wave + noise
        else:
            # Fall back to a layered two-tone signal with light noise
            freq1 = 220 + np.random.randint(-50, 50)
            freq2 = 440 + np.random.randint(-100, 100)
            audio = (np.sin(2 * np.pi * freq1 * t) * 0.2 +
                     np.sin(2 * np.pi * freq2 * t) * 0.1 +
                     np.random.normal(0, 0.05, len(t)))

        # Apply a short fade-in/fade-out envelope to avoid clicks
        fade_samples = int(0.1 * sample_rate)
        if len(audio) > 2 * fade_samples:
            audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
            audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)

        class MockResult:
            def __init__(self, audio):
                self.audios = [audio]

        return MockResult(audio)
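# Minimal usage sketch of the mock path (no PyTorch required):
#   pipe = MockPipeline.from_pretrained("any-model-id")
#   result = pipe("rain on a tin roof", audio_length_in_s=3.0)
#   result.audios[0]  # numpy float array rendered at 16 kHz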
class MockCuda:
    def is_available(self):
        return False

    def empty_cache(self):
        pass


class MockTorch:
    def __init__(self):
        self.cuda = MockCuda()
        self.float16 = 'float16'
        self.float32 = 'float32'
# Try to import PyTorch and diffusers; fall back to the mocks if unavailable
try:
    import torch
    from diffusers import AudioLDMPipeline
    PYTORCH_AVAILABLE = True
    print("✅ PyTorch and diffusers loaded successfully!")
except ImportError:
    print("📦 Using fallback audio generation (PyTorch not available)")
    torch = MockTorch()
    AudioLDMPipeline = MockPipeline
    PYTORCH_AVAILABLE = False

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SoundScapeGenerator:
    def __init__(self):
        """Initialize the AudioLDM pipeline."""
        # Both the real torch and MockTorch expose cuda.is_available()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

        # Load the model
        logger.info("Loading AudioLDM model...")
        try:
            if PYTORCH_AVAILABLE:
                self.pipe = AudioLDMPipeline.from_pretrained(
                    MODEL_NAME,
                    torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                ).to(self.device)
                logger.info("✅ Real AudioLDM model loaded successfully!")
            else:
                self.pipe = MockPipeline()
                logger.info("✅ Mock audio generator loaded successfully!")
        except Exception as e:
            logger.warning(f"Using fallback audio generation: {e}")
            self.pipe = MockPipeline()

        # Audio settings from config
        self.sample_rate = SAMPLE_RATE
        self.default_duration = DEFAULT_DURATION
        self.inference_steps = DEFAULT_INFERENCE_STEPS

        # Create a temp directory for generated audio files
        self.temp_dir = tempfile.mkdtemp()
        logger.info(f"Temp directory: {self.temp_dir}")
        logger.info("🎵 SoundScape Generator initialized successfully!")
    def enhance_prompt(self, description: str) -> str:
        """Enhance the description for better audio generation."""
        enhanced = description.lower()
        for key, enhancement in AUDIO_ENHANCERS.items():
            if key in enhanced:
                enhanced = enhanced.replace(key, enhancement)

        # Append a generic audio-quality cue if none is present
        if "sound" not in enhanced:
            enhanced += " sound effect"

        return enhanced
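    # Example (assuming AUDIO_ENHANCERS contains {"rain": "heavy rain falling"}):
    #   enhance_prompt("rain at night") -> "heavy rain falling at night sound effect"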
    def generate_audio(self, description: str, duration: Optional[float] = None, steps: Optional[int] = None) -> Tuple[str, str]:
        """Generate an audio file from a text description."""
        try:
            # Use provided parameters or fall back to defaults
            audio_duration = duration if duration is not None else self.default_duration
            inference_steps = steps if steps is not None else self.inference_steps

            # Enhance the prompt
            enhanced_prompt = self.enhance_prompt(description)
            logger.info(f"Generating audio for: {enhanced_prompt}")

            # Generate audio
            audio = self.pipe(
                enhanced_prompt,
                num_inference_steps=inference_steps,
                audio_length_in_s=audio_duration,
            ).audios[0]

            # Build a filesystem-safe filename from the description
            safe_filename = "".join(c for c in description if c.isalnum() or c in (' ', '-', '_')).rstrip()
            filename = f"{safe_filename.replace(' ', '_')[:30] or 'sound'}.wav"
            filepath = os.path.join(self.temp_dir, filename)

            # Clip to [-1, 1], then convert to int16 for the WAV file
            audio = np.clip(audio, -1.0, 1.0)
            audio_int16 = np.int16(audio * 32767)
            scipy.io.wavfile.write(filepath, self.sample_rate, audio_int16)

            logger.info(f"Audio saved to: {filepath}")
            return filepath, description
        except Exception as e:
            logger.error(f"Error generating audio: {str(e)}")
            raise gr.Error(f"Failed to generate audio: {str(e)}")
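    # Note: AudioLDM checkpoints generate 16 kHz audio, so SAMPLE_RATE in config
    # should normally be 16000 (the mock pipeline also renders at 16 kHz); a
    # mismatch here would change playback speed and duration.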
    def generate_multiple(self, descriptions: str, duration: float, steps: int) -> List[Tuple[str, str]]:
        """Generate multiple audio files from comma-separated descriptions."""
        # Parse descriptions
        desc_list = [d.strip() for d in descriptions.split(",") if d.strip()]

        if not desc_list:
            raise gr.Error("Please enter at least one description")

        if len(desc_list) > MAX_SOUNDS_PER_REQUEST:
            raise gr.Error(f"Maximum {MAX_SOUNDS_PER_REQUEST} sounds at once to prevent timeout")

        results = []
        for i, desc in enumerate(desc_list):
            logger.info(f"Generating {i + 1}/{len(desc_list)}: {desc}")
            filepath, label = self.generate_audio(desc, duration, steps)
            results.append((filepath, label))

            # Free GPU memory between generations when using CUDA
            if self.device == "cuda":
                torch.cuda.empty_cache()
            gc.collect()

        return results
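    # Example: generate_multiple("rain, thunder, wind", 5.0, 10) returns three
    # (filepath, label) tuples, one per comma-separated description.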
# Initialize the generator, with a minimal fallback if initialization fails
generator = None
try:
    generator = SoundScapeGenerator()
    logger.info("🎉 Generator successfully initialized!")
except Exception as e:
    logger.error(f"❌ Failed to initialize generator: {e}")
    # Fall back to a minimal generator that only produces a simple tone
    try:
        class MinimalGenerator:
            def __init__(self):
                self.temp_dir = tempfile.mkdtemp()
                self.sample_rate = 16000
                self.pipe = MockPipeline()

            def enhance_prompt(self, description):
                return description

            def generate_audio(self, description, duration=5.0, steps=10):
                # Simple 440 Hz tone with a fade-in/fade-out envelope
                t = np.linspace(0, duration, int(self.sample_rate * duration))
                audio = np.sin(2 * np.pi * 440 * t) * 0.3

                fade_samples = int(0.1 * self.sample_rate)
                if len(audio) > 2 * fade_samples:
                    audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
                    audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)

                safe_filename = "".join(c for c in description if c.isalnum() or c in (' ', '-', '_')).rstrip()
                filename = f"{safe_filename.replace(' ', '_')[:30] or 'sound'}.wav"
                filepath = os.path.join(self.temp_dir, filename)

                audio_int16 = np.int16(audio * 32767)
                scipy.io.wavfile.write(filepath, self.sample_rate, audio_int16)
                return filepath, description

            def generate_multiple(self, descriptions, duration, steps):
                desc_list = [d.strip() for d in descriptions.split(",") if d.strip()]
                results = []
                for desc in desc_list:
                    filepath, label = self.generate_audio(desc, duration, steps)
                    results.append((filepath, label))
                return results

        generator = MinimalGenerator()
        logger.info("🔧 Minimal generator fallback created successfully!")
    except Exception as e2:
        logger.error(f"❌ Even minimal generator failed: {e2}")
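# If both constructors raised, generator remains None; generate_sounds() below
# reports that state to the user instead of crashing.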
# Cinematic CSS styling
CINEMATIC_CSS = """
/* Hans Zimmer inspired dark cinematic theme */
.gradio-container {
    background: linear-gradient(135deg, #0a0a0a 0%, #1a1a2e 50%, #16213e 100%) !important;
    font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
    color: #e0e6ed !important;
    max-width: 1400px !important;
    margin: 0 auto !important;
    padding: 0 !important;
    min-height: 100vh !important;
}

.dark {
    background: linear-gradient(135deg, #0a0a0a 0%, #1a1a2e 50%, #16213e 100%) !important;
}

/* Main title styling */
.main-title {
    background: linear-gradient(45deg, #ffd700, #ff6b35, #f7931e, #ffd700);
    background-size: 400% 400%;
    animation: gradientShift 4s ease-in-out infinite;
    -webkit-background-clip: text;
    background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 4rem !important;
    font-weight: 800 !important;
    text-align: center !important;
    margin: 2rem 0 !important;
    text-shadow: 0 0 30px rgba(255, 215, 0, 0.3);
    letter-spacing: -0.02em;
}

@keyframes gradientShift {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}

/* Subtitle */
.main-subtitle {
    color: #a0a8b0 !important;
    font-size: 1.3rem !important;
    text-align: center !important;
    margin-bottom: 3rem !important;
    font-weight: 300 !important;
    letter-spacing: 0.05em;
}

/* Card styling */
.input-card, .output-card {
    background: rgba(255, 255, 255, 0.03) !important;
    backdrop-filter: blur(20px) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
    border-radius: 20px !important;
    padding: 2rem !important;
    margin: 1rem !important;
    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3) !important;
}

/* Input elements */
.gr-textbox textarea {
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.2) !important;
    border-radius: 12px !important;
    color: #e0e6ed !important;
    padding: 1rem !important;
    font-size: 1rem !important;
    transition: all 0.3s ease !important;
}

.gr-textbox textarea:focus {
    border-color: #ffd700 !important;
    box-shadow: 0 0 20px rgba(255, 215, 0, 0.2) !important;
}

/* Button styling */
.generate-btn {
    background: linear-gradient(45deg, #ff6b35, #f7931e) !important;
    border: none !important;
    border-radius: 12px !important;
    padding: 1rem 2rem !important;
    font-size: 1.1rem !important;
    font-weight: 600 !important;
    color: white !important;
    transition: all 0.3s ease !important;
    box-shadow: 0 4px 20px rgba(255, 107, 53, 0.3) !important;
    text-transform: uppercase !important;
    letter-spacing: 0.1em !important;
}

.generate-btn:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 8px 30px rgba(255, 107, 53, 0.5) !important;
}

/* Demo mode notification */
.demo-banner {
    background: linear-gradient(45deg, #ff6b35, #ffd700) !important;
    color: #000 !important;
    padding: 1rem !important;
    border-radius: 10px !important;
    text-align: center !important;
    font-weight: 600 !important;
    margin-bottom: 2rem !important;
    animation: pulse 2s infinite !important;
}

@keyframes pulse {
    0%, 100% { opacity: 1; }
    50% { opacity: 0.8; }
}

/* Preset buttons */
.gr-radio label {
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
    border-radius: 10px !important;
    margin: 0.3rem 0 !important;
    padding: 0.8rem !important;
    color: #e0e6ed !important;
    transition: all 0.3s ease !important;
    display: block !important;
}

.gr-radio label:hover {
    background: rgba(255, 215, 0, 0.1) !important;
    border-color: rgba(255, 215, 0, 0.3) !important;
}

.gr-radio input:checked + label {
    background: rgba(255, 215, 0, 0.2) !important;
    border-color: #ffd700 !important;
}

/* Sliders */
.gr-slider input[type="range"] {
    background: rgba(255, 255, 255, 0.1) !important;
    height: 6px !important;
    border-radius: 3px !important;
}

.gr-slider input[type="range"]::-webkit-slider-thumb {
    background: #ffd700 !important;
    border: none !important;
    border-radius: 50% !important;
    width: 18px !important;
    height: 18px !important;
    box-shadow: 0 0 10px rgba(255, 215, 0, 0.5) !important;
}

/* Audio player */
.gr-audio {
    background: rgba(255, 255, 255, 0.05) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* File gallery */
.gr-file {
    background: rgba(255, 255, 255, 0.05) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* Section headers */
.section-header {
    color: #ffd700 !important;
    font-size: 1.4rem !important;
    font-weight: 600 !important;
    margin: 1.5rem 0 1rem 0 !important;
    text-transform: uppercase !important;
    letter-spacing: 0.1em !important;
}

/* Examples */
.gr-examples {
    background: rgba(255, 255, 255, 0.02) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.05) !important;
    padding: 1rem !important;
}

/* Accordion */
.gr-accordion {
    background: rgba(255, 255, 255, 0.03) !important;
    border-radius: 15px !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* Status text */
.status-text {
    font-size: 1.1rem !important;
    padding: 1rem !important;
    border-radius: 10px !important;
    text-align: center !important;
    background: rgba(255, 255, 255, 0.05) !important;
    border: 1px solid rgba(255, 255, 255, 0.1) !important;
}

/* Responsive */
@media (max-width: 768px) {
    .main-title {
        font-size: 2.5rem !important;
    }
    .input-card, .output-card {
        margin: 0.5rem !important;
        padding: 1rem !important;
    }
}
"""
# Create the Gradio interface
def create_interface():
    with gr.Blocks(
        title="SoundScape Studio",
        theme=gr.themes.Base(),
        css=CINEMATIC_CSS
    ) as demo:
        # Header with cinematic styling
        gr.HTML("""
        <div style="position: relative; overflow: hidden;">
            <div style="text-align: center; padding: 3rem 0; position: relative; z-index: 1;">
                <h1 class="main-title">SOUNDSCAPE STUDIO</h1>
                <p class="main-subtitle">AI Sound Design • Powered by AudioLDM</p>
                <div style="width: 100px; height: 2px; background: linear-gradient(45deg, #ffd700, #ff6b35); margin: 0 auto; border-radius: 1px;"></div>
            </div>
        </div>
        """)
        with gr.Row():
            with gr.Column(scale=1, elem_classes=["input-card"]):
                # Input section
                gr.HTML('<h3 class="section-header">🎬 Describe Your Scene</h3>')
                text_input = gr.Textbox(
                    label="",
                    placeholder="Describe the sounds you want to create...\n\nExamples:\n• Epic thunderstorm with heavy rain and lightning\n• Mysterious forest at night with owls and wind\n• Intense battle scene with explosions and chaos\n• Peaceful ocean waves on a moonlit beach",
                    lines=6,
                    max_lines=8,
                    elem_classes=["cinematic-input"]
                )

                # Presets
                gr.HTML('<h3 class="section-header">🎭 Cinematic Presets</h3>')
                preset_buttons = gr.Radio(
                    choices=list(PRESET_SCENES.keys()),
                    label="",
                    value=None,
                    elem_classes=["preset-radio"]
                )

                # Advanced settings
                with gr.Accordion("⚙️ Advanced Controls", open=False):
                    duration_slider = gr.Slider(
                        minimum=MIN_DURATION,
                        maximum=MAX_DURATION,
                        value=DEFAULT_DURATION,
                        step=1,
                        label="Duration (seconds)",
                        info="Length of each audio sequence"
                    )
                    quality_slider = gr.Slider(
                        minimum=MIN_QUALITY_STEPS,
                        maximum=MAX_QUALITY_STEPS,
                        value=DEFAULT_INFERENCE_STEPS,
                        step=5,
                        label="Quality Steps",
                        info="Higher values = better quality, longer generation time"
                    )

                generate_btn = gr.Button(
                    "🎵 CREATE SOUNDSCAPE",
                    variant="primary",
                    size="lg",
                    elem_classes=["generate-btn"]
                )
            with gr.Column(scale=1, elem_classes=["output-card"]):
                # Output section
                gr.HTML('<h3 class="section-header">🔊 Generated Audio</h3>')
                output_gallery = gr.File(
                    label="",
                    file_count="multiple",
                    type="filepath",
                    interactive=False,
                    elem_classes=["output-files"]
                )

                # Audio player for preview
                gr.HTML('<h3 class="section-header">🎧 Audio Preview</h3>')
                audio_preview = gr.Audio(
                    label="",
                    type="filepath",
                    interactive=False,
                    elem_classes=["audio-player"]
                )

                # Status
                status_text = gr.Markdown(
                    "*Ready to create your soundscape...*",
                    elem_classes=["status-text"]
                )
        # Examples section with cinematic flair
        gr.HTML('<div style="margin-top: 3rem;"><h3 class="section-header">💡 Inspiration Gallery</h3></div>')
        with gr.Row():
            with gr.Column():
                gr.Examples(
                    examples=[
                        ["Epic thunderstorm with lightning strikes"],
                        ["Mysterious forest with owl calls and rustling leaves"],
                        ["Intense battlefield with explosions and gunfire"],
                    ],
                    inputs=text_input,
                    label="🎬 Cinematic"
                )
            with gr.Column():
                gr.Examples(
                    examples=[
                        ["Peaceful ocean waves on a quiet beach"],
                        ["Cozy fireplace with crackling wood"],
                        ["Gentle rain on a window with distant thunder"],
                    ],
                    inputs=text_input,
                    label="🌊 Ambient"
                )
            with gr.Column():
                gr.Examples(
                    examples=[
                        ["Busy city street with traffic and sirens"],
                        ["Industrial factory with machinery sounds"],
                        ["Haunted house with creaking doors and chains"],
                    ],
                    inputs=text_input,
                    label="🏙️ Urban/Horror"
                )
        # Footer
        gr.HTML("""
        <div style="text-align: center; margin-top: 3rem; padding: 2rem; border-top: 1px solid rgba(255,255,255,0.1);">
            <p style="color: #666; font-size: 0.9rem;">
                Powered by AudioLDM • Built for creators, filmmakers, and audio enthusiasts
            </p>
        </div>
        """)
        # Event handlers
        def load_preset(preset):
            if preset and preset in PRESET_SCENES:
                return PRESET_SCENES[preset]
            return ""

        def generate_sounds(descriptions, duration, quality):
            if generator is None:
                return [], None, "❌ **Error**: Generator not initialized. Please restart the application."

            if not descriptions.strip():
                return [], None, "❌ **Please describe the sounds you want to create.**"

            try:
                # Generate the audio files
                results = generator.generate_multiple(descriptions, duration, quality)

                # Return the files and use the first one as the preview
                file_paths = [r[0] for r in results]
                preview_path = file_paths[0] if file_paths else None
                status = f"✅ **Successfully generated {len(file_paths)} AI audio file(s)!**"
                return file_paths, preview_path, status
            except Exception as e:
                logger.error(f"Generation error: {e}")
                return [], None, f"❌ **Error**: {str(e)}"

        def preview_audio(files):
            if files and len(files) > 0:
                return files[0]
            return None

        # Wire up the events
        preset_buttons.input(
            fn=load_preset,
            inputs=[preset_buttons],
            outputs=[text_input]
        )

        generate_btn.click(
            fn=generate_sounds,
            inputs=[text_input, duration_slider, quality_slider],
            outputs=[output_gallery, audio_preview, status_text]
        )

        output_gallery.change(
            fn=preview_audio,
            inputs=[output_gallery],
            outputs=[audio_preview]
        )
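        # Note on listener choice: Radio.input fires only on direct user
        # interaction, while File.change also fires when generate_sounds()
        # updates output_gallery programmatically, which is what refreshes
        # the preview player after each generation.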
return demo | |
# Create and launch the app | |
if __name__ == "__main__": | |
interface = create_interface() | |
interface.launch( | |
server_name="0.0.0.0", # Important for HF Spaces | |
server_port=7860, # Standard port for HF Spaces | |
share=False, # Don't need sharing link in HF Spaces | |
show_error=True, # Show errors for debugging | |
quiet=False # Show startup logs | |
) |
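# Local usage sketch (assuming this file is saved as app.py, the usual HF
# Spaces convention): run `python app.py`, then open http://localhost:7860.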