File size: 3,057 Bytes
5c83790
bf76c51
 
 
99711d6
bf76c51
 
6a6cb15
4d91972
bf76c51
244dbca
 
 
 
bf76c51
244dbca
bf76c51
 
244dbca
99711d6
 
244dbca
99711d6
 
 
 
 
 
 
 
 
 
244dbca
 
 
 
bf76c51
 
ceaf5e7
99711d6
fd34583
 
4d91972
5c83790
fd34583
5c83790
 
 
bf76c51
5c83790
 
99711d6
f22e219
99711d6
c8fce57
bf76c51
c8fce57
 
99711d6
244dbca
5c83790
 
244dbca
 
99711d6
244dbca
 
 
bf76c51
244dbca
 
c8fce57
bf76c51
244dbca
c8fce57
244dbca
 
 
bf76c51
5c83790
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# MusicGen + Gradio + GPT Demo App (CPU-Optimized with MCP Server)

import gradio as gr
import os
import numpy as np
import torch
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from openai import OpenAI
import scipy.io.wavfile

# Force CPU device (no GPU required).
# NOTE(review): inference on CPU is slow for MusicGen; keep token counts small.
device = torch.device("cpu")

# Load MusicGen model onto CPU.
# Downloads weights from the Hugging Face Hub on first run (network required).
model_name = "facebook/musicgen-small"
model = MusicgenForConditionalGeneration.from_pretrained(model_name).to(device)
processor = AutoProcessor.from_pretrained(model_name)

# Initialize OpenAI client (set OPENAI_API_KEY in HF Spaces Secrets).
# If the env var is missing, api_key is None and API calls will fail at request time.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Refine user prompt via GPT
def refine_prompt(user_input):
    """Ask GPT-4 to expand a short user idea into a richer prompt for the music model."""
    system_msg = {
        "role": "system",
        "content": "You are a music assistant. Make the user's input more descriptive for an AI music generator.",
    }
    user_msg = {"role": "user", "content": user_input}
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[system_msg, user_msg],
    )
    # Return only the assistant's text, trimmed of surrounding whitespace.
    return response.choices[0].message.content.strip()

# Generate music (shorter tokens for CPU speed)
def generate_music(prompt, max_new_tokens: int = 128):
    """Generate a short audio clip from a text prompt with MusicGen.

    Args:
        prompt: Text description of the desired music.
        max_new_tokens: Number of audio tokens to generate (smaller = faster on CPU).

    Returns:
        Tuple of (sampling_rate, audio) where audio is a float32 array
        normalized into [-1.0, 1.0] for Gradio's numpy audio component.
        Side effect: also writes a 16-bit mono WAV copy to /tmp/output.wav.
    """
    inputs = processor(text=[prompt], return_tensors="pt").to(device)
    audio_values = model.generate(**inputs, max_new_tokens=max_new_tokens)
    sampling_rate = model.config.audio_encoder.sampling_rate
    audio = audio_values[0].cpu().numpy()

    # Normalize to float32 in -1.0 to 1.0 range for Gradio.
    # Guard against an all-zero buffer: dividing by a 0 peak would produce NaNs,
    # which Gradio/scipy cannot play or write.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak
    audio = audio.astype(np.float32)

    # Prepare int16 version and ensure 1D for WAV
    int_audio = (audio * 32767).astype(np.int16)
    int_audio = np.squeeze(int_audio)
    if int_audio.ndim > 1:
        # Still multi-channel after squeeze — keep the first channel only.
        int_audio = int_audio[:, 0]

    # Save as .wav file (in /tmp for Spaces)
    scipy.io.wavfile.write("/tmp/output.wav", sampling_rate, int_audio)
    return sampling_rate, audio

# Combined Gradio function
def main(user_input, max_new_tokens):
    """Refine the user's idea via GPT, then synthesize audio from the refined prompt."""
    enhanced = refine_prompt(user_input)
    rate, waveform = generate_music(enhanced, max_new_tokens)
    # Third output points at the WAV file generate_music just wrote.
    return enhanced, (rate, waveform), "/tmp/output.wav"

# Build Gradio UI: one text input + token-length slider in, refined prompt,
# playable audio, and a downloadable WAV file out.
demo = gr.Blocks()
with demo:
    gr.Markdown("""# 🎡 AI Music Generator  
Enter a music idea or mood and get a short AI-generated track. (CPU mode)""")

    # Inputs
    user_input = gr.Textbox(label="Describe the mood or style of music")
    # Slider bounds match generate_music's default of 128 tokens; capped at 256
    # to keep CPU generation time tolerable.
    max_tokens = gr.Slider(32, 256, value=128, step=32, label="Length (tokens) for CPU")
    generate_btn = gr.Button("Generate Music")

    # Outputs (order must match main()'s return tuple)
    refined_output = gr.Textbox(label="Enhanced Prompt by GPT")
    audio_output = gr.Audio(label="Generated Audio", type="numpy")
    download_wav = gr.File(label="Download .wav file")

    generate_btn.click(
        main,
        inputs=[user_input, max_tokens],
        outputs=[refined_output, audio_output, download_wav]
    )

# Launch the app with Gradio's built-in MCP server enabled.
# BUG FIX: gradio has no `gradio.mcp_server.MCPServer` class — the original
# import/launch would raise ImportError. MCP serving is enabled by passing
# mcp_server=True to Blocks.launch() (Gradio >= 5.28, requires the
# gradio[mcp] extra).
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, mcp_server=True)