Spaces:
Running
Running
File size: 2,168 Bytes
66273c7 983a9af 0a405d7 66273c7 d05ade5 66273c7 d05ade5 66273c7 cea403e c559bf7 cea403e 66e96b0 9137245 cea403e 36dded3 cea403e 0a405d7 cea403e 0a405d7 983a9af 0a405d7 d5c8e58 983a9af 0a405d7 983a9af 770bf76 983a9af 0a405d7 66273c7 0a405d7 66273c7 0a405d7 66273c7 bb7ea5f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
import gradio as gr
import numpy as np
import scipy
import scipy.io.wavfile  # `import scipy` alone does not guarantee the io.wavfile subpackage is loaded
import torch
from diffusers import AudioLDMPipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Load the fine-tuned TinyLlama text-to-sound-description model and its tokenizer.
# NOTE: these download weights from the Hugging Face Hub on first run (network I/O).
model = AutoModelForCausalLM.from_pretrained("kenooo/multisense-tinyllama-finetune")
tokenizer = AutoTokenizer.from_pretrained("kenooo/multisense-tinyllama-finetune")
# Function to generate sound description
def generate_sound_description(text_input):
    """Generate a short textual sound description for an action.

    Wraps *text_input* in the instruction-tuning prompt format the
    fine-tuned model was trained on, samples a completion, and extracts
    the text between "### Sound:" and the next "###" marker.

    Args:
        text_input: A natural-language action, e.g. "Stirring onions in a hot pan".

    Returns:
        The extracted sound description, or a bracketed error string when
        the model produced nothing after the "### Sound:" marker.
    """
    prompt = f"### Instruction:\nDescribe the sound of the following action.\n{text_input}\n### Sound:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Inference only — no_grad avoids building an autograd graph and saves memory.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.9,
            do_sample=True,
            top_p=0.95,
            repetition_penalty=1.2
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Take only the text after the last "### Sound:" marker, up to any following "###".
    result = decoded.split("### Sound:")[-1].split("###")[0].strip()
    # str.split always yields strings, so only emptiness needs checking
    # (the original isinstance(result, str) guard could never fail).
    if not result:
        return "[Error: No valid sound description returned]"
    return result
# Text-to-audio pipeline; float32 on CPU (no GPU assumed in this Space). Downloads weights on first run.
pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm-m-full", torch_dtype=torch.float32).to("cpu")
def generate_audio_from_description(description, output_path="output.wav"):
    """Synthesize audio for *description* with AudioLDM and write a 16 kHz WAV.

    Args:
        description: Text prompt for the audio model.
        output_path: Destination WAV file path (default "output.wav").

    Returns:
        The path the WAV file was written to.
    """
    audio = pipe(description, num_inference_steps=50).audios[0]
    # Clip to [-1, 1] before scaling: samples outside that range would
    # otherwise wrap around when cast to int16, producing loud artifacts.
    audio_np = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    # AudioLDM generates at 16 kHz, so write with that sample rate.
    scipy.io.wavfile.write(output_path, rate=16000, data=audio_np)
    print(f"[📝 DESCRIPTION]: {repr(description)}")
    return output_path
async def multisense_pipeline(text_input):
    """End-to-end pipeline: action text -> sound description -> WAV file.

    Returns a (description, audio_file_path) pair for the Gradio outputs.
    """
    sound_description = generate_sound_description(text_input)
    return sound_description, generate_audio_from_description(sound_description)
# Gradio interface
iface = gr.Interface(
    fn=multisense_pipeline,  # Gradio awaits async handlers natively
    inputs=gr.Textbox(lines=2, placeholder="e.g., Stirring onions in a hot pan"),
    outputs=[
        gr.Textbox(label="Sound Description"),
        # type="filepath" matches the string path returned by the pipeline
        gr.Audio(label="Generated Audio", type="filepath")
    ],
    title="🍳 Cooking Sound Description Generator",
    description="Enter a cooking action. The model will describe the sound it would make."
)
# Blocks here serving the app; share=True requests a public tunnel URL.
iface.launch(share=True)
|