import gradio as gr
import numpy as np
import scipy.io.wavfile
import torch
from diffusers import AudioLDMPipeline
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned TinyLlama model and tokenizer
model = AutoModelForCausalLM.from_pretrained("kenooo/multisense-tinyllama-finetune")
tokenizer = AutoTokenizer.from_pretrained("kenooo/multisense-tinyllama-finetune")


# Function to generate sound description
def generate_sound_description(text_input):
    prompt = f"### Instruction:\nDescribe the sound of the following action.\n{text_input}\n### Sound:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.9,
        do_sample=True,
        top_p=0.95,
        repetition_penalty=1.2,
    )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text between the "### Sound:" marker and the next "###" section
    result = decoded.split("### Sound:")[-1].split("###")[0].strip()
    if not isinstance(result, str) or len(result.strip()) == 0:
        return "[Error: No valid sound description returned]"
    return result


# Text-to-audio pipeline (full precision, CPU)
pipe = AudioLDMPipeline.from_pretrained(
    "cvssp/audioldm-m-full", torch_dtype=torch.float32
).to("cpu")


def generate_audio_from_description(description, output_path="output.wav"):
    audio = pipe(description, num_inference_steps=50).audios[0]
    # Convert float audio in [-1, 1] to 16-bit PCM and write a 16 kHz WAV file
    audio_np = (audio * 32767).astype(np.int16)
    scipy.io.wavfile.write(output_path, rate=16000, data=audio_np)
    print(f"[📝 DESCRIPTION]: {repr(description)}")
    return output_path


def multisense_pipeline(text_input):
    description = generate_sound_description(text_input)
    audio_file = generate_audio_from_description(description)
    return description, audio_file


# Gradio interface
iface = gr.Interface(
    fn=multisense_pipeline,
    inputs=gr.Textbox(lines=2, placeholder="e.g., Stirring onions in a hot pan"),
    outputs=[
        gr.Textbox(label="Sound Description"),
        gr.Audio(label="Generated Audio", type="filepath"),
    ],
    title="🍳 Cooking Sound Description Generator",
    description="Enter a cooking action. The model will describe the sound it would make.",
)

iface.launch(share=True)