import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import scipy.io.wavfile
import numpy as np
from diffusers import AudioLDMPipeline
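
# Dependencies (a minimal set inferred from the imports above):
#   pip install gradio transformers torch scipy numpy diffusers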

# Load the fine-tuned TinyLlama model and its tokenizer.
model = AutoModelForCausalLM.from_pretrained("kenooo/multisense-tinyllama-finetune")
tokenizer = AutoTokenizer.from_pretrained("kenooo/multisense-tinyllama-finetune")
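
# Optional: a 4-bit quantized load via BitsAndBytesConfig. A minimal sketch,
# assuming a CUDA device and the bitsandbytes package are available (not needed
# for the CPU path used here):
#
#   from transformers import BitsAndBytesConfig
#   bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
#   model = AutoModelForCausalLM.from_pretrained(
#       "kenooo/multisense-tinyllama-finetune", quantization_config=bnb_config
#   )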

# Generate a textual sound description for a cooking action, using the
# "### Instruction:" / "### Sound:" prompt format the fine-tuned checkpoint expects.
def generate_sound_description(text_input):
    prompt = f"### Instruction:\nDescribe the sound of the following action.\n{text_input}\n### Sound:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,      # cap the length of the generated description
        do_sample=True,          # sample instead of greedy decoding
        temperature=0.9,         # fairly high temperature for varied output
        top_p=0.95,              # nucleus sampling
        repetition_penalty=1.2,  # discourage repeated phrases
    )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Keep only the text between "### Sound:" and any trailing "###" marker.
    result = decoded.split("### Sound:")[-1].split("###")[0].strip()

    # result is always a stripped str here, so an emptiness check is enough.
    if not result:
        return "[Error: No valid sound description returned]"
    return result
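
# Quick check of the text stage alone (output varies, since decoding is sampled):
#
#   print(generate_sound_description("Stirring onions in a hot pan"))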

# Text-to-audio pipeline. float32 on CPU keeps the demo broadly runnable; with a
# GPU available, torch_dtype=torch.float16 and .to("cuda") would be much faster.
pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm-m-full", torch_dtype=torch.float32).to("cpu")

def generate_audio_from_description(description, output_path="output.wav"):
    # AudioLDM returns float audio in [-1, 1]; scale to 16-bit PCM and write a
    # 16 kHz WAV file (the pipeline's native sample rate).
    audio = pipe(description, num_inference_steps=50).audios[0]
    audio_np = (audio * 32767).astype(np.int16)
    scipy.io.wavfile.write(output_path, rate=16000, data=audio_np)
    print(f"[📝 DESCRIPTION]: {repr(description)}")
    return output_path
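
# The clip length defaults to roughly 5 seconds; AudioLDMPipeline accepts an
# audio_length_in_s argument to change it. A sketch:
#
#   audio = pipe(description, num_inference_steps=50, audio_length_in_s=10.0).audios[0]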

# End-to-end pipeline: action text -> sound description -> generated audio file.
def multisense_pipeline(text_input):
    description = generate_sound_description(text_input)
    audio_file = generate_audio_from_description(description)
    return description, audio_file

# Gradio interface
iface = gr.Interface(
    fn=multisense_pipeline,
    inputs=gr.Textbox(lines=2, placeholder="e.g., Stirring onions in a hot pan"),
    outputs=[
        gr.Textbox(label="Sound Description"),
        gr.Audio(label="Generated Audio", type="filepath")
    ],
    title="🍳 Cooking Sound Description Generator",
    description="Enter a cooking action. The model will describe the sound it would make."
)

# share=True additionally creates a temporary public Gradio link.
iface.launch(share=True)