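# Legacy CLAP-based captioning demo, kept commented out for reference: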
# import gradio as gr
# from msclap import CLAP
#
# clap_model = CLAP(version="clapcap", use_cuda=False)
#
# def clap_inference(mic=None, file=None):
#     if mic is not None:
#         audio = mic
#     elif file is not None:
#         audio = file
#     else:
#         return "You must either provide a mic recording or a file"
#     # Generate captions for the recording
#     captions = clap_model.generate_caption(
#         [audio],
#         resample=True,
#         beam_size=5,
#         entry_length=67,
#         temperature=0.01,
#     )
#     return captions[0]

import gradio as gr
import librosa
import torch

from dcase24t6.nn.hub import baseline_pipeline

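# Load the DCASE 2024 Task 6 baseline captioning pipeline once at import time
# (the CNext-trans baseline: a ConvNeXt audio encoder feeding a Transformer
# caption decoder; the first call typically downloads the pretrained weights).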
model = baseline_pipeline()


def dcase_inference(mic=None, file=None):
    # Both Gradio inputs use type="filepath", so `mic` and `file` are paths to
    # audio files on disk (the mic recording is saved to a temporary file).
    # The previous version passed the raw mic filepath as the waveform with a
    # hardcoded sample rate; both branches now load the audio the same way.
    path = mic if mic is not None else file
    if path is None:
        return "You must either provide a mic recording or a file"
    # Load at the native sample rate and pass `sr` along so the pipeline can
    # resample the signal itself.
    audio, sr = librosa.load(path, sr=None, mono=True)
    audio = torch.from_numpy(audio)
    # Generate a caption for the recording.
    item = {"audio": audio, "sr": sr}
    outputs = model(item)
    return outputs["candidates"][0]
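

# Quick smoke test outside Gradio (hypothetical file name; any local WAV works):
# print(dcase_inference(file="example.wav"))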


def create_app():
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # DCASE demo for automatic audio captioning
            """
        )
        gr.Interface(
            fn=dcase_inference,
            inputs=[
                gr.Audio(sources=["microphone"], type="filepath"),
                gr.Audio(sources=["upload"], type="filepath"),
            ],
            outputs="text",
        )
    return demo


def main():
    app = create_app()
    app.launch(debug=True)


if __name__ == "__main__":
    main()
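
# Serve locally with `python app_dcase.py`; Gradio binds to
# http://127.0.0.1:7860 by default (pass share=True to launch() for a
# temporary public link).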