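"""Gradio demo: automatic audio captioning with the DCASE2024 Task 6 baseline."""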
import gradio as gr
import librosa
import torch

from dcase24t6.nn.hub import baseline_pipeline

# Instantiate the baseline captioning pipeline once at startup, so every
# request reuses the same loaded model.
model = baseline_pipeline()
|
|
def dcase_inference(mic=None, file=None):
    # Both Audio components use type="filepath", so Gradio passes the path of
    # a temporary audio file, or None when the component was left empty. The
    # microphone input is therefore a path too and must be loaded, not used
    # directly as a waveform.
    if mic is not None:
        audio, sr = librosa.load(mic, sr=None)
        audio = torch.from_numpy(audio)
        gr.Info(f"Loaded microphone recording (sr={sr} Hz)")
    elif file is not None:
        audio, sr = librosa.load(file, sr=None)
        audio = torch.from_numpy(audio)
        gr.Info(f"Loaded file {file} (sr={sr} Hz)")
    else:
        return "You must provide either a microphone recording or an audio file."

    item = {"audio": audio, "sr": sr}
    outputs = model(item)
    # Keep the first candidate caption produced by the model.
    candidate = outputs["candidates"][0]
    return candidate
|
|
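# Example of calling the inference function directly, outside the UI
# (the file path is hypothetical):
#   caption = dcase_inference(file="some_audio.wav")
#   print(caption)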
|
def create_app():
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # DCASE demo for automatic audio captioning
            """
        )
        gr.Interface(
            fn=dcase_inference,
            inputs=[
                gr.Audio(sources=["microphone"], type="filepath"),
                gr.Audio(sources=["upload"], type="filepath"),
            ],
            outputs="text",
        )
    return demo
|
|
def main():
    app = create_app()
    # debug=True keeps the server in the foreground and prints errors to the
    # console, which is convenient while developing the demo.
    app.launch(debug=True)


if __name__ == "__main__":
    main()