"""Gradio demo for automatic audio captioning with Microsoft CLAP (clapcap)."""

import gradio as gr
from msclap import CLAP

# Load the captioning model once at import time; CPU-only inference.
clap_model = CLAP(version="clapcap", use_cuda=False)


def clap_inference(mic=None, file=None):
    """Generate a caption for a recorded or uploaded audio file.

    Args:
        mic: Filepath of a microphone recording (takes precedence when both
            inputs are supplied).
        file: Filepath of an uploaded audio file.

    Returns:
        The generated caption string, or an error message when neither
        input is provided.
    """
    if mic is not None:
        audio = mic
    elif file is not None:
        audio = file
    else:
        return "You must either provide a mic recording or a file"

    # Beam search with a near-zero temperature keeps captions
    # effectively deterministic.
    captions = clap_model.generate_caption(
        [audio], resample=True, beam_size=5, entry_length=67, temperature=0.01
    )
    return captions[0]


def create_app():
    """Build and return the Gradio Blocks app wrapping the caption UI."""
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # CLAP demo for automatic audio captioning
            """
        )
        gr.Interface(
            fn=clap_inference,
            inputs=[
                # The gr.Audio API documents `sources` as a list of
                # source names, so pass lists rather than bare strings.
                gr.Audio(sources=["microphone"], type="filepath"),
                gr.Audio(sources=["upload"], type="filepath"),
            ],
            outputs="text",
        )
    return demo


def main():
    """Entry point: create the app and serve it with debug logging."""
    app = create_app()
    app.launch(debug=True)


if __name__ == "__main__":
    main()