"""Gradio demo for automatic audio captioning with Microsoft CLAP (clapcap)."""

import gradio as gr
from msclap import CLAP

# Load the captioning model once at import time; CPU-only inference.
clap_model = CLAP(version="clapcap", use_cuda=False)


def clap_inference(mic=None, file=None):
    """Generate a caption for a recorded or uploaded audio file.

    Args:
        mic: Filepath of a microphone recording (takes precedence when both
            inputs are supplied).
        file: Filepath of an uploaded audio file.

    Returns:
        The generated caption string, or an error message when neither
        input is provided.
    """
    if mic is not None:
        audio = mic
    elif file is not None:
        audio = file
    else:
        return "You must either provide a mic recording or a file"

    # Beam search with a near-zero temperature keeps captions
    # effectively deterministic.
    captions = clap_model.generate_caption(
        [audio], resample=True, beam_size=5, entry_length=67, temperature=0.01
    )
    return captions[0]


def create_app():
    """Build and return the Gradio Blocks app wrapping the caption UI."""
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            # CLAP demo for automatic audio captioning
            """
        )
        gr.Interface(
            fn=clap_inference,
            inputs=[
                # The gr.Audio API documents `sources` as a list of
                # source names, so pass lists rather than bare strings.
                gr.Audio(sources=["microphone"], type="filepath"),
                gr.Audio(sources=["upload"], type="filepath"),
            ],
            outputs="text",
        )
    return demo


def main():
    """Entry point: create the app and serve it with debug logging."""
    app = create_app()
    app.launch(debug=True)


if __name__ == "__main__":
    main()