from transformers import BlipProcessor, BlipForQuestionAnswering
import torch
import gradio as gr
from PIL import Image

# Load the processor and model
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def vqa_answer(image, question):
    # Preprocess the image/question pair into model-ready tensors
    inputs = processor(image, question, return_tensors="pt").to(device)

    # Generate the answer (no gradients needed at inference time)
    with torch.no_grad():
        generated_ids = model.generate(**inputs)
    answer = processor.decode(generated_ids[0], skip_special_tokens=True)
    return answer


# Define the input components
image_input = gr.Image(type="pil", label="Upload an Image")
question_input = gr.Textbox(lines=1, placeholder="Enter your question here...", label="Question")

# Define the output component
answer_output = gr.Textbox(label="Answer")

# Create the interface
iface = gr.Interface(
    fn=vqa_answer,
    inputs=[image_input, question_input],
    outputs=answer_output,
    title="Visual Question Answering App",
    description="Ask a question about the uploaded image.",
    article="This app uses the BLIP model to answer questions about images.",
)

# Launch the app
if __name__ == "__main__":
    iface.launch(share=True)
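
# --- Usage sketch (not executed) ---
# A minimal example of calling vqa_answer directly, without the Gradio UI;
# handy as a quick smoke test that the model and processor load correctly.
# The file name "example.jpg" is a hypothetical placeholder for any local
# image on disk.
#
#   from PIL import Image
#   img = Image.open("example.jpg").convert("RGB")
#   print(vqa_answer(img, "How many people are in the photo?"))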