"""Gradio chat demo for the MiniCPM-V-4_5 multimodal model.

Supports multi-turn conversations mixing text and images. The chat history
is kept in Gradio's "tuples" format and converted to the message format
expected by ``model.chat`` on every request.
"""

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import spaces

# Initialize model and tokenizer once at import time (ZeroGPU moves the
# actual forward passes onto a GPU inside the @spaces.GPU-decorated function).
torch.manual_seed(100)
model = AutoModel.from_pretrained(
    'openbmb/MiniCPM-V-4_5',
    trust_remote_code=True,
    attn_implementation='sdpa',
    torch_dtype=torch.bfloat16,
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
    'openbmb/MiniCPM-V-4_5',
    trust_remote_code=True,
)


@spaces.GPU(duration=120)
def respond(message, history, enable_thinking):
    """Generate a model reply for *message* given the chat *history*.

    Args:
        message: Either a plain string, or a MultimodalTextbox dict with
            ``"text"`` and ``"files"`` keys.
        history: Chatbot history in tuples format: a list of
            ``(user_msg, assistant_msg)`` pairs, where ``user_msg`` is a
            string or a ``(file_path, text)`` pair for image turns.
        enable_thinking: Whether to enable the model's thinking mode.

    Returns:
        The model's answer string, or an ``"Error: ..."`` string on failure.
    """
    # Build conversation history in the format expected by the model.
    msgs = []
    for user_msg, assistant_msg in history:
        # NOTE: Gradio serializes tuple entries to JSON and hands them back
        # as *lists*, so accept both tuple and list here — checking only
        # `tuple` would drop images from history on later turns.
        if isinstance(user_msg, (tuple, list)):
            # Image turn: (file_path,) or (file_path, caption_text).
            img_path = user_msg[0]
            text = user_msg[1] if len(user_msg) > 1 else ""
            img = Image.open(img_path).convert('RGB')
            user_content = [img, text] if text else [img]
        else:
            # Text-only turn.
            user_content = [user_msg]
        msgs.append({"role": "user", "content": user_content})
        if assistant_msg:
            msgs.append({"role": "assistant", "content": [assistant_msg]})

    # Add the current message (images first, then any text).
    if isinstance(message, dict):
        current_content = []
        for file_path in message.get("files") or []:
            current_content.append(Image.open(file_path).convert('RGB'))
        if message.get("text"):
            current_content.append(message["text"])
    else:
        current_content = [message]
    msgs.append({"role": "user", "content": current_content})

    # Generate response; surface any model error in the chat instead of
    # crashing the event handler.
    try:
        answer = model.chat(
            msgs=msgs,
            tokenizer=tokenizer,
            enable_thinking=enable_thinking,
        )
        return answer
    except Exception as e:
        return f"Error: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="MiniCPM-V Chatbot") as demo:
    gr.Markdown(
        """
        # 🤖 MiniCPM-V Multimodal Chatbot
        Upload images and ask questions about them, or have a text conversation. 
        The model supports multi-turn conversations with context memory.
        """
    )

    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(
                height=500,
                show_label=False,
                container=True,
                type="tuples",
            )

            with gr.Row():
                msg = gr.MultimodalTextbox(
                    interactive=True,
                    file_types=["image"],
                    placeholder="Type a message or upload an image...",
                    show_label=False,
                    container=False,
                )

            with gr.Row():
                clear = gr.Button("🗑️ Clear", size="sm")
                submit = gr.Button("📤 Send", variant="primary", size="sm")

        with gr.Column(scale=1):
            gr.Markdown("### Settings")
            enable_thinking = gr.Checkbox(
                label="Enable Thinking Mode",
                value=False,
                info="Enable the model's thinking process",
            )

            gr.Markdown(
                """
                ### Examples
                - Upload an image and ask "What is in this picture?"
                - Ask "What are the main objects visible?"
                - Follow up with "What should I pay attention to here?"
                """
            )

    def user_submit(message, history, enable_thinking):
        """Handle a submission: update history, run the model, clear the box.

        Returns a ``(textbox_value, history)`` pair for the event outputs.
        """
        # Format the user message for chatbot display.
        if isinstance(message, dict) and message.get("files"):
            # With files present, use the tuple format the Chatbot renders
            # as an image (only the first file is shown; respond() still
            # receives every file via the raw `message`).
            user_msg = (message["files"][0], message.get("text", ""))
        else:
            user_msg = message.get("text", "") if isinstance(message, dict) else message

        # Append the pending user turn, then generate against the history
        # *excluding* it (respond() re-adds the current message itself).
        history = history + [(user_msg, None)]
        response = respond(message, history[:-1], enable_thinking)
        history[-1] = (history[-1][0], response)

        # Return None (not "") to clear the MultimodalTextbox — its value
        # is a dict, so a bare string is not a valid value for it.
        return None, history

    # Event handlers: Enter key and Send button share the same callback.
    msg.submit(
        user_submit,
        inputs=[msg, chatbot, enable_thinking],
        outputs=[msg, chatbot],
    )

    submit.click(
        user_submit,
        inputs=[msg, chatbot, enable_thinking],
        outputs=[msg, chatbot],
    )

    clear.click(
        lambda: (None, []),
        outputs=[msg, chatbot],
    )

if __name__ == "__main__":
    demo.launch(share=True)