# MiniCPM-V-4_5 / app.py
import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import spaces
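# `spaces` provides the @spaces.GPU decorator, which requests a GPU for the
# duration of a call when the app runs on a Hugging Face ZeroGPU Space.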
# Initialize the model and tokenizer
torch.manual_seed(100)
model = AutoModel.from_pretrained(
    'openbmb/MiniCPM-V-4_5',
    trust_remote_code=True,
    attn_implementation='sdpa',
    torch_dtype=torch.bfloat16
)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
    'openbmb/MiniCPM-V-4_5',
    trust_remote_code=True
)
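# For reference, a minimal single-turn call to MiniCPM-V's `model.chat`, mirroring
# the usage in `respond` below (assumes a local image file named 'example.jpg'):
#
#     img = Image.open('example.jpg').convert('RGB')
#     answer = model.chat(
#         msgs=[{"role": "user", "content": [img, "What is in this picture?"]}],
#         tokenizer=tokenizer,
#     )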
@spaces.GPU(duration=120)
def respond(message, history, enable_thinking):
    """
    Process the user message and generate a response.
    """
    # Build the conversation history in the format expected by the model
    msgs = []
    # Add previous conversation turns
    for user_msg, assistant_msg in history:
        # Parse the user message for images and text
        if isinstance(user_msg, tuple):
            # The user message contains an image path and optional text
            img_path, text = user_msg
            img = Image.open(img_path).convert('RGB')
            user_content = [img, text] if text else [img]
        else:
            # Text-only message
            user_content = [user_msg]
        msgs.append({"role": "user", "content": user_content})
        if assistant_msg:
            msgs.append({"role": "assistant", "content": [assistant_msg]})
    # Add the current message
    current_content = []
    if isinstance(message, dict):
        # Multimodal input: collect any uploaded images, then the text
        if message.get("files"):
            for file_path in message["files"]:
                img = Image.open(file_path).convert('RGB')
                current_content.append(img)
        if message.get("text"):
            current_content.append(message["text"])
    else:
        # Text-only input
        current_content = [message]
    msgs.append({"role": "user", "content": current_content})
    # Generate the response
    try:
        answer = model.chat(
            msgs=msgs,
            tokenizer=tokenizer,
            enable_thinking=enable_thinking
        )
        return answer
    except Exception as e:
        return f"Error: {str(e)}"
# Create the Gradio interface
with gr.Blocks(title="MiniCPM-V Chatbot") as demo:
    gr.Markdown(
        """
        # 🤖 MiniCPM-V Multimodal Chatbot
        Upload images and ask questions about them, or have a text conversation.
        The model supports multi-turn conversations with context memory.
        """
    )
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(
                height=500,
                show_label=False,
                container=True,
                type="tuples"
            )
            with gr.Row():
                msg = gr.MultimodalTextbox(
                    interactive=True,
                    file_types=["image"],
                    placeholder="Type a message or upload an image...",
                    show_label=False,
                    container=False
                )
            with gr.Row():
                clear = gr.Button("🗑️ Clear", size="sm")
                submit = gr.Button("📤 Send", variant="primary", size="sm")
        with gr.Column(scale=1):
            gr.Markdown("### Settings")
            enable_thinking = gr.Checkbox(
                label="Enable Thinking Mode",
                value=False,
                info="Enable the model's thinking process"
            )
            gr.Markdown(
                """
                ### Examples
                - Upload an image and ask "What is in this picture?"
                - Ask "What are the main objects visible?"
                - Follow up with "What should I pay attention to here?"
                """
            )
    # Handle message submission
    def user_submit(message, history, enable_thinking):
        # Format the user message for chatbot display
        if isinstance(message, dict) and message.get("files"):
            # With a file attached, use the (file_path, text) tuple format
            user_msg = (message["files"][0], message.get("text", ""))
        else:
            user_msg = message.get("text", "") if isinstance(message, dict) else message
        # Add the user message to the history
        history = history + [(user_msg, None)]
        # Generate a response from all turns before the new one
        response = respond(message, history[:-1], enable_thinking)
        # Fill in the assistant slot of the newest turn
        history[-1] = (history[-1][0], response)
        # Return None to clear the multimodal textbox, plus the updated history
        return None, history
    # Event handlers
    msg.submit(
        user_submit,
        inputs=[msg, chatbot, enable_thinking],
        outputs=[msg, chatbot]
    )
    submit.click(
        user_submit,
        inputs=[msg, chatbot, enable_thinking],
        outputs=[msg, chatbot]
    )
    clear.click(
        lambda: (None, []),
        outputs=[msg, chatbot]
    )

if __name__ == "__main__":
    demo.launch(share=True)