import base64
import mimetypes
import os
from pathlib import Path
from typing import Any, Dict, List

import gradio as gr
from openai import OpenAI

# Model served by the OpenAI-compatible backend; override via the DEFAULT_MODEL env var.
DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "LLaVA-OneVision-1.5-8B-Instruct")

# Client for the OpenAI-compatible endpoint configured through BASE_URL and API_KEY.
_client = OpenAI(
    base_url=os.getenv("BASE_URL", ""),
    api_key=os.getenv("API_KEY", ""),
)

def _data_url(path: str) -> str:
    """Encode a local image file as a base64 data URL."""
    mime, _ = mimetypes.guess_type(path)
    mime = mime or "application/octet-stream"
    data = base64.b64encode(Path(path).read_bytes()).decode("utf-8")
    return f"data:{mime};base64,{data}"

def _image_content(path: str) -> Dict[str, Any]:
    return {"type": "image_url", "image_url": {"url": _data_url(path)}}


def _text_content(text: str) -> Dict[str, Any]:
    return {"type": "text", "text": text}


def _message(role: str, content: Any) -> Dict[str, Any]:
    return {"role": role, "content": content}

def _build_user_message(message: Dict[str, Any]) -> Dict[str, Any]:
    """Convert a MultimodalTextbox submission into a single user chat message."""
    files = message.get("files") or []
    text = (message.get("text") or "").strip()
    content: List[Dict[str, Any]] = [_image_content(p) for p in files]
    if text:
        content.append(_text_content(text))
    return _message("user", content)
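
# Shape sketch: gr.MultimodalTextbox submits a dict like
#   {"text": "Describe this image", "files": ["/tmp/cat.png"]}
# (values here are hypothetical), which becomes one user message whose content
# mixes image_url parts and an optional trailing text part.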

def _convert_history(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert Gradio "messages"-style history into OpenAI chat messages.

    Consecutive user entries (text strings and file tuples) are merged into one
    user message; each assistant entry flushes the pending user content.
    """
    msgs: List[Dict[str, Any]] = []
    user_content: List[Dict[str, Any]] = []
    for turn in history or []:
        role, content = turn.get("role"), turn.get("content")
        if role == "user":
            if isinstance(content, str):
                user_content.append(_text_content(content))
            elif isinstance(content, tuple):
                # File attachments arrive in history as a tuple of local paths.
                user_content.extend(_image_content(path) for path in content if path)
        elif role == "assistant":
            msgs.append(_message("user", user_content.copy()))
            user_content.clear()
            msgs.append(_message("assistant", content))
    return msgs
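
# Mapping sketch (hypothetical values): a history such as
#   [{"role": "user", "content": ("/tmp/cat.png",)},
#    {"role": "user", "content": "What is this?"},
#    {"role": "assistant", "content": "A cat."}]
# becomes
#   [{"role": "user", "content": [<image_url part>, <text part>]},
#    {"role": "assistant", "content": "A cat."}]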

def stream_response(message: Dict[str, Any], history: List[Dict[str, Any]], model_name: str = DEFAULT_MODEL):
    """Stream the model reply for the latest message, yielding the growing text."""
    messages = _convert_history(history)
    messages.append(_build_user_message(message))
    try:
        stream = _client.chat.completions.create(
            model=model_name,
            messages=messages,
            temperature=0.000001,  # near-greedy decoding
            top_p=1,
            extra_body={
                "repetition_penalty": 1.05,
                "frequency_penalty": 0,
                "presence_penalty": 0,
            },
            stream=True,
        )
        partial = ""
        for chunk in stream:
            delta = chunk.choices[0].delta.content
            if delta:
                partial += delta
                yield partial
    except Exception as e:
        yield f"Failed to get response: {e}"

def build_demo() -> gr.Blocks:
    chatbot = gr.Chatbot(type="messages", allow_tags=["think"])
    textbox = gr.MultimodalTextbox(
        show_label=False,
        placeholder="Enter text, or upload one or more images...",
        file_types=["image"],
        file_count="multiple",  # allow several images, matching the placeholder and message handling
        max_plain_text_length=32768,
    )
    model_selector = gr.Dropdown(
        label="Model",
        choices=[
            ("LLaVA-OneVision-1.5-8B-Instruct", "LLaVA-OneVision-1.5-8B-Instruct"),
            ("LLaVA-OneVision-1.5-4B-Instruct", "LLaVA-OneVision-1.5-4B-Instruct"),
        ],
        value=DEFAULT_MODEL,
    )
    return gr.ChatInterface(
        fn=stream_response,
        type="messages",
        multimodal=True,
        chatbot=chatbot,
        textbox=textbox,
        title="LLaVA-OneVision-1.5: Fully Open Framework for Democratized Multimodal Training",
        description="""**LLaVA-OneVision-1.5** introduces a novel family of fully open-source Large Multimodal Models (LMMs) that achieves state-of-the-art performance at substantially lower cost through training on native-resolution images.

🔗 **Links**: [GitHub](https://github.com/EvolvingLMMs-Lab/LLaVA-OneVision-1.5) | [HuggingFace](https://huggingface.co/lmms-lab)""",
        additional_inputs=[model_selector],
        additional_inputs_accordion=gr.Accordion("Options", open=True),
    ).queue(default_concurrency_limit=8)

def main():
    build_demo().launch()


if __name__ == "__main__":
    main()
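
# Launch sketch (assumed filename app.py; the URL and key below are placeholder values
# for whatever OpenAI-compatible endpoint serves the model):
#   BASE_URL="http://localhost:8000/v1" API_KEY="EMPTY" python app.py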