"""Gradio chat demo for LLaVA-OneVision-1.5 served through an OpenAI-compatible API."""

import base64
import mimetypes
import os
from pathlib import Path
from typing import Any, Dict, List

import gradio as gr
from openai import OpenAI

DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "LLaVA-OneVision-1.5-8B-Instruct")

_client = OpenAI(
    base_url=os.getenv("BASE_URL", ""),
    api_key=os.getenv("API_KEY", ""),
)


def _data_url(path: str) -> str:
    """Encode a local file as a base64 data URL for the chat completions API."""
    mime, _ = mimetypes.guess_type(path)
    mime = mime or "application/octet-stream"
    data = base64.b64encode(Path(path).read_bytes()).decode("utf-8")
    return f"data:{mime};base64,{data}"


def _image_content(path: str) -> Dict[str, Any]:
    return {"type": "image_url", "image_url": {"url": _data_url(path)}}


def _text_content(text: str) -> Dict[str, Any]:
    return {"type": "text", "text": text}


def _message(role: str, content: Any) -> Dict[str, Any]:
    return {"role": role, "content": content}


def _build_user_message(message: Dict[str, Any]) -> Dict[str, Any]:
    """Turn the current MultimodalTextbox payload into an OpenAI user message."""
    files = message.get("files") or []
    text = (message.get("text") or "").strip()
    content: List[Dict[str, Any]] = [_image_content(p) for p in files]
    if text:
        content.append(_text_content(text))
    return _message("user", content)


def _convert_history(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert Gradio messages-format history into OpenAI chat messages.

    Gradio stores each uploaded file as its own user turn whose content is a
    tuple of file paths; consecutive user turns are merged into one multimodal
    user message before each assistant reply.
    """
    msgs: List[Dict[str, Any]] = []
    user_content: List[Dict[str, Any]] = []
    for turn in history or []:
        role, content = turn.get("role"), turn.get("content")
        if role == "user":
            if isinstance(content, str):
                user_content.append(_text_content(content))
            elif isinstance(content, tuple):  # file upload(s)
                user_content.extend(_image_content(path) for path in content if path)
        elif role == "assistant":
            if user_content:  # avoid emitting an empty user message
                msgs.append(_message("user", user_content.copy()))
                user_content.clear()
            msgs.append(_message("assistant", content))
    return msgs


def stream_response(
    message: Dict[str, Any],
    history: List[Dict[str, Any]],
    model_name: str = DEFAULT_MODEL,
):
    messages = _convert_history(history)
    messages.append(_build_user_message(message))
    try:
        stream = _client.chat.completions.create(
            model=model_name,
            messages=messages,
            temperature=0.000001,  # near-zero: effectively greedy decoding
            top_p=1,
            extra_body={
                "repetition_penalty": 1.05,
                "frequency_penalty": 0,
                "presence_penalty": 0,
            },
            stream=True,
        )
        partial = ""
        for chunk in stream:
            if not chunk.choices:  # some backends send keep-alive chunks
                continue
            delta = chunk.choices[0].delta.content
            if delta:
                partial += delta
                yield partial
    except Exception as e:
        yield f"Failed to get response: {e}"


def build_demo() -> gr.Blocks:
    chatbot = gr.Chatbot(type="messages", allow_tags=["think"])
    textbox = gr.MultimodalTextbox(
        show_label=False,
        placeholder="Enter text, or upload one or more images...",
        file_types=["image"],
        file_count="multiple",  # the history converter already supports several images per turn
        max_plain_text_length=32768,
    )
    model_selector = gr.Dropdown(
        label="Model",
        choices=[
            ("LLaVA-OneVision-1.5-8B-Instruct", "LLaVA-OneVision-1.5-8B-Instruct"),
            ("LLaVA-OneVision-1.5-4B-Instruct", "LLaVA-OneVision-1.5-4B-Instruct"),
        ],
        value=DEFAULT_MODEL,
    )
    return gr.ChatInterface(
        fn=stream_response,
        type="messages",
        multimodal=True,
        chatbot=chatbot,
        textbox=textbox,
        title="LLaVA-OneVision-1.5: Fully Open Framework for Democratized Multimodal Training",
        description="""**LLaVA-OneVision-1.5** introduces a novel family of fully open-source Large Multimodal Models (LMMs) that achieves state-of-the-art performance at substantially lower cost through training on native-resolution images.
🔗 **Links**: [GitHub](https://github.com/EvolvingLMMs-Lab/LLaVA-OneVision-1.5) | [HuggingFace](https://huggingface.co/lmms-lab)""",
        additional_inputs=[model_selector],
        additional_inputs_accordion=gr.Accordion("Options", open=True),
    ).queue(default_concurrency_limit=8)


def main():
    build_demo().launch()


if __name__ == "__main__":
    main()
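# A minimal launch sketch. The serve command, port, model path, and filename
# below are assumptions (any OpenAI-compatible server should work), not part
# of this script:
#
#   vllm serve lmms-lab/LLaVA-OneVision-1.5-8B-Instruct --port 8000
#   BASE_URL=http://localhost:8000/v1 API_KEY=EMPTY python app.py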