import os
import threading
from typing import Any, Dict, Iterable, List, Union

import gradio as gr
from huggingface_hub import hf_hub_download

from llama_cpp import Llama

# -----------------------------
# Model (HF GGUF)
# -----------------------------
MODEL_REPO_ID = os.getenv("MODEL_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
MODEL_FILENAME = os.getenv("MODEL_FILENAME", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
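# Other llama.cpp-compatible GGUF models can be swapped in via these env vars
# (adjust chat_format below if the model does not use ChatML).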

SYSTEM_PROMPT = os.getenv(
    "SYSTEM_PROMPT",
    "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
)

# Keep the context window modest on free CPU hardware (the KV cache grows with context length).
N_CTX = int(os.getenv("N_CTX", "4096"))

# Generation defaults
TEMPERATURE = float(os.getenv("TEMPERATURE", "0.7"))
TOP_P = float(os.getenv("TOP_P", "0.9"))
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))

# -----------------------------
# Lazy singleton model loader
# -----------------------------
_llm: Llama | None = None
_llm_lock = threading.Lock()


def _load_llm() -> Llama:
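    """Load the GGUF model once and reuse it across requests (double-checked locking)."""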
    global _llm
    if _llm is not None:
        return _llm

    with _llm_lock:
        if _llm is not None:
            return _llm

        model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)

        # Qwen instruct GGUFs commonly use ChatML-style formatting.
        _llm = Llama(
            model_path=model_path,
            n_ctx=N_CTX,
            n_threads=os.cpu_count() or 4,
            n_gpu_layers=0,
            chat_format="chatml",
            verbose=False,
        )
        return _llm


# -----------------------------
# Gradio message normalization
# -----------------------------
Content = Union[str, List[Any], Dict[str, Any]]


def _content_to_text(content: Content) -> str:
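    """Flatten Gradio message content (plain string, list of parts, or dict) into text."""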
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts: List[str] = []
        for item in content:
            if isinstance(item, str):
                parts.append(item)
            elif isinstance(item, dict) and item.get("type") == "text":
                parts.append(str(item.get("text", "")))
        return "".join(parts).strip()
    if isinstance(content, dict):
        for k in ("text", "content"):
            v = content.get(k)
            if isinstance(v, str):
                return v
    return str(content)


def _history_to_messages(history: Any) -> List[Dict[str, str]]:
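    """Convert Gradio chat history (tuple pairs or role/content dicts) into OpenAI-style messages."""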
    if not history:
        return []

    msgs: List[Dict[str, str]] = []

    # Old format: list[(user, assistant), ...]
    if isinstance(history, list) and history and isinstance(history[0], (tuple, list)) and len(history[0]) == 2:
        for user, assistant in history:
            if user:
                msgs.append({"role": "user", "content": str(user)})
            if assistant:
                msgs.append({"role": "assistant", "content": str(assistant)})
        return msgs

    # Newer format: list[{"role": "...", "content": ...}, ...]
    if isinstance(history, list) and history and isinstance(history[0], dict):
        for m in history:
            role = m.get("role")
            if role not in ("user", "assistant", "system"):
                continue
            text = _content_to_text(m.get("content", ""))
            if text:
                msgs.append({"role": role, "content": text})
        return msgs

    return []


def _stream_chat(llm: Llama, messages: List[Dict[str, str]]) -> Iterable[str]:
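    """Stream a chat completion, yielding the accumulated reply after each new token."""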
    # llama-cpp-python yields OpenAI-like streaming chunks.
    stream = llm.create_chat_completion(
        messages=messages,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        max_tokens=MAX_TOKENS,
        stream=True,
    )

    partial = ""
    for chunk in stream:
        token = ""
        try:
            choice = chunk["choices"][0]
            delta = choice.get("delta") or {}
            token = delta.get("content") or ""
        except Exception:
            token = ""
        if token:
            partial += token
            yield partial


def respond(message: str, history: Any):
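    """Gradio ChatInterface handler: yields progressively longer strings so the UI streams the reply."""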
    llm = _load_llm()

    msgs: List[Dict[str, str]] = [{"role": "system", "content": SYSTEM_PROMPT}]
    prior = _history_to_messages(history)

    # Simple history trim: keep only the most recent 20 messages to bound prompt size.
    if len(prior) > 20:
        prior = prior[-20:]

    msgs.extend(prior)
    msgs.append({"role": "user", "content": message})

    for partial in _stream_chat(llm, msgs):
        yield partial


demo = gr.ChatInterface(
    fn=respond,
    title="GGUF Chatbot (llama-cpp-python)",
)
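
# Streaming via a generator relies on Gradio's queue; recent Gradio releases enable it
# by default, but on older versions you may need to call demo.queue() before launch().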

if __name__ == "__main__":
    demo.launch()