import os
import threading
from typing import Any, Dict, Iterable, List, Union
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# -----------------------------
# Model (HF GGUF)
# -----------------------------
MODEL_REPO_ID = os.getenv("MODEL_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
MODEL_FILENAME = os.getenv("MODEL_FILENAME", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
SYSTEM_PROMPT = os.getenv(
    "SYSTEM_PROMPT",
    "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
)
# Keep modest on free CPU (KV cache grows with context).
N_CTX = int(os.getenv("N_CTX", "4096"))
# Generation defaults
TEMPERATURE = float(os.getenv("TEMPERATURE", "0.7"))
TOP_P = float(os.getenv("TOP_P", "0.9"))
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))
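
# All of the above can be overridden at launch via environment variables, e.g.
# (hypothetical values, shown only as an illustration):
#   MODEL_REPO_ID=Qwen/Qwen2.5-1.5B-Instruct-GGUF \
#   MODEL_FILENAME=qwen2.5-1.5b-instruct-q4_k_m.gguf \
#   MAX_TOKENS=1024 python app.py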
# -----------------------------
# Lazy singleton model loader
# -----------------------------
_llm: Llama | None = None
_llm_lock = threading.Lock()
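
# Double-checked locking: the first check is a lock-free fast path once the model is
# loaded; the second check inside the lock ensures only one thread builds the Llama.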
def _load_llm() -> Llama:
    global _llm
    if _llm is not None:
        return _llm
    with _llm_lock:
        if _llm is not None:
            return _llm
        model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)
        # Qwen instruct GGUFs commonly use ChatML-style formatting.
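        # ChatML wraps each turn roughly as:
        #   <|im_start|>{role}\n{content}<|im_end|>
        # llama-cpp-python applies this template for us via chat_format="chatml".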
        _llm = Llama(
            model_path=model_path,
            n_ctx=N_CTX,
            n_threads=os.cpu_count() or 4,
            n_gpu_layers=0,
            chat_format="chatml",
            verbose=False,
        )
    return _llm
# -----------------------------
# Gradio message normalization
# -----------------------------
Content = Union[str, List[Any], Dict[str, Any]]
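
# Gradio may pass message content as a plain string, a list of parts, or a dict.
# Illustrative examples of the normalization implemented below:
#   _content_to_text([{"type": "text", "text": "Hi"}]) -> "Hi"
#   _content_to_text({"text": "Hi"}) -> "Hi"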
def _content_to_text(content: Content) -> str:
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts: List[str] = []
        for item in content:
            if isinstance(item, str):
                parts.append(item)
            elif isinstance(item, dict) and item.get("type") == "text":
                parts.append(str(item.get("text", "")))
        return "".join(parts).strip()
    if isinstance(content, dict):
        for k in ("text", "content"):
            v = content.get(k)
            if isinstance(v, str):
                return v
    return str(content)
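
# Gradio's chat history arrives either as (user, assistant) tuples or as role/content
# dicts; both shapes normalize to the same OpenAI-style message list, e.g.
#   [("Hi", "Hello!")] -> [{"role": "user", "content": "Hi"},
#                          {"role": "assistant", "content": "Hello!"}]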
def _history_to_messages(history: Any) -> List[Dict[str, str]]:
    if not history:
        return []
    msgs: List[Dict[str, str]] = []
    # Old format: list[(user, assistant), ...]
    if isinstance(history, list) and history and isinstance(history[0], (tuple, list)) and len(history[0]) == 2:
        for user, assistant in history:
            if user:
                msgs.append({"role": "user", "content": str(user)})
            if assistant:
                msgs.append({"role": "assistant", "content": str(assistant)})
        return msgs
    # Newer format: list[{"role": "...", "content": ...}, ...]
    if isinstance(history, list) and history and isinstance(history[0], dict):
        for m in history:
            role = m.get("role")
            if role not in ("user", "assistant", "system"):
                continue
            text = _content_to_text(m.get("content", ""))
            if text:
                msgs.append({"role": role, "content": text})
        return msgs
    return []
def _stream_chat(llm: Llama, messages: List[Dict[str, str]]) -> Iterable[str]:
    # llama-cpp-python yields OpenAI-like streaming chunks.
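    # Each chunk is shaped roughly like (exact fields may vary between versions):
    #   {"choices": [{"index": 0, "delta": {"content": "..."}, "finish_reason": None}]}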
    stream = llm.create_chat_completion(
        messages=messages,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        max_tokens=MAX_TOKENS,
        stream=True,
    )
    partial = ""
    for chunk in stream:
        token = ""
        try:
            choice = chunk["choices"][0]
            delta = choice.get("delta") or {}
            token = delta.get("content") or ""
        except Exception:
            token = ""
        if token:
            partial += token
            yield partial
def respond(message: str, history: Any):
    llm = _load_llm()
    msgs: List[Dict[str, str]] = [{"role": "system", "content": SYSTEM_PROMPT}]
    prior = _history_to_messages(history)
    # Simple history trim (by message count, not tokens): keep the last 20 messages.
    if len(prior) > 20:
        prior = prior[-20:]
    msgs.extend(prior)
    msgs.append({"role": "user", "content": message})
    for partial in _stream_chat(llm, msgs):
        yield partial
demo = gr.ChatInterface(
    fn=respond,
    title="GGUF Chatbot (llama-cpp-python)",
)
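
# Note: recent Gradio releases also accept gr.ChatInterface(..., type="messages"),
# which delivers history as role/content dicts; _history_to_messages handles either shape.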
if __name__ == "__main__":
    demo.launch()
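
# Local run (assuming the usual dependencies are installed):
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py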