# app.py — Gradio UI + llama-server wrapper for Qwen3-Embedding-0.6B GGUF
import os
import re
import subprocess
import time

import requests
import numpy as np
import gradio as gr
from huggingface_hub import hf_hub_download
# --- Model parameters (specific file names; replace if needed)
REPO = "Qwen/Qwen3-Embedding-0.6B-GGUF"
FNAME = "Qwen3-Embedding-0.6B-Q8_0.gguf"
LOCAL_MODEL_PATH = os.path.join(os.getcwd(), FNAME)
# --- llama-server binary (from the built llama.cpp tree)
LLAMA_SERVER_BIN = os.path.join("llama.cpp", "build", "bin", "llama-server")
LLAMA_LOG_PATH = os.path.join(os.getcwd(), "llama-server.log")  # server stdout/stderr is written here
LLAMA_PORT = 8080
LLAMA_HOST = "127.0.0.1"
LLAMA_URL = f"http://{LLAMA_HOST}:{LLAMA_PORT}"
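# For reference, the wrapper talks to llama-server over its OpenAI-compatible HTTP API.
# Illustrative request (assumes the server is already running with --embedding):
#   curl http://127.0.0.1:8080/v1/embeddings \
#        -H "Content-Type: application/json" \
#        -d '{"input": ["hello world"]}'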
def download_model():
    # Reuse an existing file; the size check is a sanity guard against a truncated download
    # (the Q8_0 file is several hundred MB).
    if os.path.exists(LOCAL_MODEL_PATH) and os.path.getsize(LOCAL_MODEL_PATH) > 100_000_000:
        print("Model already exists:", LOCAL_MODEL_PATH)
        return LOCAL_MODEL_PATH
    print("Downloading model (this can take a while)...")
    path = hf_hub_download(repo_id=REPO, filename=FNAME, local_dir=".", resume_download=True)
    print("Downloaded to:", path)
    return path
def build_llama_if_needed():
    # Skip if the binary already exists and is executable
    if os.path.exists(LLAMA_SERVER_BIN) and os.access(LLAMA_SERVER_BIN, os.X_OK):
        print("llama-server already built:", LLAMA_SERVER_BIN)
        return
    # Otherwise run the build script (expected to be in the repo)
    print("Building llama.cpp (may take many minutes)...")
    res = subprocess.run(["bash", "build_llama.sh"], check=False)
    if res.returncode != 0:
        print("Build failed with code", res.returncode)
        raise SystemExit("Failed to build llama.cpp")
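# build_llama.sh is expected to produce llama.cpp/build/bin/llama-server. A typical script
# (illustrative sketch, not necessarily the one in this repo) would do roughly:
#   git clone https://github.com/ggml-org/llama.cpp
#   cmake -S llama.cpp -B llama.cpp/build
#   cmake --build llama.cpp/build --config Release -j --target llama-server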
def start_llama_server(model_path):
    cmd = [
        LLAMA_SERVER_BIN,
        "-m", model_path,
        "--embedding",
        "--pooling", "last",
        "--host", LLAMA_HOST,
        "--port", str(LLAMA_PORT),
        "--verbose"
    ]
    print("Starting llama-server:", " ".join(cmd))
    # Redirect server output to a log file: with --verbose the server logs heavily,
    # and an undrained stdout/stderr pipe could fill up and block the server process.
    log_file = open(LLAMA_LOG_PATH, "w")
    proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, text=True)
    return proc
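# Flag notes: --embedding enables the embeddings endpoint, and --pooling last selects
# last-token pooling, which matches the pooling the Qwen3-Embedding models are documented to use.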
def wait_server_ready(timeout=180):
    start = time.time()
    while time.time() - start < timeout:
        try:
            r = requests.get(LLAMA_URL + "/v1/models", timeout=3)
            if r.status_code == 200:
                print("Server ready")
                return True
        except Exception:
            pass
        time.sleep(1)
    return False
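# Note: recent llama-server builds also expose a /health endpoint that reports 503 while the
# model is still loading; probing /v1/models for a 200 serves the same readiness purpose here.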
def get_embeddings_from_server(texts):
    url = LLAMA_URL + "/v1/embeddings"
    payload = {"input": texts}
    headers = {"Content-Type": "application/json"}
    r = requests.post(url, json=payload, headers=headers, timeout=120)
    if r.status_code != 200:
        raise RuntimeError(f"Embeddings request failed: {r.status_code} {r.text}")
    data = r.json()
    # OpenAI-like response parsing
    if "data" in data and len(data["data"]) >= 1 and "embedding" in data["data"][0]:
        embeddings = [np.array(item["embedding"], dtype=np.float32) for item in data["data"]]
        return embeddings
    if "embedding" in data:
        return [np.array(data["embedding"], dtype=np.float32)]
    raise RuntimeError("Unexpected embeddings response: " + str(data))
def cosine(a, b):
    # safe cosine
    na = np.linalg.norm(a)
    nb = np.linalg.norm(b)
    if na == 0 or nb == 0:
        return 0.0
    return float(np.dot(a, b) / (na * nb))
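# Example: cosine(np.array([1.0, 0.0]), np.array([1.0, 0.0])) == 1.0, orthogonal vectors give 0.0;
# zero-length inputs return 0.0 instead of dividing by zero.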
# --- Startup sequence: download -> build if needed -> start server
model_path = download_model()
build_llama_if_needed()
proc = start_llama_server(model_path)
if not wait_server_ready(timeout=240):
    # Show the head of the server log for diagnostics (stdout/stderr were redirected there)
    log_head = ""
    try:
        with open(LLAMA_LOG_PATH, "r") as f:
            log_head = f.read()[:2000]
    except Exception:
        pass
    raise SystemExit("llama-server did not become ready in time. Log head:\n" + log_head)
# --- Boolean patterns (simple keyword check) ---
def _norm_words(text):
    text = text.lower()
    text = re.sub(r"[^0-9a-zа-яё\s]", " ", text, flags=re.I)
    words = [w for w in text.split() if len(w) > 1]
    return words
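# Example: _norm_words("Встреча Трампа и Путина!") -> ["встреча", "трампа", "путина"]
# (lowercased, punctuation stripped, single-character tokens such as "и" dropped).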
def match_boolean_pattern(pattern: str, message: str) -> bool:
    msg_words = set(_norm_words(message))
    pat = pattern.strip()
    def check_or_group(group_text):
        # True if any OR-alternative occurs in the message as an exact word
        parts = [p.strip() for p in re.split(r"\bOR\b", group_text, flags=re.I)]
        for p in parts:
            if p.lower() in msg_words:
                return True
        return False
    remaining = pat
    and_conditions = []
    # Parenthesised groups are treated as OR-groups, e.g. "(trump OR трамп)"
    for m in re.finditer(r"\((.*?)\)", pat):
        group = m.group(1)
        and_conditions.append(("or_group", group))
        remaining = remaining.replace(m.group(0), " ")
    # Whatever is left is split on AND into single words or bare OR-groups
    top_tokens = re.split(r"\bAND\b", remaining, flags=re.I)
    for t in top_tokens:
        t = t.strip()
        if not t:
            continue
        if re.search(r"\bOR\b", t, flags=re.I):
            and_conditions.append(("or_group", t))
        else:
            w = t.split()[0].strip()
            if w:
                and_conditions.append(("word", w))
    # Every AND-condition must hold
    for typ, val in and_conditions:
        if typ == "or_group":
            if not check_or_group(val):
                return False
        else:
            if val.lower() not in msg_words:
                return False
    return True
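# Example: match_boolean_pattern("(trump OR трамп) AND путин", "Трамп и Путин встретились")
# returns True, while the same pattern against "Байден выступал в Давосе" returns False.
# Matching is on exact word forms (no stemming), so "Трампа" would not match the word "трамп".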
# --- Gradio UI ---
def similarity_ui(pattern, message, use_boolean=False, show_raw=False):
    if use_boolean:
        ok = match_boolean_pattern(pattern, message)
        if not ok:
            return "Boolean check: FAILED (no keyword match)"
    emb_list = get_embeddings_from_server([pattern, message])
    s = cosine(emb_list[0], emb_list[1])
    if show_raw:
        return f"cosine={s:.4f}\n\npattern_emb(first10)={emb_list[0][:10].tolist()}\nmessage_emb(first10)={emb_list[1][:10].tolist()}"
    return f"{s:.4f}"
def search_ui(query, docs_text, topk):
    docs = [d.strip() for d in docs_text.splitlines() if d.strip()]
    if not docs:
        return "Empty corpus"
    # One batch: all documents plus the query as the last element
    embs = get_embeddings_from_server(docs + [query])
    D = np.stack(embs[:-1])
    q = embs[-1]
    # Cosine similarity of each document row against the query
    scores = (D @ q) / (np.linalg.norm(D, axis=1) * np.linalg.norm(q))
    order = np.argsort(scores)[::-1][:int(topk)]
    out_lines = []
    for rank, idx in enumerate(order, start=1):
        out_lines.append(f"{rank}. score={scores[idx]:.4f}\n{docs[idx]}")
    return "\n\n".join(out_lines)
demo = gr.Blocks()
with demo:
gr.Markdown("# Qwen3-Embedding-0.6B GGUF — тест паттерн ↔ сообщение")
with gr.Tab("Сходство (cosine)"):
p = gr.Textbox(label="Паттерн", value="Meeting between Trump and Putin")
m = gr.Textbox(label="Сообщение", value="Встреча Трампа и Путина прошла в Женеве.")
use_bool = gr.Checkbox(label="Boolean pattern match (быстрая фильтрация)", value=False)
show_raw = gr.Checkbox(label="Показать первые значения embedding (debug)", value=False)
btn = gr.Button("Сравнить")
out = gr.Textbox(label="Результат (cosine или debug)", interactive=False, lines=6)
btn.click(similarity_ui, inputs=[p, m, use_bool, show_raw], outputs=out)
with gr.Tab("Семантический поиск"):
q = gr.Textbox(label="Запрос", value="саммит Трамп Путин")
corpus = gr.Textbox(label="Корпус (по строкам)", lines=12, value=(
"Встреча президентов России и США прошла в Женеве.\n"
"Лукашенко провёл переговоры с Евросоюзом.\n"
"Джо Байден выступал в Давосе.\n"
"Футбольный чемпионат прошёл на стадионе."
))
k = gr.Number(label="Top-K", value=3, precision=0)
btn2 = gr.Button("Найти")
out2 = gr.Textbox(label="Результаты", lines=12)
btn2.click(search_ui, inputs=[q, corpus, k], outputs=out2)
demo.launch(server_name="0.0.0.0", server_port=7860)