"""RAG-as-a-Service demo: in-memory FAISS + MiniLM embeddings behind a Gradio UI."""

import gradio as gr
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# --- minimal core (in-memory only) ---
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
_model = SentenceTransformer(MODEL_NAME)
# Probe the encoder once to discover the embedding dimensionality (384 for MiniLM-L6).
_dim = int(_model.encode(["_probe_"], convert_to_numpy=True).shape[1])
_index = faiss.IndexFlatIP(_dim)  # cosine similarity via L2-normalized inner product
_ids, _texts, _metas = [], [], []


def _normalize(v: np.ndarray) -> np.ndarray:
    """Return *v* with each row L2-normalized, as float32.

    The epsilon guards against division by zero for all-zero rows.
    """
    n = np.linalg.norm(v, axis=1, keepdims=True) + 1e-12
    return (v / n).astype("float32")


def _chunk(text: str, size: int, overlap: int):
    """Split whitespace-collapsed *text* into overlapping character windows.

    Returns a list of ``(chunk_text, start, end)`` tuples. ``overlap`` is
    clamped below ``size`` so the scan always advances (no infinite loop).
    """
    t = " ".join((text or "").split())
    n = len(t)
    s = 0
    out = []
    if overlap >= size:
        overlap = max(size - 1, 0)
    while s < n:
        e = min(s + size, n)
        out.append((t[s:e], s, e))
        if e == n:
            break
        s = max(e - overlap, 0)
    return out


def reset():
    """Drop all vectors and metadata, replacing the FAISS index with a fresh one."""
    global _index, _ids, _texts, _metas
    _index = faiss.IndexFlatIP(_dim)
    _ids, _texts, _metas = [], [], []
    return gr.update(value="Index reset."), gr.update(value=0)


def load_sample():
    """Return two sample documents, one per line, for the ingest textbox."""
    docs = [
        ("a", "PySpark scales ETL across clusters.", {"tag": "spark"}),
        ("b", "FAISS powers fast vector similarity search used in retrieval.", {"tag": "faiss"}),
    ]
    return "\n".join([d[1] for d in docs])


def ingest(docs_text, size, overlap):
    """Chunk, embed, and index the documents in *docs_text* (one document per line).

    Returns a status message and the new total number of indexed chunks.
    """
    if not docs_text.strip():
        return "Provide at least one line of text.", len(_ids)
    # Gradio sliders may deliver floats; _chunk uses these as slice indices,
    # so coerce to int before chunking.
    size, overlap = int(size), int(overlap)
    # one document per line
    lines = [ln.strip() for ln in docs_text.splitlines() if ln.strip()]
    rows = []
    for i, ln in enumerate(lines):
        pid = f"doc-{len(_ids)}-{i}"
        for ctext, s, e in _chunk(ln, size, overlap):
            rows.append(
                (f"{pid}::offset:{s}-{e}", ctext, {"parent_id": pid, "start": s, "end": e})
            )
    if not rows:
        return "No chunks produced.", len(_ids)
    vecs = _normalize(_model.encode([r[1] for r in rows], convert_to_numpy=True))
    _index.add(vecs)
    for rid, txt, meta in rows:
        _ids.append(rid)
        _texts.append(txt)
        _metas.append(meta)
    return f"Ingested docs={len(lines)} chunks={len(rows)}", len(_ids)


def answer(q, k, max_context_chars):
    """Embed *q*, retrieve the top-*k* chunks, and build a short answer.

    ``max_context_chars`` caps how much retrieved text is quoted in the
    answer (it was previously accepted but ignored — bug fix).
    Returns ``{"answer": str, "matches": [...]}``.
    """
    if _index.ntotal == 0:
        return {"answer": "Index is empty. Ingest first.", "matches": []}
    qv = _normalize(_model.encode([q], convert_to_numpy=True))
    D, I = _index.search(qv, int(k))
    matches = []
    for i, s in zip(I[0].tolist(), D[0].tolist()):
        if i < 0:  # FAISS pads with -1 when fewer than k vectors exist
            continue
        matches.append(
            {"id": _ids[i], "score": float(s), "text": _texts[i], "meta": _metas[i]}
        )
    if not matches:
        out = "No relevant context."
    else:
        # Only the top match is quoted, truncated to the requested budget.
        top = matches[0]["text"][: int(max_context_chars)]
        out = f"Based on retrieved context:\n- {top}"
    return {"answer": out, "matches": matches}


with gr.Blocks(title="RAG-as-a-Service") as demo:
    gr.Markdown(
        "### RAG-as-a-Service - Gradio\nIn-memory FAISS + MiniLM\n; one-line-per-doc ingest\n; quick answers."
    )
    with gr.Row():
        with gr.Column():
            docs = gr.Textbox(
                label="Documents (one per line)",
                lines=6,
                placeholder="One document per line…",
            )
            with gr.Row():
                chunk_size = gr.Slider(64, 1024, value=256, step=16, label="Chunk size")
                overlap = gr.Slider(0, 256, value=32, step=8, label="Overlap")
            with gr.Row():
                ingest_btn = gr.Button("Ingest")
                sample_btn = gr.Button("Load sample")
                reset_btn = gr.Button("Reset")
            ingest_status = gr.Textbox(label="Ingest status", interactive=False)
            index_size = gr.Number(label="Index size", interactive=False, value=0)
        with gr.Column():
            q = gr.Textbox(label="Query", placeholder="Ask something...")
            k = gr.Slider(1, 10, value=5, step=1, label="Top-K")
            max_chars = gr.Slider(200, 4000, value=1000, step=100, label="Max context chars")
            run = gr.Button("Answer")
            out = gr.JSON(label="Answer + matches")

    ingest_btn.click(
        ingest,
        [docs, chunk_size, overlap],
        [ingest_status, index_size],
        api_name="ingest",  # exposes POST /api/ingest
    )
    sample_btn.click(load_sample, None, docs)
    reset_btn.click(
        reset,
        None,
        [ingest_status, index_size],
        api_name="reset",  # exposes POST /api/reset (optional)
    )
    run.click(
        answer,
        [q, k, max_chars],
        out,
        api_name="answer",  # exposes POST /api/answer
    )

if __name__ == "__main__":
    demo.launch(share=True)