# ✅ app.py — Hugging Face Space Version (Finalized)
# RAG over local PDFs/DOCX using Hugging Face-hosted models with Chroma

import os
import json
import fitz  # PyMuPDF
import nltk
import chromadb
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import pytesseract
from PIL import Image
import io
import docx2txt
import gradio as gr

# ---------------------------
# ✅ Configuration
# ---------------------------
MANUALS_DIR = "./Manuals"  # Folder containing all PDF and DOCX files
CHROMA_PATH = "./chroma_store"
CHUNKS_PATH = "chunks.jsonl"
COLLECTION_NAME = "manual_chunks"
MAX_CONTEXT_CHUNKS = 3
CHUNK_SIZE = 750      # target chunk size, in words
CHUNK_OVERLAP = 100   # overlap between consecutive chunks, in words
HF_TOKEN = os.environ.get("HF_TOKEN")

LLM_MODELS = {
    "LLaMA 3.1 8B": "meta-llama/Llama-3.1-8B-Instruct",
    "LLaMA 3 8B": "meta-llama/Llama-3-8B-Instruct",
    "LLaMA 4 Scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "Mistral": "mistralai/Mistral-7B-Instruct-v0.3",
    "Gemma": "google/gemma-1.1-7b-it",
    "Qwen 3 30B": "Qwen/Qwen3-30B-A3B",
}

# ---------------------------
# ✅ Setup
# ---------------------------
nltk.download("punkt")
nltk.download("punkt_tab")  # sent_tokenize needs this on newer NLTK releases
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = None

# ---------------------------
# 📄 Load all PDFs and DOCX content
# ---------------------------
def extract_all_documents():
    """Return (filename, page_number, text) tuples for every PDF/DOCX in MANUALS_DIR."""
    chunks = []
    for fname in os.listdir(MANUALS_DIR):
        path = os.path.join(MANUALS_DIR, fname)
        if fname.lower().endswith(".pdf"):
            doc = fitz.open(path)
            for i, page in enumerate(doc):
                text = page.get_text().strip()
                if not text:
                    # Fall back to OCR for scanned pages with no text layer
                    pix = page.get_pixmap(dpi=300)
                    img = Image.open(io.BytesIO(pix.tobytes("png")))
                    text = pytesseract.image_to_string(img)
                if text.strip():
                    chunks.append((fname, i + 1, text.strip()))
            doc.close()
        elif fname.lower().endswith(".docx"):
            text = docx2txt.process(path)
            if text.strip():
                chunks.append((fname, 1, text.strip()))
    return chunks

# ---------------------------
# ✂️ Chunk text
# ---------------------------
def split_chunks(text, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split text into sentence-aligned chunks of ~`size` words with ~`overlap` words of overlap."""
    sentences = sent_tokenize(text)
    chunks, curr, curr_len = [], [], 0
    for sent in sentences:
        tok_len = len(sent.split())
        if curr and curr_len + tok_len > size:
            chunks.append(" ".join(curr))
            # Carry roughly `overlap` words (not sentences) into the next chunk
            carried, carried_len = [], 0
            for prev in reversed(curr):
                carried.insert(0, prev)
                carried_len += len(prev.split())
                if carried_len >= overlap:
                    break
            curr, curr_len = carried, carried_len
        curr.append(sent)
        curr_len += tok_len
    if curr:
        chunks.append(" ".join(curr))
    return chunks

# ---------------------------
# 💾 Embed into Chroma
# ---------------------------
def embed_documents():
    global collection
    # Drop any existing collection, including one persisted from a previous run
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        pass
    collection = client.create_collection(COLLECTION_NAME)

    docs = extract_all_documents()
    records = []
    for fname, page, text in docs:
        for i, chunk in enumerate(split_chunks(text)):
            if not chunk.strip():
                continue
            records.append({
                "id": f"{fname}::p{page}::c{i}",
                "text": chunk,
                "metadata": {"source_file": fname, "page": page}
            })

    for i in tqdm(range(0, len(records), 16)):
        batch = records[i:i + 16]
        texts = [b["text"] for b in batch]
        ids = [b["id"] for b in batch]
        metas = [b["metadata"] for b in batch]
        embs = embedder.encode(texts).tolist()
        collection.add(documents=texts, ids=ids, metadatas=metas, embeddings=embs)
    return f"✅ Embedded {len(records)} chunks"

# ---------------------------
# 🔎 Query
# ---------------------------
def search_context(query, top_k=MAX_CONTEXT_CHUNKS):
    # Query with the same embedder used at indexing time
    query_emb = embedder.encode([query]).tolist()
    results = collection.query(query_embeddings=query_emb, n_results=top_k)
    chunks = results["documents"][0]
results["documents"][0] metas = results["metadatas"][0] return "\n\n".join( f"File: {m['source_file']}, Page: {m['page']}\n{c}" for m, c in zip(metas, chunks) ) # --------------------------- # 🧠 Run Inference # --------------------------- def ask_model(model_name, query): if not HF_TOKEN: return "❌ HF_TOKEN not set." context = search_context(query) system_prompt = "Answer only using the context. Say 'I don't know' if not found." prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system_prompt}<|start_header_id|>user<|end_header_id|>{context}\n\nQuestion: {query}<|start_header_id|>assistant<|end_header_id|>" tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN) model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN, device_map="auto") pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1) output = pipe(prompt, max_new_tokens=512, do_sample=True)[0]["generated_text"] return output.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip() # --------------------------- # 🎛 Gradio UI # --------------------------- def launch_interface(): with gr.Blocks() as demo: gr.Markdown(""" # 🧠 SmartManuals-AI (Hugging Face Edition) Upload manuals to `./Manuals`, click Embed, then ask questions. """) with gr.Row(): embed_button = gr.Button("⚙️ Embed Documents") embed_status = gr.Textbox(label="Status") with gr.Row(): model_select = gr.Dropdown(list(LLM_MODELS.keys()), label="Model", value="LLaMA 3.1 8B") question = gr.Textbox(label="Question") answer = gr.Textbox(label="Answer", lines=10) submit = gr.Button("🔍 Ask") embed_button.click(fn=embed_documents, outputs=embed_status) submit.click(fn=lambda m, q: ask_model(LLM_MODELS[m], q), inputs=[model_select, question], outputs=[answer]) demo.launch() # --------------------------- if __name__ == "__main__": launch_interface()