# ✅ SmartManuals-AI App for Hugging Face Spaces
# Full app.py with spaCy-based sentence segmentation and model dropdown selection

import os
import io
import json
import fitz  # PyMuPDF
import chromadb
import torch
import docx
import gradio as gr
import pytesseract
import numpy as np
import spacy
from tqdm import tqdm
from PIL import Image
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util

# ---------------------------
# ⚙️ Configuration
# ---------------------------
MANUALS_DIR = "./Manuals"
CHROMA_PATH = "./chroma_store"
CHROMA_COLLECTION = "manual_chunks"
CHUNK_SIZE = 750       # max whitespace tokens per chunk
CHUNK_OVERLAP = 100    # tokens carried over between consecutive chunks
EMBED_MODEL = "all-MiniLM-L6-v2"
DEFAULT_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
AVAILABLE_MODELS = [
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "google/gemma-1.1-7b-it",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "Qwen/Qwen1.5-7B-Chat"
]
HF_TOKEN = os.environ.get("HF_TOKEN")

# ---------------------------
# 📚 Load NLP model for sentence splitting
# ---------------------------
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Model not installed in the Space yet -- download it once, then load.
    os.system("python -m spacy download en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")


def split_sentences(text):
    return [sent.text.strip() for sent in nlp(text).sents if sent.text.strip()]


# ---------------------------
# 🧹 Text cleanup
# ---------------------------
def clean(text):
    return "\n".join(line.strip() for line in text.splitlines() if line.strip())


# ---------------------------
# 📄 PDF and DOCX extractors
# ---------------------------
def extract_pdf_text(path):
    """Return a list of (page_number, text); falls back to OCR for image-only pages."""
    doc = fitz.open(path)
    pages = []
    for i, page in enumerate(doc):
        text = page.get_text()
        if not text.strip():
            # No embedded text layer -- render the page and OCR it.
            pix = page.get_pixmap(dpi=300)
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            text = pytesseract.image_to_string(img)
        pages.append((i + 1, text))
    doc.close()
    return pages


def extract_docx_text(path):
    doc = docx.Document(path)
    full_text = "\n".join(para.text for para in doc.paragraphs if para.text.strip())
    return [(1, full_text)]


# ---------------------------
# 📦 Chunk splitter
# ---------------------------
def chunkify(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Group sentences into chunks of at most `max_tokens` whitespace tokens,
    carrying roughly `overlap` tokens of trailing context into the next chunk."""
    chunks, current, length = [], [], 0
    for s in sentences:
        tokens = len(s.split())
        if current and length + tokens > max_tokens:
            chunks.append(" ".join(current))
            # Keep only the trailing sentences that fit inside the overlap budget,
            # so consecutive chunks share a little context without duplicating whole chunks.
            kept, kept_len = [], 0
            for prev in reversed(current):
                prev_len = len(prev.split())
                if kept_len + prev_len > overlap:
                    break
                kept.insert(0, prev)
                kept_len += prev_len
            current, length = kept, kept_len
        current.append(s)
        length += tokens
    if current:
        chunks.append(" ".join(current))
    return chunks


# ---------------------------
# 🔎 Metadata from file
# ---------------------------
def extract_meta(name):
    name = name.lower()
    return {
        "model": next((m for m in ["se3", "se4", "symbio", "explore"] if m in name), "unknown"),
        "doc_type": next((d for d in ["owner", "service", "parts"] if d in name), "unknown"),
        "brand": "life fitness"
    }


# ---------------------------
# 🔠 Embed and store chunks
# ---------------------------
def embed_all():
    embedder = SentenceTransformer(EMBED_MODEL)
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    try:
        client.delete_collection(CHROMA_COLLECTION)
    except Exception:
        pass  # collection did not exist yet
    db = client.create_collection(CHROMA_COLLECTION)
    for fname in tqdm(os.listdir(MANUALS_DIR), desc="Indexing manuals"):
        path = os.path.join(MANUALS_DIR, fname)
        if fname.endswith(".pdf"):
            pages = extract_pdf_text(path)
        elif fname.endswith(".docx"):
            pages = extract_docx_text(path)
        else:
            continue
        meta = extract_meta(fname)
        for page, text in pages:
            sents = split_sentences(clean(text))
            chunks = chunkify(sents)
            for i, chunk in enumerate(chunks):
                db.add(
                    ids=[f"{fname}::p{page}::c{i}"],
                    documents=[chunk],
                    embeddings=embedder.encode([chunk]).tolist(),
                    metadatas=[{**meta, "source": fname, "page": page}]
                )
    return db, embedder
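
# ---------------------------
# 🧪 Optional: preview retrieval (sketch, not part of the original app)
# ---------------------------
# A minimal helper for sanity-checking the vector store once embed_all() has run.
# The function name and the sample query in the comment are hypothetical; it only
# uses the Chroma collection and the same SentenceTransformer used for indexing.
def preview_retrieval(db, embedder, query, k=3):
    hits = db.query(query_embeddings=embedder.encode([query]).tolist(), n_results=k)
    for doc_id, doc, meta in zip(hits["ids"][0], hits["documents"][0], hits["metadatas"][0]):
        print(f"{doc_id} ({meta.get('doc_type')}, p.{meta.get('page')}): {doc[:120]}...")

# Example (after the app has indexed the manuals):
#   preview_retrieval(db, embedder, "How do I reset the SE3 console?")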
# ---------------------------
# 🤖 Load selected LLM model
# ---------------------------
_PIPELINES = {}  # cache so each model is loaded at most once per process


def load_model(repo):
    if repo in _PIPELINES:
        return _PIPELINES[repo]
    tokenizer = AutoTokenizer.from_pretrained(repo, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(
        repo,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
        token=HF_TOKEN
    )
    # When the model is loaded with device_map="auto", do not also pass an explicit
    # `device` to the pipeline; placement is handled by accelerate.
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    _PIPELINES[repo] = pipe
    return pipe


# ---------------------------
# 📥 Retrieval-Augmented QA
# ---------------------------
def answer_query(q, model_choice):
    # Embed the question with the same model used for indexing, then retrieve top chunks.
    results = db.query(query_embeddings=embedder.encode([q]).tolist(), n_results=3)
    context = "\n\n".join(results["documents"][0])
    prompt = f"""
You are a helpful assistant. Answer based on the context. If unsure, say "I don't know".

Context:
{context}

Question: {q}
Answer:
"""
    pipe = load_model(model_choice)
    out = pipe(prompt, max_new_tokens=300, do_sample=False)[0]["generated_text"]
    return out.split("Answer:")[-1].strip()


# ---------------------------
# 🚀 Initialize app
# ---------------------------
print("Embedding documents...")
db, embedder = embed_all()
print("Done embedding.")

# ---------------------------
# 🎛️ Gradio UI
# ---------------------------
demo = gr.Blocks()
with demo:
    gr.Markdown("""# 🧠 SmartManuals-AI
Ask any question and let the model answer from your uploaded manuals.
""")
    with gr.Row():
        qbox = gr.Textbox(label="Ask a Question", placeholder="e.g. How do I reset the SE3 console?")
        model_select = gr.Dropdown(choices=AVAILABLE_MODELS, label="Choose LLM", value=DEFAULT_MODEL)
    ansbox = gr.Textbox(label="Answer", lines=10)
    btn = gr.Button("🔍 Submit")
    btn.click(fn=answer_query, inputs=[qbox, model_select], outputs=ansbox)

demo.launch()
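
# ---------------------------
# 📋 Assumed Space setup (sketch, not part of the original file)
# ---------------------------
# The imports above imply roughly the following dependencies; versions are not
# pinned here and should be adjusted to whatever the Space actually uses:
#   requirements.txt: gradio, transformers, accelerate, sentence-transformers,
#                     chromadb, pymupdf, python-docx, pytesseract, pillow,
#                     spacy, torch, tqdm, numpy
#   packages.txt:     tesseract-ocr   (system binary required by pytesseract)
# HF_TOKEN is read from the environment, so it should be configured as a Space
# secret in order to access gated models such as the Llama checkpoints.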