Spaces:

damoojeje
/

SmartManuals-AI

Running

App Files Files Community

damoojeje committed 23 days ago

Commit

df15a5f

verified ·

1 Parent(s): c36ee8b

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -144

app.py CHANGED Viewed

@@ -1,192 +1,155 @@
-# ✅ SmartManuals-AI App for Hugging Face Spaces
-# Full app.py with spaCy-based sentence segmentation and model dropdown selection
-import io
 import os
-import json
 import fitz  # PyMuPDF
-import chromadb
-import torch
 import docx
 import gradio as gr
 import pytesseract
-import numpy as np
-import spacy
-from tqdm import tqdm
 from PIL import Image
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from sentence_transformers import SentenceTransformer, util
-# ---------------------------
-# ⚙️ Configuration
-# ---------------------------
# Where the source manuals are read from and where the Chroma index is kept.
MANUALS_DIR = "./Manuals"
CHROMA_PATH = "./chroma_store"
CHROMA_COLLECTION = "manual_chunks"

# Chunking limits (used as the default arguments of the chunk splitter).
CHUNK_SIZE, CHUNK_OVERLAP = 750, 100

# Name of the sentence-transformers embedding model.
EMBED_MODEL = "all-MiniLM-L6-v2"

# Models offered in the UI dropdown; the default is the first entry.
DEFAULT_MODEL = "meta-llama/Llama-3-8B-Instruct"
AVAILABLE_MODELS = [
    DEFAULT_MODEL,
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "google/gemma-1.1-7b-it",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "Qwen/Qwen1.5-7B-Chat",
]

# Hugging Face access token from the environment; None when it is not set.
HF_TOKEN = os.getenv("HF_TOKEN")
-# ---------------------------
-# 📚 Load NLP model for sentence splitting
-# ---------------------------
# Load the small English spaCy pipeline, installing it on first run.
import subprocess
import sys

import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Download it with the interpreter that is running this app, and let a
    # failed download raise instead of silently ignoring the exit status.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")
def split_sentences(text):
    """Split *text* into sentences with the module-level spaCy pipeline.

    Each sentence is stripped of surrounding whitespace; sentences that are
    empty after stripping are dropped.
    """
    stripped = (sent.text.strip() for sent in nlp(text).sents)
    return [sentence for sentence in stripped if sentence]
-# ---------------------------
-# 🧹 Text cleanup
-# ---------------------------
def clean(text):
    """Strip every line of *text* and drop the lines that end up empty.

    The surviving lines are re-joined with a single newline.
    """
    kept = []
    for raw_line in text.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept.append(trimmed)
    return "\n".join(kept)
-# ---------------------------
-# 📄 PDF and DOCX extractors
-# ---------------------------
 def extract_pdf_text(path):
     doc = fitz.open(path)
-    pages = []
     for i, page in enumerate(doc):
         text = page.get_text()
         if not text.strip():
-            pix = page.get_pixmap(dpi=300)
-            img = Image.open(io.BytesIO(pix.tobytes("png")))
             text = pytesseract.image_to_string(img)
-        pages.append((i + 1, text))
-    return pages
 def extract_docx_text(path):
     doc = docx.Document(path)
-    full_text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
-    return [(1, full_text)]
-# ---------------------------
-# 📦 Chunk splitter
-# ---------------------------
def chunkify(sentences, max_tokens=None, overlap=None):
    """Group sentences into overlapping chunks bounded by a word budget.

    Size is measured in whitespace-separated words. When adding a sentence
    would push the current chunk over *max_tokens*, the chunk is emitted and
    the next one is seeded with the trailing sentences of the previous chunk
    whose combined word count does not exceed *overlap*.

    Args:
        sentences: Iterable of sentence strings.
        max_tokens: Word budget per chunk; defaults to ``CHUNK_SIZE``.
        overlap: Maximum number of words carried over between consecutive
            chunks; defaults to ``CHUNK_OVERLAP``.

    Returns:
        A list of chunk strings (sentences joined by single spaces). A single
        sentence longer than *max_tokens* becomes its own oversized chunk.
    """
    # Resolve the defaults at call time so the module constants are only
    # needed when the caller does not pass explicit limits.
    if max_tokens is None:
        max_tokens = CHUNK_SIZE
    if overlap is None:
        overlap = CHUNK_OVERLAP

    chunks = []
    current = []
    length = 0
    for s in sentences:
        tokens = len(s.split())
        # `current` must be non-empty, otherwise an oversized first sentence
        # would emit an empty chunk.
        if current and length + tokens > max_tokens:
            chunks.append(" ".join(current))
            # Carry over whole trailing sentences up to `overlap` WORDS.
            # Slicing by sentence count here would keep the entire chunk and
            # the window would never shrink.
            carried = []
            carried_len = 0
            for previous in reversed(current):
                previous_len = len(previous.split())
                if carried_len + previous_len > overlap:
                    break
                carried.append(previous)
                carried_len += previous_len
            carried.reverse()
            # Drop the overlap when it would immediately overflow the budget.
            if carried_len + tokens > max_tokens:
                carried, carried_len = [], 0
            current, length = carried, carried_len
        current.append(s)
        length += tokens
    if current:
        chunks.append(" ".join(current))
    return chunks
-# ---------------------------
-# 🔎 Metadata from file
-# ---------------------------
def extract_meta(name):
    """Derive manual metadata from a file name.

    The lower-cased name is searched for known model and document-type
    keywords. Keywords are tried in their listed order and the first one
    found wins; ``"unknown"`` is used when none is present. The brand is
    always ``"life fitness"``.
    """
    lowered = name.lower()

    def first_match(keywords):
        for keyword in keywords:
            if keyword in lowered:
                return keyword
        return "unknown"

    return {
        "model": first_match(("se3", "se4", "symbio", "explore")),
        "doc_type": first_match(("owner", "service", "parts")),
        "brand": "life fitness",
    }
-# ---------------------------
-# 🔠 Embed and store chunks
-# ---------------------------
 def embed_all():
-    embedder = SentenceTransformer(EMBED_MODEL)
-    client = chromadb.PersistentClient(path=CHROMA_PATH)
-    try:
-        client.delete_collection(CHROMA_COLLECTION)
-    except:
-        pass
-    db = client.create_collection(CHROMA_COLLECTION)
-    for fname in os.listdir(MANUALS_DIR):
-        path = os.path.join(MANUALS_DIR, fname)
-        if fname.endswith(".pdf"):
-            pages = extract_pdf_text(path)
-        elif fname.endswith(".docx"):
-            pages = extract_docx_text(path)
         else:
             continue
-        meta = extract_meta(fname)
-        for page, text in pages:
-            sents = split_sentences(clean(text))
-            chunks = chunkify(sents)
-            for i, chunk in enumerate(chunks):
-                db.add(
-                    ids=[f"{fname}::p{page}::c{i}"],
-                    documents=[chunk],
-                    metadatas=[{**meta, "source": fname, "page": page}]
-                )
-    return db, embedder
-# ---------------------------
-# 🤖 Load selected LLM model
-# ---------------------------
import functools


@functools.lru_cache(maxsize=1)
def load_model(repo):
    """Return a text-generation pipeline for the Hugging Face model *repo*.

    With CUDA available the model is loaded in float16 and placed with
    ``device_map="auto"``; otherwise it is loaded in float32 on the CPU.

    The most recently requested pipeline is cached, so repeated questions
    against the same model do not reload the weights. Only one entry is kept
    so that switching models does not hold two models in memory.
    """
    use_cuda = torch.cuda.is_available()
    tokenizer = AutoTokenizer.from_pretrained(repo, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(
        repo,
        torch_dtype=torch.float16 if use_cuda else torch.float32,
        device_map="auto" if use_cuda else None,
        token=HF_TOKEN,
    )
    # A model loaded with device_map="auto" is already placed on its devices;
    # also passing `device=` asks pipeline() to move it again, which
    # transformers rejects for such models. Only pin the device on CPU.
    placement = {} if use_cuda else {"device": -1}
    return pipeline(
        "text-generation", model=model, tokenizer=tokenizer, **placement
    )
-# ---------------------------
-# 📥 Retrieval-Augmented QA
-# ---------------------------
def answer_query(q, model_choice):
    """Answer question *q* from the indexed manuals using *model_choice*.

    The three closest chunks are fetched from the module-level collection
    ``db``, joined into a context block and placed in a prompt. The text
    after the last ``"Answer:"`` in the generated output is returned,
    stripped of surrounding whitespace.
    """
    hits = db.query(query_texts=[q], n_results=3)
    context = "\n\n".join(hits["documents"][0])
    prompt = f"""
You are a helpful assistant. Answer based on the context. If unsure, say "I don't know".
Context:
{context}
Question: {q}
Answer:
"""
    generator = load_model(model_choice)
    generations = generator(prompt, max_new_tokens=300, do_sample=False)
    generated = generations[0]["generated_text"]
    # Keep only what follows the final "Answer:" marker (the whole text if
    # the marker is absent).
    _, _, tail = generated.rpartition("Answer:")
    return tail.strip()
-# ---------------------------
-# 🚀 Initialize app
-# ---------------------------
# Build the vector index once at import time, before the UI is created.
print("Embedding documents...")
db, embedder = embed_all()
print("Done embedding.")

# ---------------------------
# 🎛️ Gradio UI
# ---------------------------
with gr.Blocks() as demo:
    gr.Markdown("""# 🧠 SmartManuals-AI
Ask any question and let the model answer from your uploaded manuals.
""")
    with gr.Row():
        qbox = gr.Textbox(
            placeholder="e.g. How to reset the SE3 console?",
            label="Ask a Question",
        )
        model_select = gr.Dropdown(
            label="Choose LLM",
            value=DEFAULT_MODEL,
            choices=AVAILABLE_MODELS,
        )
    ansbox = gr.Textbox(lines=10, label="Answer")
    btn = gr.Button("🔍 Submit")
    # Question and model choice go in, the answer box receives the result.
    btn.click(answer_query, inputs=[qbox, model_select], outputs=ansbox)

demo.launch()

 import os
 import fitz  # PyMuPDF
 import docx
+import json
 import gradio as gr
 import pytesseract
 from PIL import Image
+from tqdm import tqdm
+import chromadb
+import torch
+import nltk
 from sentence_transformers import SentenceTransformer, util
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+# ----------------------------
+# ✅ Ensure nltk punkt is available
+# ----------------------------
# Make sure the Punkt sentence-tokenizer data is installed before importing
# sent_tokenize. Newer NLTK releases load the "punkt_tab" resource instead of
# the older "punkt" one, so both are checked. With only "punkt" present,
# sent_tokenize would raise LookupError on such releases and sentence
# splitting would silently degrade to the naive fallback.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource)
from nltk.tokenize import sent_tokenize
+# ----------------------------
+# ⚙️ Config
+# ----------------------------
# Where the source manuals are read from and where the Chroma index is kept.
MANUAL_DIR = "./Manuals"
CHROMA_DIR = "./chroma_store"

# Chunking limits, and how many chunks are retrieved per question.
CHUNK_SIZE, CHUNK_OVERLAP = 750, 100
MAX_CONTEXT = 3

# Models offered in the UI dropdown; the default is the first entry.
DEFAULT_MODEL = "meta-llama/Llama-3-8b-Instruct"
MODEL_OPTIONS = [
    DEFAULT_MODEL,
    "mistralai/Mistral-7B-Instruct-v0.3",
    "google/gemma-1.1-7b-it",
]

# Hugging Face access token from the environment; None when it is not set.
HF_TOKEN = os.getenv("HF_TOKEN")
+# ----------------------------
+# 🔍 Utility functions
+# ----------------------------
 def extract_pdf_text(path):
+    text_blocks = []
     doc = fitz.open(path)
     for i, page in enumerate(doc):
         text = page.get_text()
         if not text.strip():
+            img = Image.open(io.BytesIO(page.get_pixmap().tobytes("png")))
             text = pytesseract.image_to_string(img)
+        text_blocks.append({"page": i + 1, "text": text})
+    return text_blocks
 def extract_docx_text(path):
     doc = docx.Document(path)
+    full_text = "\n".join([para.text for para in doc.paragraphs])
+    return [{"page": 1, "text": full_text}]
def split_sentences(text):
    """Split *text* into sentences.

    NLTK's ``sent_tokenize`` is tried first; if it raises for any reason the
    text is split on the literal ``". "`` separator instead.
    """
    try:
        sentences = sent_tokenize(text)
    except Exception:
        # Best-effort fallback when the tokenizer (or its data) is unusable.
        sentences = text.split(". ")
    return sentences
def chunk_text(sentences, chunk_size=None, overlap=None):
    """Group sentences into overlapping chunks bounded by a word budget.

    Size is measured in whitespace-separated words. When adding a sentence
    would push the current chunk over *chunk_size*, the chunk is emitted and
    the next one is seeded with the trailing sentences of the previous chunk
    whose combined word count does not exceed *overlap*.

    Args:
        sentences: Iterable of sentence strings.
        chunk_size: Word budget per chunk; defaults to ``CHUNK_SIZE``.
        overlap: Maximum number of words carried over between consecutive
            chunks; defaults to ``CHUNK_OVERLAP``.

    Returns:
        A list of chunk strings (sentences joined by single spaces). A single
        sentence longer than *chunk_size* becomes its own oversized chunk.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if overlap is None:
        overlap = CHUNK_OVERLAP

    chunks = []
    current = []
    count = 0
    for sentence in sentences:
        words = len(sentence.split())
        # `current` must be non-empty, otherwise an oversized first sentence
        # would emit an empty chunk.
        if current and count + words > chunk_size:
            chunks.append(" ".join(current))
            # Carry over whole trailing sentences up to `overlap` WORDS.
            # Slicing by sentence count here would keep the entire chunk and
            # the window would never shrink.
            carried = []
            carried_words = 0
            for previous in reversed(current):
                previous_words = len(previous.split())
                if carried_words + previous_words > overlap:
                    break
                carried.append(previous)
                carried_words += previous_words
            carried.reverse()
            # Drop the overlap when it would immediately overflow the budget.
            if carried_words + words > chunk_size:
                carried, carried_words = [], 0
            current, count = carried, carried_words
        current.append(sentence)
        count += words
    if current:
        chunks.append(" ".join(current))
    return chunks
 def embed_all():
+    client = chromadb.PersistentClient(path=CHROMA_DIR)
+    if "manual_chunks" in [c.name for c in client.list_collections()]:
+        client.delete_collection("manual_chunks")
+    collection = client.create_collection("manual_chunks")
+    embedder = SentenceTransformer("all-MiniLM-L6-v2")
+    for fname in os.listdir(MANUAL_DIR):
+        fpath = os.path.join(MANUAL_DIR, fname)
+        if fname.lower().endswith(".pdf"):
+            pages = extract_pdf_text(fpath)
+        elif fname.lower().endswith(".docx"):
+            pages = extract_docx_text(fpath)
         else:
             continue
+        for page in pages:
+            sents = split_sentences(page["text"])
+            chunks = chunk_text(sents)
+            for idx, chunk in enumerate(chunks):
+                cid = f"{fname}::p{page['page']}::c{idx}"
+                collection.add(documents=[chunk], ids=[cid], metadatas=[{"source": fname, "page": page["page"]}])
+    return collection, embedder
import functools


@functools.lru_cache(maxsize=1)
def get_model(model_id):
    """Return a CPU text-generation pipeline for the model *model_id*.

    The tokenizer and float32 weights are loaded with the configured
    Hugging Face token. The most recently requested pipeline is cached, so
    repeated questions against the same model do not reload the weights.
    Only one entry is kept so that switching models does not hold two models
    in memory at once.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(
        model_id, token=HF_TOKEN, torch_dtype=torch.float32
    )
    return pipeline(
        "text-generation", model=model, tokenizer=tokenizer, device=-1
    )
def run_query(question, model_name):
    """Answer *question* from the indexed manuals using *model_name*.

    Up to ``MAX_CONTEXT`` chunks are fetched from the module-level collection
    ``db`` and placed in a prompt. The text after the last ``"Answer:"`` in
    the generated output is returned, stripped of surrounding whitespace.

    Returns:
        The answer text, or a short notice when the question is blank or no
        chunk matched.
    """
    # Skip retrieval and the costly model load for a blank question.
    if not question or not question.strip():
        return "Please enter a question."

    results = db.query(query_texts=[question], n_results=MAX_CONTEXT)
    # `documents` holds one list per query text, so an empty result is
    # `[[]]`, which is truthy. The inner list is what must be checked.
    documents = (results or {}).get("documents") or [[]]
    matches = documents[0]
    if not matches:
        return "No matching information found."

    context = "\n\n".join(matches)
    prompt = f"""
You are a helpful assistant. Use the following context to answer the question.
Context:
{context}
Question: {question}
Answer:
"""
    model = get_model(model_name)
    res = model(prompt, max_new_tokens=300)[0]['generated_text']
    return res.split("Answer:")[-1].strip()
+# ----------------------------
+# ✅ Startup: Embed manuals
+# ----------------------------
 db, embedder = embed_all()
+# ----------------------------
 # 🎛️ Gradio UI
+# ----------------------------
+with gr.Blocks() as demo:
+    gr.Markdown("""
+    # 📘 SmartManuals-AI (Docker)
+    Ask any question from the preloaded manuals (PDF + Word).
+    """)
     with gr.Row():
+        question = gr.Textbox(label="Ask a Question")
+        model = gr.Dropdown(choices=MODEL_OPTIONS, value=DEFAULT_MODEL, label="Choose LLM")
+    btn = gr.Button("Ask")
+    answer = gr.Textbox(label="Answer", lines=10)
+    btn.click(fn=run_query, inputs=[question, model], outputs=answer)
+demo.launch(server_name="0.0.0.0", server_port=7860)