# ✅ Hugging Face-ready `app.py` for SmartManuals-AI
# Supports PDF/DOCX upload, embedding, querying via multiple HF models, and OCR fallback
import os
import fitz # PyMuPDF
import nltk
import json
import io
import docx2txt
import pytesseract
import chromadb
import gradio as gr
import torch
from tqdm import tqdm
from PIL import Image
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import sent_tokenize
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)  # sentence-tokenizer data; punkt_tab is required by newer NLTK releases
# ----------------------------
# Configuration
# ----------------------------
CHROMA_PATH = "./chroma_store"
COLLECTION_NAME = "manual_chunks"
CHUNK_SIZE = 750
CHUNK_OVERLAP = 100
MAX_CONTEXT = 3
HF_MODELS = [
"meta-llama/Llama-3-8B-Instruct",
"meta-llama/Llama-3.1-8B-Instruct",
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"mistralai/Mistral-7B-Instruct-v0.3",
"google/gemma-1.1-7b-it",
"Qwen/Qwen3-30B-A3B",
]
HF_TOKEN = os.environ.get("HF_TOKEN")
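# Note: the meta-llama and google/gemma checkpoints above are gated on the Hugging Face Hub,
# so HF_TOKEN must point to a token with access (e.g. set as a Space secret) before they can load.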
# ----------------------------
# Utilities
# ----------------------------
def clean_text(text):
return "\n".join([line.strip() for line in text.splitlines() if line.strip()])
def split_sentences(text):
return sent_tokenize(text)
def chunk_sentences(sentences):
    """Group sentences into ~CHUNK_SIZE-word chunks that overlap by ~CHUNK_OVERLAP words."""
    chunks, chunk, length = [], [], 0
    for sent in sentences:
        tokens = len(sent.split())
        if chunk and length + tokens > CHUNK_SIZE:
            chunks.append(" ".join(chunk))
            # carry over trailing sentences until ~CHUNK_OVERLAP words are kept
            # (the overlap is measured in words, not sentences, so it stays bounded)
            kept, overlap = [], 0
            for prev in reversed(chunk):
                kept.insert(0, prev)
                overlap += len(prev.split())
                if overlap >= CHUNK_OVERLAP:
                    break
            chunk, length = kept, overlap
        chunk.append(sent)
        length += tokens
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks
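# Rough illustration (hypothetical numbers): with CHUNK_SIZE = 750 and CHUNK_OVERLAP = 100,
# a ~2,000-word page yields roughly three chunks, each sharing about 100 words with its neighbour:
#   sentences = split_sentences(clean_text(page_text))
#   chunks = chunk_sentences(sentences)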
def extract_text_pdf(file):
    """Extract text page by page; fall back to OCR for pages without an embedded text layer."""
    path = file.name if hasattr(file, "name") else file
    doc = fitz.open(path)
    texts = []
    for page in doc:
        text = page.get_text()
        if not text.strip():
            pix = page.get_pixmap(dpi=300)
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            text = pytesseract.image_to_string(img)
        texts.append(text)
    return texts
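# Note: the OCR fallback above requires the Tesseract binary on the host (for a Hugging Face Space,
# typically a `tesseract-ocr` entry in packages.txt); pytesseract is only the Python wrapper around it.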
def extract_text_docx(file):
    """Extract the full text of a .docx upload as a single-element list (one 'page')."""
    path = file.name if hasattr(file, "name") else file
    return [docx2txt.process(path)]
def extract_metadata(filename):
lower = filename.lower()
    # compare space-stripped strings on both sides so multi-word names like "integrity x" can match
    compact = lower.replace(" ", "")
    model = next((m for m in [
        "se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl",
        "everest", "engage", "inspire", "discover", "95t", "95x", "95c", "95r", "97c"
    ] if m.replace(" ", "") in compact), "unknown")
doc_type = "unknown"
if "om" in lower or "owner" in lower:
doc_type = "owner manual"
elif "sm" in lower or "service" in lower:
doc_type = "service manual"
elif "assembly" in lower:
doc_type = "assembly instructions"
elif "parts" in lower:
doc_type = "parts manual"
elif "bulletin" in lower:
doc_type = "service bulletin"
return model, doc_type
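# Example (hypothetical filenames): "SE3HD_OM_en.pdf" -> ("se3hd", "owner manual"),
# "Symbio Service Manual.pdf" -> ("symbio", "service manual").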
# ----------------------------
# Embedding pipeline
# ----------------------------
def embed_docs(files, progress=gr.Progress()):
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path=CHROMA_PATH)
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        pass  # collection did not exist yet
collection = client.create_collection(COLLECTION_NAME)
texts, ids, metadatas = [], [], []
i = 0
for file in progress.tqdm(files, desc="Embedding files"):
filename = os.path.basename(file.name)
ext = filename.lower().split(".")[-1]
raw_texts = extract_text_pdf(file) if ext == "pdf" else extract_text_docx(file)
model, doc_type = extract_metadata(filename)
for page, text in enumerate(raw_texts):
sents = split_sentences(clean_text(text))
for j, chunk in enumerate(chunk_sentences(sents)):
texts.append(chunk)
ids.append(f"{filename}::p{page+1}::c{j+1}")
metadatas.append({"source_file": filename, "page": page+1, "model": model, "doc_type": doc_type})
i += 1
if len(texts) >= 16:
collection.add(documents=texts, metadatas=metadatas, ids=ids,
embeddings=embedder.encode(texts).tolist())
texts, metadatas, ids = [], [], []
if texts:
collection.add(documents=texts, metadatas=metadatas, ids=ids,
embeddings=embedder.encode(texts).tolist())
return f"βœ… Embedded {i} chunks from {len(files)} files."
# ----------------------------
# Querying pipeline
# ----------------------------
def query_rag(q, model_name):
    # Chroma's default embedding function embeds `query_texts`, so no separate encoder is needed here
    client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_collection(COLLECTION_NAME)
chunks = collection.query(query_texts=[q], n_results=MAX_CONTEXT)
context = "\n\n".join(chunks['documents'][0])
prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant. Only answer from the provided manual context below.
If unsure, say 'I don't know'.
<context>
{context}
</context>
<|start_header_id|>user<|end_header_id|>
{q}<|start_header_id|>assistant<|end_header_id|>"""
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN, torch_dtype=torch.float32)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
result = pipe(prompt, max_new_tokens=300)[0]["generated_text"]
return result.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
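# Optional sketch (not used by the UI): the prompt above hard-codes Llama-3 header tokens, which the
# Mistral/Gemma/Qwen entries in HF_MODELS do not understand. A model-agnostic variant can build the
# prompt from each tokenizer's own chat template instead. Function name and wiring are assumptions.
def query_rag_chat_template(q, model_name):
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = client.get_collection(COLLECTION_NAME)
    chunks = collection.query(query_texts=[q], n_results=MAX_CONTEXT)
    context = "\n\n".join(chunks["documents"][0])
    # a single user turn keeps the template valid for models without a system role (e.g. Gemma)
    messages = [{
        "role": "user",
        "content": (
            "Answer only from the manual context below. If unsure, say 'I don't know'.\n\n"
            f"<context>\n{context}\n</context>\n\nQuestion: {q}"
        ),
    }]
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN, torch_dtype=torch.float32)
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return pipe(prompt, max_new_tokens=300, return_full_text=False)[0]["generated_text"].strip()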
# ----------------------------
# Gradio Interface
# ----------------------------
with gr.Blocks() as demo:
gr.Markdown("""# 🧠 SmartManuals-AI (HF Edition)
Upload PDF or Word documents, embed them locally, and ask technical questions using LLMs (LLaMA 3, Mistral, etc.).""")
with gr.Tab("πŸ“₯ Upload & Embed"):
uploader = gr.File(file_types=[".pdf", ".docx"], file_count="multiple")
embed_btn = gr.Button("πŸš€ Embed Files")
embed_output = gr.Textbox(label="Embed Log")
with gr.Tab("❓ Ask a Question"):
question = gr.Textbox(label="Your Question")
model_select = gr.Dropdown(choices=HF_MODELS, label="Model", value=HF_MODELS[0])
ask_btn = gr.Button("πŸ’¬ Ask")
response = gr.Textbox(label="Answer", lines=8)
embed_btn.click(embed_docs, inputs=uploader, outputs=embed_output)
ask_btn.click(query_rag, inputs=[question, model_select], outputs=response)
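# On a Hugging Face Space, long-running embedding or generation calls usually need the request queue
# enabled, e.g. `demo.queue().launch()` instead of the plain launch below.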
demo.launch()