# ✅ Hugging Face-ready `app.py` for SmartManuals-AI
# Supports PDF/DOCX upload, embedding, querying via multiple HF models, and OCR fallback.

import os
import io

import fitz  # PyMuPDF
import nltk
import docx2txt
import pytesseract
import chromadb
import gradio as gr
import torch
from PIL import Image
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize

nltk.download("punkt")
nltk.download("punkt_tab")  # required by sent_tokenize on newer NLTK releases

# ----------------------------
# Configuration
# ----------------------------
CHROMA_PATH = "./chroma_store"     # on-disk ChromaDB store
COLLECTION_NAME = "manual_chunks"
CHUNK_SIZE = 750                   # maximum words per chunk
CHUNK_OVERLAP = 100                # sentences carried over into the next chunk
MAX_CONTEXT = 3                    # retrieved chunks per query

HF_MODELS = [
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "meta-llama/Llama-3.1-8B-Instruct",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "google/gemma-1.1-7b-it",
    "Qwen/Qwen3-30B-A3B",
]

HF_TOKEN = os.environ.get("HF_TOKEN")  # required for gated models (Llama, Gemma)


# ----------------------------
# Utilities
# ----------------------------
def clean_text(text):
    """Strip leading/trailing whitespace and drop empty lines."""
    return "\n".join([line.strip() for line in text.splitlines() if line.strip()])


def split_sentences(text):
    return sent_tokenize(text)


def chunk_sentences(sentences):
    """Group sentences into chunks of at most CHUNK_SIZE words,
    carrying the last CHUNK_OVERLAP sentences into the next chunk."""
    chunks, chunk, length = [], [], 0
    for sent in sentences:
        tokens = len(sent.split())
        if length + tokens > CHUNK_SIZE:
            chunks.append(" ".join(chunk))
            chunk = chunk[-CHUNK_OVERLAP:]
            length = sum(len(s.split()) for s in chunk)
        chunk.append(sent)
        length += tokens
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks


def extract_text_pdf(file):
    """Return a list of page texts, falling back to OCR for pages with no text layer."""
    doc = fitz.open(stream=file.read(), filetype="pdf")
    texts = []
    for page in doc:
        text = page.get_text()
        if not text.strip():
            # No embedded text on this page: render it and OCR the image.
            pix = page.get_pixmap(dpi=300)
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            text = pytesseract.image_to_string(img)
        texts.append(text)
    return texts


def extract_text_docx(file):
    return [docx2txt.process(file)]


def extract_metadata(filename):
    """Guess the equipment model and document type from the filename."""
    lower = filename.lower()
    compact = lower.replace(" ", "")
    model = next(
        (m for m in [
            "se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl",
            "everest", "engage", "inspire", "discover", "95t", "95x", "95c", "95r", "97c"
        ] if m.replace(" ", "") in compact),
        "unknown",
    )
    doc_type = "unknown"
    if "om" in lower or "owner" in lower:
        doc_type = "owner manual"
    elif "sm" in lower or "service" in lower:
        doc_type = "service manual"
    elif "assembly" in lower:
        doc_type = "assembly instructions"
    elif "parts" in lower:
        doc_type = "parts manual"
    elif "bulletin" in lower:
        doc_type = "service bulletin"
    return model, doc_type


# ----------------------------
# Embedding pipeline
# ----------------------------
def embed_docs(files, progress=gr.Progress()):
    """Chunk and embed the uploaded files into a fresh Chroma collection."""
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        pass  # collection did not exist yet
    collection = client.create_collection(COLLECTION_NAME)

    texts, ids, metadatas = [], [], []
    i = 0
    for file in progress.tqdm(files, desc="Embedding files"):
        filename = os.path.basename(file.name)
        ext = filename.lower().split(".")[-1]
        raw_texts = extract_text_pdf(file) if ext == "pdf" else extract_text_docx(file)
        model, doc_type = extract_metadata(filename)
        for page, text in enumerate(raw_texts):
            sents = split_sentences(clean_text(text))
            for j, chunk in enumerate(chunk_sentences(sents)):
                texts.append(chunk)
                ids.append(f"{filename}::p{page+1}::c{j+1}")
                metadatas.append({
                    "source_file": filename,
                    "page": page + 1,
                    "model": model,
                    "doc_type": doc_type,
                })
                i += 1
                # Flush to Chroma in small batches to keep memory use low.
                if len(texts) >= 16:
                    collection.add(
                        documents=texts,
                        metadatas=metadatas,
                        ids=ids,
                        embeddings=embedder.encode(texts).tolist(),
                    )
                    texts, metadatas, ids = [], [], []
    if texts:
        collection.add(
            documents=texts,
            metadatas=metadatas,
            ids=ids,
            embeddings=embedder.encode(texts).tolist(),
        )
    return f"✅ Embedded {i} chunks from {len(files)} files."


# ----------------------------
# Querying pipeline
# ----------------------------
def query_rag(q, model_name):
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = client.get_collection(COLLECTION_NAME)
    # Embed the question with the same model used at indexing time so the
    # similarity search runs in the same vector space.
    results = collection.query(query_embeddings=embedder.encode([q]).tolist(), n_results=MAX_CONTEXT)
    context = "\n\n".join(results["documents"][0])

    # Llama-3-style chat template; other entries in HF_MODELS may expect a different format.
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant. Only answer from the provided manual context below. If unsure, say 'I don't know'.
{context}
<|start_header_id|>user<|end_header_id|>
{q}<|start_header_id|>assistant<|end_header_id|>"""

    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN, torch_dtype=torch.float32)
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)  # CPU inference
    result = pipe(prompt, max_new_tokens=300)[0]["generated_text"]
    # Keep only the assistant's portion of the generated text.
    return result.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()


# ----------------------------
# Gradio Interface
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("""# 🧠 SmartManuals-AI (HF Edition)
Upload PDF or Word documents, embed them locally, and ask technical questions using LLMs (LLaMA 3, Mistral, etc.).""")
    with gr.Tab("📥 Upload & Embed"):
        uploader = gr.File(file_types=[".pdf", ".docx"], file_count="multiple")
        embed_btn = gr.Button("🚀 Embed Files")
        embed_output = gr.Textbox(label="Embed Log")
    with gr.Tab("❓ Ask a Question"):
        question = gr.Textbox(label="Your Question")
        model_select = gr.Dropdown(choices=HF_MODELS, label="Model", value=HF_MODELS[0])
        ask_btn = gr.Button("💬 Ask")
        response = gr.Textbox(label="Answer", lines=8)

    embed_btn.click(embed_docs, inputs=uploader, outputs=embed_output)
    ask_btn.click(query_rag, inputs=[question, model_select], outputs=response)

demo.launch()
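
# ----------------------------
# Reference sketch (commented out): inspecting retrieval without the LLM step.
# This is a minimal debugging aid, not part of the app; it assumes documents have
# already been embedded into ./chroma_store via the "Upload & Embed" tab, and the
# sample question is hypothetical. Run it in a separate session, since
# demo.launch() above blocks this script.
# ----------------------------
# debug_client = chromadb.PersistentClient(path=CHROMA_PATH)
# debug_collection = debug_client.get_collection(COLLECTION_NAME)
# debug_embedder = SentenceTransformer("all-MiniLM-L6-v2")
# hits = debug_collection.query(
#     query_embeddings=debug_embedder.encode(["How do I level the treadmill?"]).tolist(),
#     n_results=MAX_CONTEXT,
# )
# for doc, meta in zip(hits["documents"][0], hits["metadatas"][0]):
#     print(meta["source_file"], f"p{meta['page']}", doc[:80])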