# ✅ Hugging Face-ready `app.py` for SmartManuals-AI
# Supports PDF/DOCX upload, embedding, querying via multiple HF models, and OCR fallback
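#
# Assumed runtime dependencies (not pinned in the original; a typical requirements.txt):
#   gradio, pymupdf, nltk, docx2txt, pytesseract, chromadb,
#   sentence-transformers, transformers, torch, pillow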

import io
import os

import chromadb
import docx2txt
import fitz  # PyMuPDF
import gradio as gr
import nltk
import pytesseract
import torch
from PIL import Image
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# NLTK >= 3.9 ships the sentence tokenizer as "punkt_tab"; fetch both for compatibility
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

# ----------------------------
# Configuration
# ----------------------------
CHROMA_PATH = "./chroma_store"
COLLECTION_NAME = "manual_chunks"
CHUNK_SIZE = 750
CHUNK_OVERLAP = 100
MAX_CONTEXT = 3
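# Note: CHUNK_SIZE and CHUNK_OVERLAP are measured in whitespace-separated words,
# not model tokens.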
HF_MODELS = [
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "meta-llama/Llama-3.1-8B-Instruct",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "google/gemma-1.1-7b-it",
    "Qwen/Qwen3-30B-A3B",
]
HF_TOKEN = os.environ.get("HF_TOKEN")
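# HF_TOKEN must be set (e.g. as a Space secret) to download gated checkpoints
# such as the meta-llama models above.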

# ----------------------------
# Utilities
# ----------------------------
def clean_text(text):
    return "\n".join(line.strip() for line in text.splitlines() if line.strip())


def split_sentences(text):
    return sent_tokenize(text)


def chunk_sentences(sentences):
    # Greedy word-count packing with sentence-aligned overlap between chunks
    chunks, chunk, length = [], [], 0
    for sent in sentences:
        tokens = len(sent.split())
        if chunk and length + tokens > CHUNK_SIZE:
            chunks.append(" ".join(chunk))
            # Carry trailing sentences forward until ~CHUNK_OVERLAP words of overlap
            carried, overlap = 0, []
            for s in reversed(chunk):
                carried += len(s.split())
                overlap.insert(0, s)
                if carried >= CHUNK_OVERLAP:
                    break
            chunk, length = overlap, carried
        chunk.append(sent)
        length += tokens
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks
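
# Example: with CHUNK_SIZE=750 and CHUNK_OVERLAP=100, a ~2,000-word page yields
# roughly three chunks, each sharing about 100 words with its predecessor.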


def extract_text_pdf(path):
    doc = fitz.open(path)
    texts = []
    for page in doc:
        text = page.get_text()
        if not text.strip():
            # No embedded text layer: rasterize the page and fall back to OCR
            pix = page.get_pixmap(dpi=300)
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            text = pytesseract.image_to_string(img)
        texts.append(text)
    return texts
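
# Note: the OCR fallback above requires the tesseract binary on the host
# (on Hugging Face Spaces, add `tesseract-ocr` to a packages.txt file).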


def extract_text_docx(path):
    return [docx2txt.process(path)]


def extract_metadata(filename):
    lower = filename.lower()
    compact = lower.replace(" ", "")
    # Compare space-stripped names so multi-word models like "integrity x" can match
    model = next((m for m in [
        "se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl",
        "everest", "engage", "inspire", "discover", "95t", "95x", "95c", "95r", "97c"
    ] if m.replace(" ", "") in compact), "unknown")
    doc_type = "unknown"
    if "om" in lower or "owner" in lower:
        doc_type = "owner manual"
    elif "sm" in lower or "service" in lower:
        doc_type = "service manual"
    elif "assembly" in lower:
        doc_type = "assembly instructions"
    elif "parts" in lower:
        doc_type = "parts manual"
    elif "bulletin" in lower:
        doc_type = "service bulletin"
    return model, doc_type
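
# Example: "Integrity X SM rev2.pdf" -> model "integrity x", doc_type "service manual".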

# ----------------------------
# Embedding pipeline
# ----------------------------
def embed_docs(files, progress=gr.Progress()):
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        pass  # collection may not exist yet
    collection = client.create_collection(COLLECTION_NAME)
    texts, ids, metadatas = [], [], []
    i = 0
    for file in progress.tqdm(files, desc="Embedding files"):
        # gr.File may return tempfile wrappers or plain path strings depending on version
        path = file.name if hasattr(file, "name") else file
        filename = os.path.basename(path)
        ext = filename.lower().split(".")[-1]
        raw_texts = extract_text_pdf(path) if ext == "pdf" else extract_text_docx(path)
        model, doc_type = extract_metadata(filename)
        for page, text in enumerate(raw_texts):
            sents = split_sentences(clean_text(text))
            for j, chunk in enumerate(chunk_sentences(sents)):
                texts.append(chunk)
                ids.append(f"{filename}::p{page+1}::c{j+1}")
                metadatas.append({"source_file": filename, "page": page + 1,
                                  "model": model, "doc_type": doc_type})
                i += 1
                # Flush to Chroma in small batches to bound memory use
                if len(texts) >= 16:
                    collection.add(documents=texts, metadatas=metadatas, ids=ids,
                                   embeddings=embedder.encode(texts).tolist())
                    texts, metadatas, ids = [], [], []
    if texts:
        collection.add(documents=texts, metadatas=metadatas, ids=ids,
                       embeddings=embedder.encode(texts).tolist())
    return f"✅ Embedded {i} chunks from {len(files)} files."

# ----------------------------
# Querying pipeline
# ----------------------------
def query_rag(q, model_name):
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = client.get_collection(COLLECTION_NAME)
    # Embed the query with the same model used at indexing time
    chunks = collection.query(query_embeddings=embedder.encode([q]).tolist(),
                              n_results=MAX_CONTEXT)
    context = "\n\n".join(chunks["documents"][0])
    # Fold the instructions into the user turn: some chat templates (e.g. Gemma)
    # reject a separate system role
    messages = [{
        "role": "user",
        "content": (
            "You are a helpful assistant. Only answer from the provided manual "
            "context below. If unsure, say 'I don't know'.\n"
            f"<context>\n{context}\n</context>\n\n{q}"
        ),
    }]
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN,
                                                 torch_dtype=torch.float32)
    # Each model ships its own chat template; let the tokenizer build the prompt
    prompt = tokenizer.apply_chat_template(messages, tokenize=False,
                                           add_generation_prompt=True)
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
    result = pipe(prompt, max_new_tokens=300, return_full_text=False)[0]["generated_text"]
    return result.strip()
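
# Note: loading a 7B+ checkpoint in float32 on CPU needs tens of GB of RAM and is
# slow; on constrained hardware consider torch.bfloat16, device_map="auto", or a
# smaller instruction-tuned model.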

# ----------------------------
# Gradio Interface
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("""# 🧠 SmartManuals-AI (HF Edition)
Upload PDF or Word documents, embed them locally, and ask technical questions using LLMs (LLaMA 3, Mistral, etc.).""")
    with gr.Tab("📥 Upload & Embed"):
        uploader = gr.File(file_types=[".pdf", ".docx"], file_count="multiple")
        embed_btn = gr.Button("🚀 Embed Files")
        embed_output = gr.Textbox(label="Embed Log")
    with gr.Tab("❓ Ask a Question"):
        question = gr.Textbox(label="Your Question")
        model_select = gr.Dropdown(choices=HF_MODELS, label="Model", value=HF_MODELS[0])
        ask_btn = gr.Button("💬 Ask")
        response = gr.Textbox(label="Answer", lines=8)
    embed_btn.click(embed_docs, inputs=uploader, outputs=embed_output)
    ask_btn.click(query_rag, inputs=[question, model_select], outputs=response)

demo.launch()