# ✅ Hugging Face-ready `app.py` for SmartManuals-AI
# Supports PDF/DOCX upload, embedding, querying via multiple HF models, and OCR fallback
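#
# Assumed runtime dependencies (not pinned in the original; a typical requirements.txt):
#   gradio, pymupdf, nltk, docx2txt, pytesseract, chromadb,
#   sentence-transformers, transformers, torch, pillow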

import io
import os

import chromadb
import docx2txt
import fitz  # PyMuPDF
import gradio as gr
import nltk
import pytesseract
import torch
from PIL import Image
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# NLTK >= 3.9 ships the sentence tokenizer as "punkt_tab"; fetch both for compatibility
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

# ----------------------------
# Configuration
# ----------------------------
CHROMA_PATH = "./chroma_store"
COLLECTION_NAME = "manual_chunks"
CHUNK_SIZE = 750
CHUNK_OVERLAP = 100
MAX_CONTEXT = 3
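# Note: CHUNK_SIZE and CHUNK_OVERLAP are measured in whitespace-separated words,
# not model tokens.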
HF_MODELS = [
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "meta-llama/Llama-3.1-8B-Instruct",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "google/gemma-1.1-7b-it",
    "Qwen/Qwen3-30B-A3B",
]
HF_TOKEN = os.environ.get("HF_TOKEN")
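# HF_TOKEN must be set (e.g. as a Space secret) to download gated checkpoints
# such as the meta-llama models above.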

# ----------------------------
# Utilities
# ----------------------------
def clean_text(text):
    return "\n".join(line.strip() for line in text.splitlines() if line.strip())


def split_sentences(text):
    return sent_tokenize(text)


def chunk_sentences(sentences):
    # Greedy word-count packing with sentence-aligned overlap between chunks
    chunks, chunk, length = [], [], 0
    for sent in sentences:
        tokens = len(sent.split())
        if chunk and length + tokens > CHUNK_SIZE:
            chunks.append(" ".join(chunk))
            # Carry trailing sentences forward until ~CHUNK_OVERLAP words of overlap
            carried, overlap = 0, []
            for s in reversed(chunk):
                carried += len(s.split())
                overlap.insert(0, s)
                if carried >= CHUNK_OVERLAP:
                    break
            chunk, length = overlap, carried
        chunk.append(sent)
        length += tokens
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks
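
# Example: with CHUNK_SIZE=750 and CHUNK_OVERLAP=100, a ~2,000-word page yields
# roughly three chunks, each sharing about 100 words with its predecessor.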


def extract_text_pdf(path):
    doc = fitz.open(path)
    texts = []
    for page in doc:
        text = page.get_text()
        if not text.strip():
            # No embedded text layer: rasterize the page and fall back to OCR
            pix = page.get_pixmap(dpi=300)
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            text = pytesseract.image_to_string(img)
        texts.append(text)
    return texts
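
# Note: the OCR fallback above requires the tesseract binary on the host
# (on Hugging Face Spaces, add `tesseract-ocr` to a packages.txt file).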


def extract_text_docx(path):
    return [docx2txt.process(path)]


def extract_metadata(filename):
    lower = filename.lower()
    compact = lower.replace(" ", "")
    # Compare space-stripped names so multi-word models like "integrity x" can match
    model = next((m for m in [
        "se3hd", "se3", "se4", "symbio", "explore", "integrity x", "integrity sl",
        "everest", "engage", "inspire", "discover", "95t", "95x", "95c", "95r", "97c"
    ] if m.replace(" ", "") in compact), "unknown")
    doc_type = "unknown"
    if "om" in lower or "owner" in lower:
        doc_type = "owner manual"
    elif "sm" in lower or "service" in lower:
        doc_type = "service manual"
    elif "assembly" in lower:
        doc_type = "assembly instructions"
    elif "parts" in lower:
        doc_type = "parts manual"
    elif "bulletin" in lower:
        doc_type = "service bulletin"
    return model, doc_type
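
# Example: "Integrity X SM rev2.pdf" -> model "integrity x", doc_type "service manual".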

# ----------------------------
# Embedding pipeline
# ----------------------------
def embed_docs(files, progress=gr.Progress()):
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    try:
        client.delete_collection(COLLECTION_NAME)
    except Exception:
        pass  # collection may not exist yet
    collection = client.create_collection(COLLECTION_NAME)
    texts, ids, metadatas = [], [], []
    i = 0
    for file in progress.tqdm(files, desc="Embedding files"):
        # gr.File may return tempfile wrappers or plain path strings depending on version
        path = file.name if hasattr(file, "name") else file
        filename = os.path.basename(path)
        ext = filename.lower().split(".")[-1]
        raw_texts = extract_text_pdf(path) if ext == "pdf" else extract_text_docx(path)
        model, doc_type = extract_metadata(filename)
        for page, text in enumerate(raw_texts):
            sents = split_sentences(clean_text(text))
            for j, chunk in enumerate(chunk_sentences(sents)):
                texts.append(chunk)
                ids.append(f"{filename}::p{page+1}::c{j+1}")
                metadatas.append({"source_file": filename, "page": page + 1,
                                  "model": model, "doc_type": doc_type})
                i += 1
                # Flush to Chroma in small batches to bound memory use
                if len(texts) >= 16:
                    collection.add(documents=texts, metadatas=metadatas, ids=ids,
                                   embeddings=embedder.encode(texts).tolist())
                    texts, metadatas, ids = [], [], []
    if texts:
        collection.add(documents=texts, metadatas=metadatas, ids=ids,
                       embeddings=embedder.encode(texts).tolist())
    return f"✅ Embedded {i} chunks from {len(files)} files."

# ----------------------------
# Querying pipeline
# ----------------------------
def query_rag(q, model_name):
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = client.get_collection(COLLECTION_NAME)
    # Embed the query with the same model used at indexing time
    chunks = collection.query(query_embeddings=embedder.encode([q]).tolist(),
                              n_results=MAX_CONTEXT)
    context = "\n\n".join(chunks["documents"][0])
    # Fold the instructions into the user turn: some chat templates (e.g. Gemma)
    # reject a separate system role
    messages = [{
        "role": "user",
        "content": (
            "You are a helpful assistant. Only answer from the provided manual "
            "context below. If unsure, say 'I don't know'.\n"
            f"<context>\n{context}\n</context>\n\n{q}"
        ),
    }]
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN,
                                                 torch_dtype=torch.float32)
    # Each model ships its own chat template; let the tokenizer build the prompt
    prompt = tokenizer.apply_chat_template(messages, tokenize=False,
                                           add_generation_prompt=True)
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
    result = pipe(prompt, max_new_tokens=300, return_full_text=False)[0]["generated_text"]
    return result.strip()
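
# Note: loading a 7B+ checkpoint in float32 on CPU needs tens of GB of RAM and is
# slow; on constrained hardware consider torch.bfloat16, device_map="auto", or a
# smaller instruction-tuned model.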

# ----------------------------
# Gradio Interface
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("""# 🧠 SmartManuals-AI (HF Edition)
Upload PDF or Word documents, embed them locally, and ask technical questions using LLMs (LLaMA 3, Mistral, etc.).""")
    with gr.Tab("📥 Upload & Embed"):
        uploader = gr.File(file_types=[".pdf", ".docx"], file_count="multiple")
        embed_btn = gr.Button("🚀 Embed Files")
        embed_output = gr.Textbox(label="Embed Log")
    with gr.Tab("❓ Ask a Question"):
        question = gr.Textbox(label="Your Question")
        model_select = gr.Dropdown(choices=HF_MODELS, label="Model", value=HF_MODELS[0])
        ask_btn = gr.Button("💬 Ask")
        response = gr.Textbox(label="Answer", lines=8)
    embed_btn.click(embed_docs, inputs=uploader, outputs=embed_output)
    ask_btn.click(query_rag, inputs=[question, model_select], outputs=response)

demo.launch()