damoojeje committed
Commit df15a5f · verified · 1 Parent(s): c36ee8b

Update app.py

Files changed (1)
  1. app.py +107 -144
app.py CHANGED
@@ -1,192 +1,155 @@
- # ✅ SmartManuals-AI App for Hugging Face Spaces
- # Full app.py with spaCy-based sentence segmentation and model dropdown selection
- import io
  import os
- import json
  import fitz  # PyMuPDF
- import chromadb
- import torch
  import docx
  import gradio as gr
  import pytesseract
- import numpy as np
- import spacy
- from tqdm import tqdm
  from PIL import Image
- from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
  from sentence_transformers import SentenceTransformer, util

- # ---------------------------
- # ⚙️ Configuration
- # ---------------------------
- MANUALS_DIR = "./Manuals"
- CHROMA_PATH = "./chroma_store"
- CHROMA_COLLECTION = "manual_chunks"
  CHUNK_SIZE = 750
  CHUNK_OVERLAP = 100
- EMBED_MODEL = "all-MiniLM-L6-v2"
- DEFAULT_MODEL = "meta-llama/Llama-3-8B-Instruct"
- AVAILABLE_MODELS = [
-     "meta-llama/Llama-3-8B-Instruct",
-     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-     "google/gemma-1.1-7b-it",
      "mistralai/Mistral-7B-Instruct-v0.3",
-     "Qwen/Qwen1.5-7B-Chat"
  ]
- HF_TOKEN = os.environ.get("HF_TOKEN")
-
- # ---------------------------
- # 📚 Load NLP model for sentence splitting
- # ---------------------------
- try:
-     import spacy
-     nlp = spacy.load("en_core_web_sm")
- except:
-     os.system("python -m spacy download en_core_web_sm")
-     nlp = spacy.load("en_core_web_sm")

- def split_sentences(text):
-     return [sent.text.strip() for sent in nlp(text).sents if sent.text.strip()]
-
- # ---------------------------
- # 🧹 Text cleanup
- # ---------------------------
- def clean(text):
-     return "\n".join([line.strip() for line in text.splitlines() if line.strip()])

- # ---------------------------
- # 📄 PDF and DOCX extractors
- # ---------------------------
  def extract_pdf_text(path):
      doc = fitz.open(path)
-     pages = []
      for i, page in enumerate(doc):
          text = page.get_text()
          if not text.strip():
-             pix = page.get_pixmap(dpi=300)
-             img = Image.open(io.BytesIO(pix.tobytes("png")))
              text = pytesseract.image_to_string(img)
-         pages.append((i + 1, text))
-     return pages

  def extract_docx_text(path):
      doc = docx.Document(path)
-     full_text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
-     return [(1, full_text)]

- # ---------------------------
- # 📦 Chunk splitter
- # ---------------------------
- def chunkify(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
      chunks = []
      current = []
-     length = 0
-     for s in sentences:
-         tokens = len(s.split())
-         if length + tokens > max_tokens:
              chunks.append(" ".join(current))
-             current = current[-overlap:]
-             length = sum(len(w.split()) for w in current)
-         current.append(s)
-         length += tokens
      if current:
          chunks.append(" ".join(current))
      return chunks

- # ---------------------------
- # 🔎 Metadata from file
- # ---------------------------
- def extract_meta(name):
-     name = name.lower()
-     return {
-         "model": next((m for m in ["se3", "se4", "symbio", "explore"] if m in name), "unknown"),
-         "doc_type": next((d for d in ["owner", "service", "parts"] if d in name), "unknown"),
-         "brand": "life fitness"
-     }
-
- # ---------------------------
- # 🔠 Embed and store chunks
- # ---------------------------
  def embed_all():
-     embedder = SentenceTransformer(EMBED_MODEL)
-     client = chromadb.PersistentClient(path=CHROMA_PATH)
-     try:
-         client.delete_collection(CHROMA_COLLECTION)
-     except:
-         pass
-     db = client.create_collection(CHROMA_COLLECTION)
-
-     for fname in os.listdir(MANUALS_DIR):
-         path = os.path.join(MANUALS_DIR, fname)
-         if fname.endswith(".pdf"):
-             pages = extract_pdf_text(path)
-         elif fname.endswith(".docx"):
-             pages = extract_docx_text(path)
          else:
              continue
-         meta = extract_meta(fname)
-         for page, text in pages:
-             sents = split_sentences(clean(text))
-             chunks = chunkify(sents)
-             for i, chunk in enumerate(chunks):
-                 db.add(
-                     ids=[f"{fname}::p{page}::c{i}"],
-                     documents=[chunk],
-                     metadatas=[{**meta, "source": fname, "page": page}]
-                 )
-     return db, embedder
-
- # ---------------------------
- # 🤖 Load selected LLM model
- # ---------------------------
- def load_model(repo):
-     tokenizer = AutoTokenizer.from_pretrained(repo, token=HF_TOKEN)
-     model = AutoModelForCausalLM.from_pretrained(
-         repo, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-         device_map="auto" if torch.cuda.is_available() else None, token=HF_TOKEN
-     )
-     return pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
-
- # ---------------------------
- # 📥 Retrieval-Augmented QA
- # ---------------------------
- def answer_query(q, model_choice):
-     results = db.query(query_texts=[q], n_results=3)
      context = "\n\n".join(results["documents"][0])
      prompt = f"""
- You are a helpful assistant. Answer based on the context. If unsure, say "I don't know".

  Context:
  {context}

- Question: {q}
  Answer:
  """
-     pipe = load_model(model_choice)
-     out = pipe(prompt, max_new_tokens=300, do_sample=False)[0]["generated_text"]
-     return out.split("Answer:")[-1].strip()
-
- # ---------------------------
- # 🚀 Initialize app
- # ---------------------------
- print("Embedding documents...")
  db, embedder = embed_all()
- print("Done embedding.")

- # ---------------------------
  # 🎛️ Gradio UI
- # ---------------------------
- demo = gr.Blocks()

- with demo:
-     gr.Markdown("""# 🧠 SmartManuals-AI
-     Ask any question and let the model answer from your uploaded manuals.
-     """)
      with gr.Row():
-         qbox = gr.Textbox(label="Ask a Question", placeholder="e.g. How to reset the SE3 console?")
-         model_select = gr.Dropdown(choices=AVAILABLE_MODELS, label="Choose LLM", value=DEFAULT_MODEL)
-     ansbox = gr.Textbox(label="Answer", lines=10)
-     btn = gr.Button("🔍 Submit")
-     btn.click(fn=answer_query, inputs=[qbox, model_select], outputs=ansbox)

- demo.launch()

  import os
  import fitz  # PyMuPDF
  import docx
+ import io  # needed for io.BytesIO in the OCR fallback below
+ import json
  import gradio as gr
  import pytesseract
  from PIL import Image
+ from tqdm import tqdm
+ import chromadb
+ import torch
+ import nltk
  from sentence_transformers import SentenceTransformer, util
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

+ # ----------------------------
+ # ✅ Ensure nltk punkt is available
+ # ----------------------------
+ try:
+     nltk.data.find("tokenizers/punkt")
+ except LookupError:
+     nltk.download("punkt")
+
+ from nltk.tokenize import sent_tokenize
+
+ # ----------------------------
+ # ⚙️ Config
+ # ----------------------------
+ MANUAL_DIR = "./Manuals"
+ CHROMA_DIR = "./chroma_store"
  CHUNK_SIZE = 750
  CHUNK_OVERLAP = 100
+ MAX_CONTEXT = 3
+
+ DEFAULT_MODEL = "meta-llama/Llama-3-8b-Instruct"
+ MODEL_OPTIONS = [
+     "meta-llama/Llama-3-8b-Instruct",
      "mistralai/Mistral-7B-Instruct-v0.3",
+     "google/gemma-1.1-7b-it"
  ]

+ HF_TOKEN = os.environ.get("HF_TOKEN")

+ # ----------------------------
+ # 🔍 Utility functions
+ # ----------------------------
  def extract_pdf_text(path):
+     text_blocks = []
      doc = fitz.open(path)
      for i, page in enumerate(doc):
          text = page.get_text()
          if not text.strip():
+             img = Image.open(io.BytesIO(page.get_pixmap().tobytes("png")))
              text = pytesseract.image_to_string(img)
+         text_blocks.append({"page": i + 1, "text": text})
+     return text_blocks

  def extract_docx_text(path):
      doc = docx.Document(path)
+     full_text = "\n".join([para.text for para in doc.paragraphs])
+     return [{"page": 1, "text": full_text}]

+ def split_sentences(text):
+     try:
+         return sent_tokenize(text)
+     except Exception:
+         return text.split(". ")
+
+ def chunk_text(sentences):
      chunks = []
      current = []
+     count = 0
+     for sentence in sentences:
+         tokens = sentence.split()
+         if count + len(tokens) > CHUNK_SIZE:
              chunks.append(" ".join(current))
+             current = current[-CHUNK_OVERLAP:]
+             count = sum(len(s.split()) for s in current)
+         current.append(sentence)
+         count += len(tokens)
      if current:
          chunks.append(" ".join(current))
      return chunks

  def embed_all():
+     client = chromadb.PersistentClient(path=CHROMA_DIR)
+     if "manual_chunks" in [c.name for c in client.list_collections()]:
+         client.delete_collection("manual_chunks")
+     collection = client.create_collection("manual_chunks")
+     embedder = SentenceTransformer("all-MiniLM-L6-v2")
+
+     for fname in os.listdir(MANUAL_DIR):
+         fpath = os.path.join(MANUAL_DIR, fname)
+         if fname.lower().endswith(".pdf"):
+             pages = extract_pdf_text(fpath)
+         elif fname.lower().endswith(".docx"):
+             pages = extract_docx_text(fpath)
          else:
              continue
+
+         for page in pages:
+             sents = split_sentences(page["text"])
+             chunks = chunk_text(sents)
+             for idx, chunk in enumerate(chunks):
+                 cid = f"{fname}::p{page['page']}::c{idx}"
+                 collection.add(documents=[chunk], ids=[cid], metadatas=[{"source": fname, "page": page["page"]}])
+
+     return collection, embedder
+
+ def get_model(model_id):
+     tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
+     model = AutoModelForCausalLM.from_pretrained(model_id, token=HF_TOKEN, torch_dtype=torch.float32)
+     return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
+
+ def run_query(question, model_name):
+     results = db.query(query_texts=[question], n_results=MAX_CONTEXT)
+     if not results or not results.get("documents"):
+         return "No matching information found."
+
      context = "\n\n".join(results["documents"][0])
      prompt = f"""
+ You are a helpful assistant. Use the following context to answer the question.

  Context:
  {context}

+ Question: {question}
  Answer:
  """
+     model = get_model(model_name)
+     res = model(prompt, max_new_tokens=300)[0]['generated_text']
+     return res.split("Answer:")[-1].strip()
+
+ # ----------------------------
+ # ✅ Startup: Embed manuals
+ # ----------------------------
  db, embedder = embed_all()

+ # ----------------------------
  # 🎛️ Gradio UI
+ # ----------------------------
+ with gr.Blocks() as demo:
+     gr.Markdown("""
+     # 📘 SmartManuals-AI (Docker)
+     Ask any question from the preloaded manuals (PDF + Word).
+     """)

      with gr.Row():
+         question = gr.Textbox(label="Ask a Question")
+         model = gr.Dropdown(choices=MODEL_OPTIONS, value=DEFAULT_MODEL, label="Choose LLM")
+     btn = gr.Button("Ask")
+     answer = gr.Textbox(label="Answer", lines=10)
+
+     btn.click(fn=run_query, inputs=[question, model], outputs=answer)

+ demo.launch(server_name="0.0.0.0", server_port=7860)
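
For reference, a minimal standalone sketch of the chunking behaviour used by chunk_text() above (the old chunkify() is equivalent): CHUNK_SIZE caps the number of whitespace-separated tokens per chunk, while CHUNK_OVERLAP carries that many trailing sentences, not tokens, into the next chunk. The tiny limits and sample sentences below are illustrative only and are not part of the commit.

# Illustrative sketch: mirrors the chunk_text() logic above with tiny limits.
CHUNK_SIZE = 8      # max whitespace tokens per chunk (the app uses 750)
CHUNK_OVERLAP = 1   # trailing sentences carried into the next chunk (the app uses 100)

def chunk_text(sentences):
    chunks, current, count = [], [], 0
    for sentence in sentences:
        tokens = sentence.split()
        if count + len(tokens) > CHUNK_SIZE:
            chunks.append(" ".join(current))
            current = current[-CHUNK_OVERLAP:]            # overlap is counted in sentences
            count = sum(len(s.split()) for s in current)
        current.append(sentence)
        count += len(tokens)
    if current:
        chunks.append(" ".join(current))
    return chunks

sample = [
    "Press the reset button.",
    "Hold it for five seconds.",
    "The console restarts.",
    "Run the self test.",
]
for chunk in chunk_text(sample):
    print(chunk)
# Press the reset button.
# Press the reset button. Hold it for five seconds.
# Hold it for five seconds. The console restarts.
# The console restarts. Run the self test.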