damoojeje committed
Commit 2d58fdd · verified · 1 Parent(s): 7b8bb00

Update app.py

Files changed (1):
  1. app.py +73 -60
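
This commit hardens startup: embed_all() and load_model() now catch their own exceptions and return (None, None) on failure, get_answer() guards against uninitialized components before querying, emoji are stripped from log messages, and the app launches with share=True.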
app.py CHANGED
@@ -41,7 +41,7 @@ def split_sentences(text):
     try:
         return sent_tokenize(text)
     except:
-        print("⚠️ Tokenizer fallback: simple split.")
+        print("Tokenizer fallback: simple split.")
         return text.split(". ")
 
 def split_chunks(sentences, max_tokens=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
@@ -73,70 +73,80 @@ def extract_pdf_text(path):
             text = pytesseract.image_to_string(img)
             chunks.append((path, i + 1, clean(text)))
     except Exception as e:
-        print("❌ PDF read error:", path, e)
+        print("PDF read error:", path, e)
     return chunks
 
 def extract_docx_text(path):
     try:
         return [(path, 1, clean(docx2txt.process(path)))]
     except Exception as e:
-        print("❌ DOCX read error:", path, e)
+        print("DOCX read error:", path, e)
         return []
 
 # ---------------- Embedding ----------------
 def embed_all():
-    embedder = SentenceTransformer("all-MiniLM-L6-v2")
-    embedder.eval()
-    client = chromadb.PersistentClient(path=CHROMA_PATH)
-
     try:
-        client.delete_collection(COLLECTION_NAME)
-    except:
-        pass
-    collection = client.get_or_create_collection(COLLECTION_NAME)
-
-    docs, ids, metas = [], [], []
-    print("📄 Processing manuals...")
-
-    for fname in os.listdir(MANUALS_DIR):
-        fpath = os.path.join(MANUALS_DIR, fname)
-        if fname.lower().endswith(".pdf"):
-            pages = extract_pdf_text(fpath)
-        elif fname.lower().endswith(".docx"):
-            pages = extract_docx_text(fpath)
-        else:
-            continue
-
-        for path, page, text in pages:
-            for i, chunk in enumerate(split_chunks(split_sentences(text))):
-                chunk_id = f"{fname}::{page}::{i}"
-                docs.append(chunk)
-                ids.append(chunk_id)
-                metas.append({"source": fname, "page": page})
-
-                if len(docs) >= 16:
-                    embs = embedder.encode(docs).tolist()
-                    collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
-                    docs, ids, metas = [], [], []
-
-    if docs:
-        embs = embedder.encode(docs).tolist()
-        collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
-
-    print(f"✅ Embedded {len(ids)} chunks.")
-    return collection, embedder
+        embedder = SentenceTransformer("all-MiniLM-L6-v2")
+        embedder.eval()
+        client = chromadb.PersistentClient(path=CHROMA_PATH)
+
+        try:
+            client.delete_collection(COLLECTION_NAME)
+        except:
+            pass
+
+        collection = client.get_or_create_collection(COLLECTION_NAME)
+
+        docs, ids, metas = [], [], []
+        print("Processing manuals...")
+
+        for fname in os.listdir(MANUALS_DIR):
+            fpath = os.path.join(MANUALS_DIR, fname)
+            if fname.lower().endswith(".pdf"):
+                pages = extract_pdf_text(fpath)
+            elif fname.lower().endswith(".docx"):
+                pages = extract_docx_text(fpath)
+            else:
+                continue
+
+            for path, page, text in pages:
+                for i, chunk in enumerate(split_chunks(split_sentences(text))):
+                    chunk_id = f"{fname}::{page}::{i}"
+                    docs.append(chunk)
+                    ids.append(chunk_id)
+                    metas.append({"source": fname, "page": page})
+
+                    if len(docs) >= 16:
+                        embs = embedder.encode(docs).tolist()
+                        collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
+                        docs, ids, metas = [], [], []
+
+        if docs:
+            embs = embedder.encode(docs).tolist()
+            collection.add(documents=docs, ids=ids, metadatas=metas, embeddings=embs)
+
+        print(f"Embedded {len(ids)} chunks.")
+        return collection, embedder
+
+    except Exception as e:
+        print("Embedding startup failed:", e)
+        return None, None
 
 # ---------------- Model Setup ----------------
 def load_model():
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
-        token=HF_TOKEN,
-        device_map="auto" if torch.cuda.is_available() else None,
-        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
-    ).to(device)
-    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
-    return pipe, tokenizer
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            token=HF_TOKEN,
+            device_map="auto" if torch.cuda.is_available() else None,
+            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+        ).to(device)
+        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
+        return pipe, tokenizer
+    except Exception as e:
+        print("Model loading failed:", e)
+        return None, None
 
 def ask_model(question, context, pipe, tokenizer):
     prompt = f"""Use only the following context to answer. If uncertain, say \"I don't know.\"
@@ -152,38 +162,41 @@ A:"""
 
 # ---------------- Query ----------------
 def get_answer(question):
-    if not all([embedder, db, model_pipe, model_tokenizer]):
-        return "⚠️ The system is still initializing or failed to load. Please try again later."
+    if not embedder or not db or not model_pipe:
+        return "System not ready. Try again after initialization."
     try:
         query_emb = embedder.encode(question, convert_to_tensor=True)
         results = db.query(query_texts=[question], n_results=MAX_CONTEXT_CHUNKS)
         context = "\n\n".join(results["documents"][0])
         return ask_model(question, context, model_pipe, model_tokenizer)
     except Exception as e:
-        print("❌ Query error:", e)
+        print("Query error:", e)
         return f"Error: {e}"
 
 # ---------------- UI ----------------
 with gr.Blocks() as demo:
-    gr.Markdown("## 🤖 SmartManuals-AI (Granite 3.2-2B)")
+    gr.Markdown("## SmartManuals-AI (Granite 3.2-2B)")
     with gr.Row():
         question = gr.Textbox(label="Ask your question")
         ask = gr.Button("Ask")
     answer = gr.Textbox(label="Answer", lines=8)
     ask.click(fn=get_answer, inputs=question, outputs=answer)
 
-# ---------------- Startup ----------------
-embedder = db = model_pipe = model_tokenizer = None
+# Startup Initialization
+embedder = None
+model_pipe = None
+model_tokenizer = None
 
 try:
     db, embedder = embed_all()
 except Exception as e:
-    print("❌ Embedding failed:", e)
+    print(" Embedding failed:", e)
 
 try:
     model_pipe, model_tokenizer = load_model()
 except Exception as e:
-    print("❌ Model loading failed:", e)
+    print(" Model load failed:", e)
 
+# Launch
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(share=True)
 
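For reference, a minimal sketch of the initialization-guard pattern the new code adopts: each loader traps its own exceptions and hands back None sentinels, and the query path checks readiness before doing any work. The names below (load_component, fake_embedder, fake_db) are hypothetical stand-ins for illustration, not functions from app.py.

# Sketch of the commit's guard pattern, with hypothetical stand-in loaders.
def load_component(name, loader):
    # Mirror the new embed_all()/load_model(): never let a startup failure propagate.
    try:
        return loader()
    except Exception as e:
        print(f"{name} failed:", e)
        return None

def fake_embedder():
    raise RuntimeError("model download failed")  # simulate a startup error

def fake_db():
    return {"ready": True}

embedder = load_component("Embedding", fake_embedder)
db = load_component("DB", fake_db)

def get_answer(question):
    # Same guard as the new get_answer(): bail out while components are None.
    if not embedder or not db:
        return "System not ready. Try again after initialization."
    return f"answering: {question}"

print(get_answer("How do I reset the console?"))
# -> System not ready. Try again after initialization.

The trade-off is the same in app.py: a failed loader no longer crashes the Space at startup, but every request returns the not-ready message until the underlying error is fixed.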