Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,106 +1,76 @@
|
|
1 |
-
# app.py
|
2 |
import os
|
3 |
-
import
|
4 |
-
import faiss
|
5 |
import numpy as np
|
6 |
-
|
7 |
-
import gradio as gr
|
8 |
-
import spaces
|
9 |
-
|
10 |
-
from unstructured.partition.pdf import partition_pdf
|
11 |
from sentence_transformers import SentenceTransformer
|
12 |
-
|
13 |
-
|
14 |
-
# ─── Configuration ─────────────────────────────────────────────
|
15 |
-
PDF_FOLDER = "meal_plans"
|
16 |
-
MODEL_NAME = "facebook/rag-sequence-nq"
|
17 |
-
EMBED_MODEL = "all-MiniLM-L6-v2"
|
18 |
-
TOP_K = 5
|
19 |
-
|
20 |
-
# ─── 1) LOAD + CHUNK ALL PDFs ──────────────────────────────────
|
21 |
-
rag_tokenizer = RagTokenizer.from_pretrained(MODEL_NAME)
|
22 |
-
texts, sources, pages = [], [], []
|
23 |
-
|
24 |
-
for pdf_path in glob.glob(f"{PDF_FOLDER}/*.pdf"):
|
25 |
-
book = os.path.basename(pdf_path)
|
26 |
-
pages_data = partition_pdf(filename=pdf_path)
|
27 |
-
for pg_num, page in enumerate(pages_data, start=1):
|
28 |
-
enc = rag_tokenizer(
|
29 |
-
page.text,
|
30 |
-
max_length=800,
|
31 |
-
truncation=True,
|
32 |
-
return_overflowing_tokens=True,
|
33 |
-
stride=50,
|
34 |
-
return_tensors="pt"
|
35 |
-
)
|
36 |
-
for token_ids in enc["input_ids"]:
|
37 |
-
chunk = rag_tokenizer.decode(token_ids, skip_special_tokens=True)
|
38 |
-
texts.append(chunk)
|
39 |
-
sources.append(book)
|
40 |
-
pages.append(pg_num)
|
41 |
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
dim = embeddings.shape[1]
|
46 |
-
index = faiss.IndexFlatL2(dim)
|
47 |
-
index.add(embeddings)
|
48 |
|
49 |
-
#
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
-
|
54 |
-
def
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
weeks: str
|
62 |
-
):
|
63 |
-
# build prefs string
|
64 |
-
avoid_list = [a.strip() for a in avoid.split(",") if a.strip()]
|
65 |
-
prefs = (
|
66 |
-
f"Goal={goal}; Diet={','.join(diet)}; "
|
67 |
-
f"Meals={meals}/day; Avoid={','.join(avoid_list)}; Duration={weeks}"
|
68 |
-
)
|
69 |
-
# 1) RETRIEVE top-k chunks
|
70 |
-
q_emb = embedder.encode([message], convert_to_numpy=True)
|
71 |
-
D, I = index.search(q_emb, TOP_K)
|
72 |
-
context = "\n".join(f"[{sources[i]} p{pages[i]}] {texts[i]}" for i in I[0])
|
73 |
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
|
|
|
|
82 |
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
avoid = gr.Textbox(placeholder="e.g. Gluten, Dairy, Nutsβ¦", label="Avoidances (comma-separated)")
|
98 |
-
weeks = gr.Dropdown(["1 week","2 weeks","3 weeks","4 weeks"], label="Plan Length", value="1 week")
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
)
|
104 |
|
105 |
-
|
106 |
-
demo.launch()
|
|
|
|
|
1 |
import os
|
2 |
+
import fitz # PyMuPDF
|
|
|
3 |
import numpy as np
|
4 |
+
import faiss
|
|
|
|
|
|
|
|
|
5 |
from sentence_transformers import SentenceTransformer
|
6 |
+
import gradio as gr
|
7 |
+
import spaces # for ZeroGPU
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
+
@spaces.GPU  # ZeroGPU decorator: requests a GPU slice for the duration of the call
def query_app(user_input, include_source, verbose):
    """Answer one user query against the module-level FAISS index.

    Thin wrapper around search_index(); relies on the module-level
    `index` and `documents` globals built at import time.
    """
    return search_index(user_input, index, documents, include_source, verbose)
|
|
|
|
|
|
|
12 |
|
13 |
+
# PDF reader
def extract_text_from_pdf(folder_path="meal_plans"):
    """Extract plain text from every PDF in *folder_path*.

    Returns a list of {"text": <full text>, "source": <filename>} dicts,
    one per readable PDF. Unreadable files are logged and skipped
    (deliberate best-effort: one corrupt PDF must not kill startup).
    """
    documents = []
    for filename in os.listdir(folder_path):
        if not filename.lower().endswith(".pdf"):
            continue
        path = os.path.join(folder_path, filename)
        try:
            # Context manager closes the PyMuPDF handle even if a page
            # fails to render (the original leaked the open document).
            with fitz.open(path) as doc:
                # join() avoids quadratic += concatenation on large PDFs
                text = "".join(page.get_text() for page in doc)
            documents.append({"text": text, "source": filename})
        except Exception as e:
            # Name the failing file so the log is actionable (the
            # original message never said which PDF broke).
            print(f"Error reading {path}: {e}")
    return documents
|
28 |
|
29 |
+
# Index builder
def create_index(docs):
    """Build a FAISS exact-L2 index over the embeddings of docs[i]["text"].

    Raises ValueError when *docs* is empty instead of the cryptic
    IndexError the previous `embeddings[0]` lookup produced when the
    meal_plans folder contained no readable PDFs.
    """
    if not docs:
        raise ValueError("create_index() needs at least one document")
    texts = [doc["text"] for doc in docs]
    # Single conversion to the float32 matrix FAISS requires
    # (asarray avoids the extra copy of np.array(...).astype(...)).
    embeddings = np.asarray(model.encode(texts), dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])  # embedding dimensionality
    index.add(embeddings)
    return index
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
+
# Search logic
def search_index(query, index, docs, include_source=True, verbose=False, top_k=3):
    """Return a markdown answer built from the top_k nearest snippets.

    Embeds *query* with the module-level `model`, searches the FAISS
    *index*, and formats a snippet (300 chars, 750 in verbose mode) per
    hit, optionally labelled with its source filename.
    """
    query_vec = np.asarray(model.encode([query]), dtype="float32")
    D, I = index.search(query_vec, top_k)
    limit = 750 if verbose else 300  # hoisted out of the loop
    responses = []
    for i in I[0]:
        # FAISS pads the result with -1 when the index holds fewer than
        # top_k vectors; the original then did docs[-1] and silently
        # returned the *last* document as a bogus match.
        if i < 0:
            continue
        doc = docs[i]
        snippet = doc["text"][:limit].replace("\n", " ").strip()
        label = f"**π {doc['source']}**\n" if include_source else ""
        responses.append(f"{label}{snippet}...")
    return "\n\n---\n\n".join(responses)
|
49 |
|
50 |
+
# Setup: build the embedding model, corpus, and FAISS index once at import
# time so the Gradio callbacks can reuse them as module-level globals.
# NOTE(review): this runs at import, so a missing/empty meal_plans folder
# fails the whole app before the UI starts — confirm that is intended.
model = SentenceTransformer("all-MiniLM-L6-v2")  # sentence embedding model
documents = extract_text_from_pdf("meal_plans")  # list of {"text", "source"} dicts
index = create_index(documents)
|
54 |
|
55 |
+
# Gradio UI: chat layout on the left, retrieval options on the right.
with gr.Blocks(title="Meal Plan Chat Assistant") as demo:
    gr.Markdown("## π½οΈ Meal Plan Assistant\nChat with your PDF documents in `meal_plans/` folder.")
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot()
            user_input = gr.Textbox(placeholder="Ask something...", show_label=False)
            send_btn = gr.Button("Ask")
        with gr.Column(scale=1):
            include_source = gr.Checkbox(label="Include Source", value=True)
            verbose = gr.Checkbox(label="Verbose Mode", value=False)

    def user_query(msg, history, source, verbose_mode):
        # Append the new (question, answer) pair without mutating the
        # incoming history list.
        answer = query_app(msg, source, verbose_mode)
        history = history + [(msg, answer)]
        # Returned twice because the click handler declares the chatbot
        # as both outputs — NOTE(review): one output would suffice.
        return history, history

    send_btn.click(user_query,
                   inputs=[user_input, chatbot, include_source, verbose],
                   outputs=[chatbot, chatbot])

demo.launch()
|
|