Spaces:

zamal
/

Multimodal-Chat-Playground

Running on Zero

App Files Community

zamal committed on May 30

Commit

6d3678b

verified ·

1 Parent(s): a5216ce

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -204

app.py CHANGED Viewed

@@ -54,8 +54,9 @@ vision_model = LlavaNextForConditionalGeneration.from_pretrained(
 # Add at the top of your module, alongside your other globals
-CURRENT_VDB = None
 @spaces.GPU()
 def get_image_description(image: Image.Image) -> str:
@@ -97,61 +98,60 @@ SHARED_EMB_FN = embedding_functions.SentenceTransformerEmbeddingFunction(
 def get_vectordb(text: str, images: list[Image.Image], img_names: list[str]):
     """
-    Build an in-memory ChromaDB instance with two collections:
       • text_db  (chunks of the PDF text)
       • image_db (image descriptions + raw image bytes)
-    Returns the Chroma client for later querying.
     """
-    # ——— 1) Init & wipe old ————————————————
-    client = chromadb.EphemeralClient()
     for col in ("text_db", "image_db"):
         if col in [c.name for c in client.list_collections()]:
             client.delete_collection(col)
-    # ——— 2) Create fresh collections —————————
     text_col = client.get_or_create_collection(
         name="text_db",
-        embedding_function=SHARED_EMB_FN,
-        data_loader=ImageLoader(),   # loader only matters for images, benign here
     )
     img_col = client.get_or_create_collection(
         name="image_db",
         embedding_function=SHARED_EMB_FN,
-        metadata={"hnsw:space": "cosine"},
-        data_loader=ImageLoader(),
     )
-    # ——— 3) Add images if any ———————————————
     if images:
-        descs = []
-        metas = []
         for idx, img in enumerate(images):
-            # build one-line caption (or fallback)
             try:
-                caption = get_image_description(img)
-            except Exception:
-                caption = "⚠️ could not describe image"
-            descs.append(f"{img_names[idx]}: {caption}")
             metas.append({"image": image_to_bytes(img)})
-        img_col.add(
-            ids=[str(i) for i in range(len(images))],
-            documents=descs,
-            metadatas=metas,
-        )
-    # ——— 4) Chunk & add text ———————————————
     splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     docs = splitter.create_documents([text])
-    text_col.add(
-        ids=[str(i) for i in range(len(docs))],
-        documents=[d.page_content for d in docs],
-    )
     return client
 # Text extraction
 def result_to_text(result, as_text=False):
     pages = []
@@ -169,18 +169,12 @@ OCR_CHOICES = {
 def extract_data_from_pdfs(
     docs: list[str],
     session: dict,
-    include_images: str,    # "Include Images" or "Exclude Images"
-    do_ocr: str,            # "Get Text With OCR" or "Get Available Text Only"
-    ocr_choice: str,        # key into OCR_CHOICES
-    vlm_choice: str,        # HF repo ID for LlavaNext
     progress=gr.Progress()
 ):
-    """
-    1) (Optional) OCR setup
-    2) Vision+Lang model setup & monkey-patch get_image_description
-    3) Extract text & images
-    4) Build and stash vector DB in CURRENT_VDB
-    """
     if not docs:
         raise gr.Error("No documents to process")
@@ -193,60 +187,57 @@ def extract_data_from_pdfs(
     # 2) Vision–language model
     proc = LlavaNextProcessor.from_pretrained(vlm_choice)
-    vis = (
-        LlavaNextForConditionalGeneration
-        .from_pretrained(vlm_choice, torch_dtype=torch.float16, low_cpu_mem_usage=True)
-        .to("cuda")
-    )
-    # Monkey-patch our pipeline for image captions
-    def describe(img: Image.Image) -> str:
-        torch.cuda.empty_cache()
-        gc.collect()
         prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
-        inputs = proc(prompt, img, return_tensors="pt").to("cuda")
-        output = vis.generate(**inputs, max_new_tokens=100)
-        return proc.decode(output[0], skip_special_tokens=True)
-    global get_image_description, CURRENT_VDB
     get_image_description = describe
-    # 3) Extract text + images
     progress(0.2, "Extracting text and images…")
     all_text = ""
     images, names = [], []
     for path in docs:
         if local_ocr:
             pdf = DocumentFile.from_pdf(path)
             res = local_ocr(pdf)
             all_text += result_to_text(res, as_text=True) + "\n\n"
         else:
-            txt = PdfReader(path).pages[0].extract_text() or ""
-            all_text += txt + "\n\n"
         if include_images == "Include Images":
             imgs = extract_images([path])
             images.extend(imgs)
             names.extend([os.path.basename(path)] * len(imgs))
-    # 4) Build + store the vector DB
     progress(0.6, "Indexing in vector DB…")
-    CURRENT_VDB = get_vectordb(all_text, images, names)
     session["processed"] = True
     sample_imgs = images[:4] if include_images == "Include Images" else []
-    # ─── return *exactly four* picklable outputs ───
     return (
-        session,            # gr.State: so UI knows we're ready
-        all_text[:2000] + "...",  # preview text
-        sample_imgs,        # preview images
-        "<h3>Done!</h3>"    # Done message
     )
 # Chat function
 def conversation(
     session: dict,
@@ -258,46 +249,44 @@ def conversation(
     max_tok: int,
     model_id: str
 ):
-    """
-    Uses the global CURRENT_VDB (set by extract_data_from_pdfs) to answer.
-    """
-    global CURRENT_VDB
-    if not session.get("processed") or CURRENT_VDB is None:
         raise gr.Error("Please extract data first")
-    llm = HuggingFaceEndpoint(
-        repo_id=model_id,
-        temperature=temp,
-        max_new_tokens=max_tok,
-        huggingfacehub_api_token=HF_TOKEN
-    )
-    # 1) Text retrieval
-    text_col = CURRENT_VDB.get_collection("text_db")
-    docs = text_col.query(
-        query_texts=[question],
-        n_results=int(num_ctx),
-        include=["documents"]
-    )["documents"][0]
-    # 2) Image retrieval
-    img_col = CURRENT_VDB.get_collection("image_db")
-    img_q = img_col.query(
-        query_texts=[question],
-        n_results=int(img_ctx),
-        include=["metadatas", "documents"]
-    )
     img_descs = img_q["documents"][0] or ["No images found"]
     images = []
     for meta in img_q["metadatas"][0]:
-        b64 = meta.get("image", "")
         try:
             images.append(Image.open(io.BytesIO(base64.b64decode(b64))))
         except:
             pass
     img_desc = "\n".join(img_descs)
-    # 3) Build prompt & call LLM
     prompt = PromptTemplate(
         template="""
 Context:
@@ -310,34 +299,27 @@ Question:
 {q}
 Answer:
-""",
-        input_variables=["text", "img_desc", "q"],
-    )
-    user_input = prompt.format(
-        text="\n\n".join(docs),
-        img_desc=img_desc,
-        q=question
     )
     try:
-        answer = llm.invoke(user_input)
     except HfHubHTTPError as e:
-        if e.response.status_code == 404:
-            answer = f"❌ Model `{model_id}` not hosted on HF Inference API."
-        else:
-            answer = f"⚠️ HF API error: {e}"
     except Exception as e:
         answer = f"⚠️ Unexpected error: {e}"
     new_history = history + [
-        {"role": "user",      "content": question},
-        {"role": "assistant", "content": answer}
     ]
     return new_history, docs, images
 # ─────────────────────────────────────────────────────────────────────────────
 # Gradio UI
 CSS = """
@@ -359,128 +341,54 @@ MODEL_OPTIONS = [
 ]
 with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
-    # State to track that extraction completed (and carry any metadata)
     session_state = gr.State({})
-    # ─── Welcome Screen ─────────────────────────────────────────────
     with gr.Column(visible=True) as welcome_col:
-        gr.Markdown(
-            f"<div style='text-align: center'>\n{WELCOME_INTRO}\n</div>",
-            elem_id="welcome_md"
-        )
         start_btn = gr.Button("🚀 Start")
-    # ─── Main App (hidden until Start is clicked) ───────────────────
     with gr.Column(visible=False) as app_col:
         gr.Markdown("## 📚 Multimodal Chat-PDF Playground")
-        # We need to capture the extract‐event so we can chain the “show chat tab” later
         extract_event = None
         with gr.Tabs() as tabs:
-            # ── Tab 1: Upload & Extract ───────────────────────────────
             with gr.TabItem("1. Upload & Extract"):
-                docs = gr.File(
-                    file_count="multiple",
-                    file_types=[".pdf"],
-                    label="Upload PDFs"
-                )
-                include_dd = gr.Radio(
-                    ["Include Images", "Exclude Images"],
-                    value="Exclude Images",
-                    label="Images"
-                )
-                ocr_radio = gr.Radio(
-                    ["Get Text With OCR", "Get Available Text Only"],
-                    value="Get Available Text Only",
-                    label="OCR"
-                )
-                ocr_dd = gr.Dropdown(
-                    choices=list(OCR_CHOICES.keys()),
-                    value=list(OCR_CHOICES.keys())[0],
-                    label="OCR Model"
-                )
-                vlm_dd = gr.Dropdown(
-                    choices=[
-                        "llava-hf/llava-v1.6-mistral-7b-hf",
-                        "llava-hf/llava-v1.5-mistral-7b"
-                    ],
-                    value="llava-hf/llava-v1.6-mistral-7b-hf",
-                    label="Vision-Language Model"
-                )
-                extract_btn = gr.Button("Extract")
-                preview_text = gr.Textbox(
-                    lines=10,
-                    label="Sample Text",
-                    interactive=False
-                )
-                preview_img = gr.Gallery(
-                    label="Sample Images",
-                    rows=2,
-                    value=[]
-                )
                 preview_html = gr.HTML()
-                # Kick off extraction and capture the event
                 extract_event = extract_btn.click(
                     fn=extract_data_from_pdfs,
-                    inputs=[
-                        docs,
-                        session_state,
-                        include_dd,
-                        ocr_radio,
-                        ocr_dd,
-                        vlm_dd
-                    ],
-                    outputs=[
-                        session_state,   # sets session["processed"]=True
-                        preview_text,    # shows first bits of text
-                        preview_img,     # shows first images
-                        preview_html     # shows “<h3>Done!</h3>”
-                    ]
                 )
-            # ── Tab 2: Chat (initially hidden) ──────────────────────────
             with gr.TabItem("2. Chat", visible=False) as chat_tab:
                 with gr.Row():
                     with gr.Column(scale=3):
                         chat = gr.Chatbot(type="messages", label="Chat")
-                        msg  = gr.Textbox(
-                            placeholder="Ask about your PDF...",
-                            label="Your question"
-                        )
                         send = gr.Button("Send")
                     with gr.Column(scale=1):
-                        model_dd = gr.Dropdown(
-                            MODEL_OPTIONS,
-                            value=MODEL_OPTIONS[0],
-                            label="Choose Chat Model"
-                        )
-                        num_ctx = gr.Slider(1, 20, value=3, label="Text Contexts")
-                        img_ctx = gr.Slider(1, 10, value=2, label="Image Contexts")
-                        temp    = gr.Slider(0.1, 1.0, step=0.1, value=0.4, label="Temperature")
-                        max_tok = gr.Slider(10, 1000, step=10, value=200, label="Max Tokens")
                 send.click(
                     fn=conversation,
-                    inputs=[
-                        session_state,
-                        msg,
-                        num_ctx,
-                        img_ctx,
-                        chat,
-                        temp,
-                        max_tok,
-                        model_dd
-                    ],
-                    outputs=[
-                        chat,
-                        gr.Dataframe(),   # shows retrieved text chunks
-                        gr.Gallery(label="Relevant Images", rows=2, value=[])
-                    ]
                 )
-        # After both tabs are defined, chain the “unhide chat tab” event
         extract_event.then(
             fn=lambda: gr.update(visible=True),
             inputs=[],
@@ -489,13 +397,10 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
         gr.HTML("<center>Made with ❤️ by Zamal</center>")
-    # ─── Wire the Start button ───────────────────────────────────────
     start_btn.click(
         fn=lambda: (gr.update(visible=False), gr.update(visible=True)),
-        inputs=[],
         outputs=[welcome_col, app_col]
     )
 if __name__ == "__main__":
     demo.launch()

 # Add at the top of your module, alongside your other globals
+PERSIST_DIR = "./chroma_db"
+if os.path.exists(PERSIST_DIR):
+    shutil.rmtree(PERSIST_DIR)
 @spaces.GPU()
 def get_image_description(image: Image.Image) -> str:
 def get_vectordb(text: str, images: list[Image.Image], img_names: list[str]):
     """
+    Build a *persistent* ChromaDB instance on disk, with two collections:
       • text_db  (chunks of the PDF text)
       • image_db (image descriptions + raw image bytes)
     """
+    # 1) Make or clean the on-disk folder
+    shutil.rmtree(PERSIST_DIR, ignore_errors=True)
+    os.makedirs(PERSIST_DIR, exist_ok=True)
+    # 2) Persistent client
+    client = chromadb.Client(Settings(
+        chroma_db_impl="duckdb+parquet",
+        persist_directory=PERSIST_DIR
+    ))
+    # 3) Create / wipe collections
     for col in ("text_db", "image_db"):
         if col in [c.name for c in client.list_collections()]:
             client.delete_collection(col)
     text_col = client.get_or_create_collection(
         name="text_db",
+        embedding_function=SHARED_EMB_FN
     )
     img_col = client.get_or_create_collection(
         name="image_db",
         embedding_function=SHARED_EMB_FN,
+        metadata={"hnsw:space": "cosine"}
     )
+    # 4) Add images
     if images:
+        descs, metas = [], []
         for idx, img in enumerate(images):
             try:
+                cap = get_image_description(img)
+            except:
+                cap = "⚠️ could not describe image"
+            descs.append(f"{img_names[idx]}: {cap}")
             metas.append({"image": image_to_bytes(img)})
+        img_col.add(ids=[str(i) for i in range(len(images))],
+                    documents=descs,
+                    metadatas=metas)
+    # 5) Chunk & add text
     splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     docs = splitter.create_documents([text])
+    text_col.add(ids=[str(i) for i in range(len(docs))],
+                 documents=[d.page_content for d in docs])
     return client
 # Text extraction
 def result_to_text(result, as_text=False):
     pages = []
 def extract_data_from_pdfs(
     docs: list[str],
     session: dict,
+    include_images: str,
+    do_ocr: str,
+    ocr_choice: str,
+    vlm_choice: str,
     progress=gr.Progress()
 ):
     if not docs:
         raise gr.Error("No documents to process")
     # 2) Vision–language model
     proc = LlavaNextProcessor.from_pretrained(vlm_choice)
+    vis = (LlavaNextForConditionalGeneration
+           .from_pretrained(vlm_choice, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+           .to("cuda"))
+    # 3) Monkey-patch caption fn
+    def describe(img):
+        torch.cuda.empty_cache(); gc.collect()
         prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
+        inp = proc(prompt, img, return_tensors="pt").to("cuda")
+        out = vis.generate(**inp, max_new_tokens=100)
+        return proc.decode(out[0], skip_special_tokens=True)
+    global get_image_description
     get_image_description = describe
+    # 4) Extract text & images
     progress(0.2, "Extracting text and images…")
     all_text = ""
     images, names = [], []
     for path in docs:
         if local_ocr:
             pdf = DocumentFile.from_pdf(path)
             res = local_ocr(pdf)
             all_text += result_to_text(res, as_text=True) + "\n\n"
         else:
+            all_text += (PdfReader(path).pages[0].extract_text() or "") + "\n\n"
         if include_images == "Include Images":
             imgs = extract_images([path])
             images.extend(imgs)
             names.extend([os.path.basename(path)] * len(imgs))
+    # 5) Build + persist the vectordb
     progress(0.6, "Indexing in vector DB…")
+    client = get_vectordb(all_text, images, names)
+    # 6) Mark session and return UI outputs
     session["processed"] = True
+    session["persist_directory"] = PERSIST_DIR
     sample_imgs = images[:4] if include_images == "Include Images" else []
     return (
+        session,               # gr.State
+        all_text[:2000] + "...",
+        sample_imgs,
+        "<h3>Done!</h3>"
     )
 # Chat function
 def conversation(
     session: dict,
     max_tok: int,
     model_id: str
 ):
+    pd = session.get("persist_directory")
+    if not session.get("processed") or not pd:
         raise gr.Error("Please extract data first")
+    # 1) Reopen the same persistent client
+    client = chromadb.Client(Settings(
+        chroma_db_impl="duckdb+parquet",
+        persist_directory=pd
+    ))
+    # 2) Text retrieval
+    text_col = client.get_collection("text_db")
+    docs = text_col.query(query_texts=[question],
+                          n_results=int(num_ctx),
+                          include=["documents"])["documents"][0]
+    # 3) Image retrieval
+    img_col = client.get_collection("image_db")
+    img_q = img_col.query(query_texts=[question],
+                          n_results=int(img_ctx),
+                          include=["metadatas","documents"])
     img_descs = img_q["documents"][0] or ["No images found"]
     images = []
     for meta in img_q["metadatas"][0]:
+        b64 = meta.get("image","")
         try:
             images.append(Image.open(io.BytesIO(base64.b64decode(b64))))
         except:
             pass
     img_desc = "\n".join(img_descs)
+    # 4) Build prompt & call LLM
+    llm = HuggingFaceEndpoint(
+        repo_id=model_id,
+        temperature=temp,
+        max_new_tokens=max_tok,
+        huggingfacehub_api_token=HF_TOKEN
+    )
     prompt = PromptTemplate(
         template="""
 Context:
 {q}
 Answer:
+""", input_variables=["text","img_desc","q"]
     )
+    inp = prompt.format(text="\n\n".join(docs), img_desc=img_desc, q=question)
     try:
+        answer = llm.invoke(inp)
     except HfHubHTTPError as e:
+        answer = "❌ Model not hosted" if e.response.status_code==404 else f"⚠️ HF error: {e}"
     except Exception as e:
         answer = f"⚠️ Unexpected error: {e}"
     new_history = history + [
+        {"role":"user", "content":question},
+        {"role":"assistant","content":answer}
     ]
     return new_history, docs, images
 # ─────────────────────────────────────────────────────────────────────────────
 # Gradio UI
 CSS = """
 ]
 with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
     session_state = gr.State({})
     with gr.Column(visible=True) as welcome_col:
+        gr.Markdown(f"<div style='text-align:center'>{WELCOME_INTRO}</div>")
         start_btn = gr.Button("🚀 Start")
     with gr.Column(visible=False) as app_col:
         gr.Markdown("## 📚 Multimodal Chat-PDF Playground")
         extract_event = None
         with gr.Tabs() as tabs:
             with gr.TabItem("1. Upload & Extract"):
+                docs = gr.File(file_count="multiple", file_types=[".pdf"], label="Upload PDFs")
+                include_dd = gr.Radio(["Include Images","Exclude Images"],"Exclude Images","Images")
+                ocr_radio = gr.Radio(["Get Text With OCR","Get Available Text Only"],"Get Available Text Only","OCR")
+                ocr_dd    = gr.Dropdown(list(OCR_CHOICES.keys()), list(OCR_CHOICES.keys())[0], "OCR Model")
+                vlm_dd    = gr.Dropdown(["llava-hf/llava-v1.6-mistral-7b-hf","llava-hf/llava-v1.5-mistral-7b"], "llava-hf/llava-v1.6-mistral-7b-hf", "Vision-Language Model")
+                extract_btn  = gr.Button("Extract")
+                preview_text = gr.Textbox(lines=10, label="Sample Text", interactive=False)
+                preview_img  = gr.Gallery(label="Sample Images", rows=2, value=[])
                 preview_html = gr.HTML()
                 extract_event = extract_btn.click(
                     fn=extract_data_from_pdfs,
+                    inputs=[docs, session_state, include_dd, ocr_radio, ocr_dd, vlm_dd],
+                    outputs=[session_state, preview_text, preview_img, preview_html]
                 )
             with gr.TabItem("2. Chat", visible=False) as chat_tab:
                 with gr.Row():
                     with gr.Column(scale=3):
                         chat = gr.Chatbot(type="messages", label="Chat")
+                        msg  = gr.Textbox(placeholder="Ask about your PDF...", label="Your question")
                         send = gr.Button("Send")
                     with gr.Column(scale=1):
+                        model_dd = gr.Dropdown(MODEL_OPTIONS, MODEL_OPTIONS[0], "Choose Chat Model")
+                        num_ctx  = gr.Slider(1,20, value=3, label="Text Contexts")
+                        img_ctx  = gr.Slider(1,10, value=2, label="Image Contexts")
+                        temp     = gr.Slider(0.1,1.0, step=0.1, value=0.4, label="Temperature")
+                        max_tok  = gr.Slider(10,1000, step=10, value=200, label="Max Tokens")
                 send.click(
                     fn=conversation,
+                    inputs=[session_state, msg, num_ctx, img_ctx, chat, temp, max_tok, model_dd],
+                    outputs=[chat, gr.Dataframe(), gr.Gallery(label="Relevant Images", rows=2, value=[])]
                 )
+        # Unhide the Chat tab once extraction completes
         extract_event.then(
             fn=lambda: gr.update(visible=True),
             inputs=[],
         gr.HTML("<center>Made with ❤️ by Zamal</center>")
     start_btn.click(
         fn=lambda: (gr.update(visible=False), gr.update(visible=True)),
         outputs=[welcome_col, app_col]
     )
 if __name__ == "__main__":
     demo.launch()