Spaces:

zamal
/

Multimodal-Chat-Playground

Running on Zero

App Files Community

zamal committed on May 30

Commit

82895ea

verified ·

1 Parent(s): cd8c42c

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -48

app.py CHANGED Viewed

@@ -51,11 +51,19 @@ vision_model = LlavaNextForConditionalGeneration.from_pretrained(
 ).to("cuda")
 @spaces.GPU()
 def get_image_description(image: Image.Image) -> str:
     global processor, vision_model
-    # on first call, load & move to cuda
     if processor is None or vision_model is None:
         processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
         vision_model = LlavaNextForConditionalGeneration.from_pretrained(
@@ -64,9 +72,9 @@ def get_image_description(image: Image.Image) -> str:
             low_cpu_mem_usage=True
         ).to("cuda")
     torch.cuda.empty_cache()
     gc.collect()
     prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
     inputs = processor(prompt, image, return_tensors="pt").to("cuda")
     output = vision_model.generate(**inputs, max_new_tokens=100)
@@ -166,23 +174,22 @@ def extract_data_from_pdfs(
     progress=gr.Progress()
 ):
     """
-    1) Dynamically instantiate the chosen OCR pipeline (if any)
-    2) Dynamically instantiate the chosen vision‐language model
-    3) Monkey‐patch get_image_description to use that VL model
-    4) Extract text & images, index into ChromaDB
     """
     if not docs:
         raise gr.Error("No documents to process")
-    # ——— 1) OCR setup (if requested) —————————————————————
     if do_ocr == "Get Text With OCR":
         db_m, crnn_m = OCR_CHOICES[ocr_choice]
         local_ocr = ocr_predictor(db_m, crnn_m, pretrained=True, assume_straight_pages=True)
     else:
         local_ocr = None
-    # ——— 2) Vision‐language model setup ——————————————————
-    # Load processor + model *inside* the GPU worker
     proc = LlavaNextProcessor.from_pretrained(vlm_choice)
     vis = (
         LlavaNextForConditionalGeneration
@@ -190,25 +197,24 @@ def extract_data_from_pdfs(
         .to("cuda")
     )
-    # ——— 3) Monkey‐patch get_image_description —————————————————
     def describe(img: Image.Image) -> str:
-        torch.cuda.empty_cache()
-        gc.collect()
         prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
         inputs = proc(prompt, img, return_tensors="pt").to("cuda")
         output = vis.generate(**inputs, max_new_tokens=100)
         return proc.decode(output[0], skip_special_tokens=True)
-    global get_image_description
     get_image_description = describe
-    # ——— 4) Extract text & images —————————————————————
     progress(0.2, "Extracting text and images…")
     all_text = ""
     images, names = [], []
     for path in docs:
-        # text extraction
         if local_ocr:
             pdf = DocumentFile.from_pdf(path)
             res = local_ocr(pdf)
@@ -217,43 +223,48 @@ def extract_data_from_pdfs(
             txt = PdfReader(path).pages[0].extract_text() or ""
             all_text += txt + "\n\n"
-        # image extraction
         if include_images == "Include Images":
             imgs = extract_images([path])
             images.extend(imgs)
             names.extend([os.path.basename(path)] * len(imgs))
-    # ——— 5) Index into ChromaDB —————————————————————
     progress(0.6, "Indexing in vector DB…")
-    vdb = get_vectordb(all_text, images, names)
-    # mark session done & prepare outputs
     session["processed"] = True
     sample_imgs = images[:4] if include_images == "Include Images" else []
     return (
-        vdb,
-        session,
-        gr.Row(visible=True),
         all_text[:2000] + "...",
         sample_imgs,
         "<h3>Done!</h3>"
     )
 # Chat function
 def conversation(
-    vdb, question: str, num_ctx, img_ctx,
-    history: list, temp: float, max_tok: int, model_id: str
 ):
-    # 0) Cast the context sliders to ints
-    num_ctx = int(num_ctx)
-    img_ctx = int(img_ctx)
-    # 1) Guard: must have extracted first
-    if vdb is None:
         raise gr.Error("Please extract data first")
-    # 2) Instantiate the chosen HF endpoint
     llm = HuggingFaceEndpoint(
         repo_id=model_id,
         temperature=temp,
@@ -261,23 +272,22 @@ def conversation(
         huggingfacehub_api_token=HF_TOKEN
     )
-    # 3) Query text collection
-    text_col = vdb.get_collection("text_db")
     docs = text_col.query(
         query_texts=[question],
-        n_results=num_ctx,              # now an int
         include=["documents"]
     )["documents"][0]
-    # 4) Query image collection
-    img_col = vdb.get_collection("image_db")
     img_q = img_col.query(
         query_texts=[question],
-        n_results=img_ctx,              # now an int
         include=["metadatas", "documents"]
     )
-    # … rest unchanged …
-    images, img_descs = [], img_q["documents"][0] or ["No images found"]
     for meta in img_q["metadatas"][0]:
         b64 = meta.get("image", "")
         try:
@@ -286,7 +296,7 @@ def conversation(
             pass
     img_desc = "\n".join(img_descs)
-    # 5) Build prompt
     prompt = PromptTemplate(
         template="""
 Context:
@@ -302,10 +312,12 @@ Answer:
 """,
         input_variables=["text", "img_desc", "q"],
     )
-    context = "\n\n".join(docs)
-    user_input = prompt.format(text=context, img_desc=img_desc, q=question)
-    # 6) Call the model with error handling
     try:
         answer = llm.invoke(user_input)
     except HfHubHTTPError as e:
@@ -316,13 +328,10 @@ Answer:
     except Exception as e:
         answer = f"⚠️ Unexpected error: {e}"
-    # 7) Append to history
     new_history = history + [
-        {"role":"user",      "content": question},
-        {"role":"assistant","content": answer}
     ]
-    # 8) Return updated history, docs, images
     return new_history, docs, images

 ).to("cuda")
+# Add at the top of your module, alongside your other globals
+CURRENT_VDB = None
 @spaces.GPU()
 def get_image_description(image: Image.Image) -> str:
+    """
+    Lazy-loads the Llava processor + model into the GPU worker,
+    runs captioning, and returns a one-sentence description.
+    """
     global processor, vision_model
+    # First-call: instantiate + move to CUDA
     if processor is None or vision_model is None:
         processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
         vision_model = LlavaNextForConditionalGeneration.from_pretrained(
             low_cpu_mem_usage=True
         ).to("cuda")
+    # clear and run
     torch.cuda.empty_cache()
     gc.collect()
     prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
     inputs = processor(prompt, image, return_tensors="pt").to("cuda")
     output = vision_model.generate(**inputs, max_new_tokens=100)
     progress=gr.Progress()
 ):
     """
+    1) (Optional) OCR setup
+    2) V+L model setup & monkey-patch get_image_description
+    3) Extract text and images
+    4) Build and store vector DB in global CURRENT_VDB
     """
     if not docs:
         raise gr.Error("No documents to process")
+    # 1) OCR instantiation if requested
     if do_ocr == "Get Text With OCR":
         db_m, crnn_m = OCR_CHOICES[ocr_choice]
         local_ocr = ocr_predictor(db_m, crnn_m, pretrained=True, assume_straight_pages=True)
     else:
         local_ocr = None
+    # 2) Vision–language model instantiation
     proc = LlavaNextProcessor.from_pretrained(vlm_choice)
     vis = (
         LlavaNextForConditionalGeneration
         .to("cuda")
     )
+    # Monkey-patch global captioning fn
     def describe(img: Image.Image) -> str:
+        torch.cuda.empty_cache(); gc.collect()
         prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"
         inputs = proc(prompt, img, return_tensors="pt").to("cuda")
         output = vis.generate(**inputs, max_new_tokens=100)
         return proc.decode(output[0], skip_special_tokens=True)
+    global get_image_description, CURRENT_VDB
     get_image_description = describe
+    # 3) Extract text & images
     progress(0.2, "Extracting text and images…")
     all_text = ""
     images, names = [], []
     for path in docs:
+        # text
         if local_ocr:
             pdf = DocumentFile.from_pdf(path)
             res = local_ocr(pdf)
             txt = PdfReader(path).pages[0].extract_text() or ""
             all_text += txt + "\n\n"
+        # images
         if include_images == "Include Images":
             imgs = extract_images([path])
             images.extend(imgs)
             names.extend([os.path.basename(path)] * len(imgs))
+    # 4) Build and stash the vector DB
     progress(0.6, "Indexing in vector DB…")
+    CURRENT_VDB = get_vectordb(all_text, images, names)
+    # mark done & return only picklable outputs
     session["processed"] = True
     sample_imgs = images[:4] if include_images == "Include Images" else []
     return (
+        session,               # gr.State for “processed”
+        gr.Row(visible=True),  # to un‐hide your chat UI
         all_text[:2000] + "...",
         sample_imgs,
         "<h3>Done!</h3>"
     )
 # Chat function
 def conversation(
+    session: dict,
+    question: str,
+    num_ctx: int,
+    img_ctx: int,
+    history: list,
+    temp: float,
+    max_tok: int,
+    model_id: str
 ):
+    """
+    Pulls CURRENT_VDB from module global, runs text+image retrieval,
+    calls the HF endpoint, and returns updated chat history.
+    """
+    global CURRENT_VDB
+    if not session.get("processed") or CURRENT_VDB is None:
         raise gr.Error("Please extract data first")
     llm = HuggingFaceEndpoint(
         repo_id=model_id,
         temperature=temp,
         huggingfacehub_api_token=HF_TOKEN
     )
+    # Retrieve top‐k text & images
+    text_col = CURRENT_VDB.get_collection("text_db")
     docs = text_col.query(
         query_texts=[question],
+        n_results=int(num_ctx),
         include=["documents"]
     )["documents"][0]
+    img_col = CURRENT_VDB.get_collection("image_db")
     img_q = img_col.query(
         query_texts=[question],
+        n_results=int(img_ctx),
         include=["metadatas", "documents"]
     )
+    img_descs = img_q["documents"][0] or ["No images found"]
+    images = []
     for meta in img_q["metadatas"][0]:
         b64 = meta.get("image", "")
         try:
             pass
     img_desc = "\n".join(img_descs)
+    # Build and call prompt
     prompt = PromptTemplate(
         template="""
 Context:
 """,
         input_variables=["text", "img_desc", "q"],
     )
+    user_input = prompt.format(
+        text="\n\n".join(docs),
+        img_desc=img_desc,
+        q=question
+    )
     try:
         answer = llm.invoke(user_input)
     except HfHubHTTPError as e:
     except Exception as e:
         answer = f"⚠️ Unexpected error: {e}"
     new_history = history + [
+        {"role": "user",      "content": question},
+        {"role": "assistant", "content": answer}
     ]
     return new_history, docs, images