Spaces:

jawakja
/

Mulktimodal_chatbot

Sleeping

App Files Files Community

jawakja committed on May 16

Commit

48fbb50

verified ·

1 Parent(s): 4ec7065

Create app.py

Browse files

Files changed (1) hide show

app.py +176 -0

app.py ADDED Viewed

	@@ -0,0 +1,176 @@

+import gradio as gr
+import fitz  # PyMuPDF
+import torch
+import cv2
+import os
+import tempfile
+import shutil
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from sentence_transformers import SentenceTransformer
+import faiss
+# Load Qwen-VL-Chat
+# Both the tokenizer and the model are loaded once at import time and shared
+# by every request. trust_remote_code=True lets transformers run the custom
+# Python code shipped in the model repository.
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    "Qwen/Qwen-VL-Chat",
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    trust_remote_code=True
+).eval()
+# Embedding model
+# Used to embed both the PDF chunks and the user query for retrieval.
+embed_model = SentenceTransformer('all-MiniLM-L6-v2')
+# Global state for FAISS
+# `chunks` and `index` are module-level and rebound by multimodal_chat via
+# `global` each time a PDF is processed.
+# NOTE(review): being module-level, they look shared across all sessions - confirm that is intended.
+chunks = []
+index = None
+# PDF processing
+def extract_chunks_from_pdf(pdf_path, chunk_size=1000, overlap=200):
+    doc = fitz.open(pdf_path)
+    text = ""
+    for page in doc:
+        text += page.get_text()
+    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - overlap)]
+def build_faiss_index(chunks):
+    embeddings = embed_model.encode(chunks, convert_to_numpy=True)
+    dim = embeddings.shape[1]
+    idx = faiss.IndexFlatL2(dim)
+    idx.add(embeddings)
+    return idx
+def rag_query(query, chunks, index, top_k=3):
+    q_emb = embed_model.encode([query], convert_to_numpy=True)
+    D, I = index.search(q_emb, top_k)
+    return "\n\n".join([chunks[i] for i in I[0]])
+# Vision/Text chat
+def chat_with_qwen(text=None, image=None):
+    elements = []
+    if image:
+        elements.append({"image": image})
+    if text:
+        elements.append({"text": text})
+    if not elements:
+        return "Please upload or type something."
+    query = tokenizer.from_list_format(elements)
+    response, _ = model.chat(tokenizer, query, history=None)
+    return response
+# Video frame extraction
+def extract_video_frames(video_path, max_frames=3):
+    cap = cv2.VideoCapture(video_path)
+    frames, count = [], 0
+    while len(frames) < max_frames:
+        success, frame = cap.read()
+        if not success:
+            break
+        frames.append(frame)
+        count += 1
+        cap.set(cv2.CAP_PROP_POS_FRAMES, count * 30)
+    cap.release()
+    return frames
+# Main chatbot logic
+def multimodal_chat(message, history, image=None, video=None, pdf=None):
+    global chunks, index
+    # PDF-based RAG
+    if pdf:
+        chunks = extract_chunks_from_pdf(pdf.name)
+        index = build_faiss_index(chunks)
+        context = rag_query(message, chunks, index)
+        final_prompt = f"Context:\n{context}\n\nQuestion: {message}"
+        response = chat_with_qwen(final_prompt)
+        return response
+    # Image
+    if image:
+        response = chat_with_qwen(message, image)
+        return response
+    # Video (extract frames and send all in one call)
+    if video:
+        temp_dir = tempfile.mkdtemp()
+        video_path = os.path.join(temp_dir, "vid.mp4")
+        shutil.copy(video, video_path)
+        frames = extract_video_frames(video_path)
+        # Save and collect image paths
+        images = []
+        for i, frame in enumerate(frames):
+            temp_img_path = os.path.join(temp_dir, f"frame_{i}.jpg")
+            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            cv2.imwrite(temp_img_path, frame_rgb)
+            images.append(temp_img_path)
+        # Combine all frames and text into one query
+        elements = [{"image": img} for img in images]
+        if message:
+            elements.append({"text": message})
+        query = tokenizer.from_list_format(elements)
+        response, _ = model.chat(tokenizer, query, history=None)
+        return response
+    # Text only
+    if message:
+        return chat_with_qwen(message)
+    return "Please input a message, image, video, or PDF."
+# ---- Gradio UI ---- #
+with gr.Blocks(css="""
+body {
+background-color: #f3f6fc;
+}
+.gradio-container {
+font-family: 'Segoe UI', sans-serif;
+}
+h1 {
+background: linear-gradient(to right, #667eea, #764ba2);
+color: white !important;
+padding: 1rem;
+border-radius: 12px;
+margin-bottom: 0.5rem;
+}
+p {
+font-size: 1rem;
+color: white;
+}
+.gr-box {
+background-color: white;
+border-radius: 12px;
+box-shadow: 0 0 10px rgba(0,0,0,0.05);
+padding: 16px;
+}
+footer {display: none !important;}
+""") as demo:
+    gr.Markdown(
+        "<h1 style='text-align: center;'>Multimodal Chatbot powered by LLAVACMVRL and QWEN-VL</h1>"
+        "<p style='text-align: center;'>Ask questions with text, images, videos, or PDFs in a smart and multimodal way.</p>"
+    )
+    chatbot = gr.Chatbot(show_label=False, height=450)
+    state = gr.State([])
+    with gr.Row():
+        txt = gr.Textbox(show_label=False, placeholder="Type a message...", scale=5)
+        send_btn = gr.Button("🚀 Send", scale=1)
+    with gr.Row():
+        image_input = gr.Image(type="filepath", label="Upload Image")
+        video_input = gr.Video(label="Upload Video")
+        pdf_input = gr.File(file_types=[".pdf"], label="Upload PDF")
+    def user_send(message, history, image, video, pdf):
+        response = multimodal_chat(message, history, image, video, pdf)
+        history.append((message, response))
+        return "", history
+    send_btn.click(user_send, [txt, state, image_input, video_input, pdf_input], [txt, chatbot])
+    txt.submit(user_send, [txt, state, image_input, video_input, pdf_input], [txt, chatbot])
+# Launch the app
+demo.launch()