File size: 3,766 Bytes
a0f484b
 
 
 
 
 
 
69bba7a
a0f484b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61a09f9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import cv2
import base64
import gradio as gr
from openai import OpenAI

# 1. Frame Extraction
def extract_frames(video_path: str, num_frames: int = 8, max_resolution: int = 720) -> list:
    """Sample up to *num_frames* evenly spaced frames from a video.

    Each frame is downscaled so its longest side is at most *max_resolution*
    pixels, JPEG-encoded at quality 90, and returned as a base64 data URI.

    Args:
        video_path: Path to a video file readable by OpenCV.
        num_frames: Number of frames to sample. Fewer may be returned if the
            video is shorter than *num_frames* or individual reads fail.
        max_resolution: Maximum size (pixels) of the longest frame side.

    Returns:
        List of "data:image/jpeg;base64,..." strings in frame order.

    Raises:
        RuntimeError: If the video cannot be opened or reports no frames.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise RuntimeError(f"Cannot open video file: {video_path}")

    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            # Previously this fell through and silently returned [] —
            # fail loudly so the caller doesn't send an image-less prompt.
            raise RuntimeError(f"Video reports no frames: {video_path}")

        step = max(total_frames // num_frames, 1)
        # Clamp to the last valid index; de-duplicate so short videos don't
        # contribute the same (last) frame multiple times.
        frame_indices = sorted({min(i * step, total_frames - 1) for i in range(num_frames)})

        frames_base64 = []
        for index in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, index)
            ret, frame = cap.read()
            if not ret or frame is None:
                continue  # best-effort: skip unreadable frames

            h, w = frame.shape[:2]
            longest_side = max(h, w)
            if longest_side > max_resolution:
                scale = max_resolution / float(longest_side)
                # INTER_AREA is the recommended interpolation for shrinking.
                frame = cv2.resize(
                    frame,
                    (int(w * scale), int(h * scale)),
                    interpolation=cv2.INTER_AREA,
                )

            success, buffer = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 90])
            if success:
                b64 = base64.b64encode(buffer).decode("utf-8")
                frames_base64.append(f"data:image/jpeg;base64,{b64}")

        return frames_base64
    finally:
        # Release the capture even if decoding raises mid-loop
        # (the original leaked it on any exception after open).
        cap.release()

# 2. Prompt Construction
def build_prompt(frames, question):
    """Assemble the multimodal message content: the question text first,
    followed by one image_url entry per frame data URI."""
    image_parts = [
        {"type": "image_url", "image_url": {"url": data_uri}}
        for data_uri in frames
    ]
    return [{"type": "text", "text": question}] + image_parts

# 3. Nebius Inference Call
def query_qwen(prompt_content):
    """Send the multimodal prompt to Qwen2.5-VL via Nebius' OpenAI-compatible API.

    Args:
        prompt_content: Message content list as built by build_prompt.

    Returns:
        The raw chat-completion response on success, or an {"error": str}
        dict on failure so the caller can render a friendly message.

    Raises:
        ValueError: If NEBIUS_API_KEY is not set in the environment.
    """
    api_key = os.getenv("NEBIUS_API_KEY")
    # SECURITY: removed a debug print() of the API key — never log secrets.
    if not api_key:
        raise ValueError("NEBIUS_API_KEY not found in environment variables.")

    client = OpenAI(api_key=api_key, base_url="https://api.studio.nebius.ai/v1/")
    try:
        response = client.chat.completions.create(
            model="Qwen/Qwen2.5-VL-72B-Instruct",
            messages=[{"role": "user", "content": prompt_content}],
            temperature=0.2,
            max_tokens=512,
        )
        return response
    except Exception as e:
        # Deliberate best-effort: surface API failures as data, not a crash.
        return {"error": str(e)}

# 4. Parse Response
def parse_response(response):
    """Extract the assistant's text from a completion response.

    Handles three shapes: the {"error": ...} dict produced by query_qwen,
    an SDK object whose choice exposes .message, and a plain-dict choice."""
    if isinstance(response, dict) and "error" in response:
        return f"Error: {response['error']}"

    try:
        first_choice = response.choices[0]
        if hasattr(first_choice, "message"):
            return first_choice.message.content.strip()
        # Plain-dict choice: fall back to key lookups with a default message.
        return first_choice.get("message", {}).get("content", "No message received.")
    except Exception as e:
        return f"Failed to parse response: {str(e)}"

# MCP Core Function
def answer_question(video_path: str, question: str) -> str:
    """Run the full video-QA pipeline for one (video, question) pair.

    Samples frames, builds the multimodal prompt, queries the model, and
    returns the parsed answer — or an error string if any step raises."""
    try:
        sampled_frames = extract_frames(video_path)
        message_content = build_prompt(sampled_frames, question)
        return parse_response(query_qwen(message_content))
    except Exception as e:
        return f"Something went wrong: {str(e)}"

# Gradio App UI
def gradio_interface(video, question):
    """Thin adapter that lets Gradio invoke the QA pipeline with widget values."""
    answer = answer_question(video, question)
    return answer

# Build the Blocks UI: video + question inputs, answer output, submit button.
with gr.Blocks(title="🎥 Video QA with Qwen2.5-VL") as demo:
    gr.Markdown("## 🎥 Interactive Video Question Answering\nUpload a video and ask a question about it.")

    with gr.Row():
        uploaded_video = gr.Video(label="Upload Video")
        user_question = gr.Textbox(
            label="Your Question",
            placeholder="e.g., What color was the car in the first scene?",
        )

    model_answer = gr.Textbox(label="Model Answer", lines=3)

    ask_button = gr.Button("Get Answer")
    ask_button.click(
        fn=gradio_interface,
        inputs=[uploaded_video, user_question],
        outputs=model_answer,
    )


# Launch the Gradio interface; mcp_server=True also exposes it as an MCP server.
if __name__ == "__main__":
    demo.launch(mcp_server=True)