saadfarhad committed on
Commit 5dd0627 · verified · 1 Parent(s): 8118925

Update app.py

Files changed (1)
  1. app.py +73 -58
app.py CHANGED
@@ -1,62 +1,77 @@
  import gradio as gr
  import torch
- import importlib
- from transformers import AutoConfig, AutoProcessor, AutoModelForCausalLM
- from transformers.models.llava.configuration_llava import LlavaConfig
-
- # --- Diagnostic: Load the configuration ---
- config = AutoConfig.from_pretrained("lmms-lab/LLaVA-Video-7B-Qwen2", trust_remote_code=True)
- print("Configuration type:", type(config))
- print("Configuration architectures:", config.architectures)
-
- # Expecting the architecture name to be "LlavaQwenForCausalLM"
- arch = config.architectures[0]  # This should be "LlavaQwenForCausalLM"
-
- # --- Dynamic Import: Retrieve the model class by name ---
- # Import the module that (should) contain the custom model class.
- module = importlib.import_module("transformers.models.llava.modeling_llava")
- try:
-     model_cls = getattr(module, arch)
-     print("Successfully imported model class:", model_cls)
- except AttributeError:
-     raise ImportError(f"Cannot find class {arch} in module transformers.models.llava.modeling_llava")
-
- # --- Register the Custom Model Class ---
- # This tells the auto loader that for LlavaConfig, use our dynamically imported model class.
- AutoModelForCausalLM.register(LlavaConfig, model_cls)
-
- # --- Load Processor and Model ---
- processor = AutoProcessor.from_pretrained(
-     "lmms-lab/LLaVA-Video-7B-Qwen2",
-     trust_remote_code=True
- )
- model = AutoModelForCausalLM.from_pretrained(
-     "lmms-lab/LLaVA-Video-7B-Qwen2",
-     trust_remote_code=True
- )
-
- # Move model to GPU if available
- device = "cuda" if torch.cuda.is_available() else "cpu"
- model.to(device)
-
- def analyze_video(video_path):
-     prompt = "Analyze this video of a concert and determine the moment when the crowd is most engaged."
-     # Process the text and video input
-     inputs = processor(text=prompt, video=video_path, return_tensors="pt")
-     inputs = {k: v.to(device) for k, v in inputs.items()}
-     # Generate output (assuming the custom model implements generate)
-     outputs = model.generate(**inputs, max_new_tokens=100)
-     answer = processor.decode(outputs[0], skip_special_tokens=True)
-     return answer
-
- # Create the Gradio Interface
- iface = gr.Interface(
-     fn=analyze_video,
-     inputs=gr.Video(label="Upload Concert/Event Video", type="filepath"),
-     outputs=gr.Textbox(label="Engagement Analysis"),
-     title="Crowd Engagement Analyzer",
-     description="Upload a video of a concert or event and the model will analyze the moment when the crowd is most engaged."
- )
-
  if __name__ == "__main__":
-     iface.launch()
 
  import gradio as gr
  import torch
+ from transformers import AutoModel, AutoTokenizer
+
+ # Set the model path (this is the repository/model ID on Hugging Face)
+ model_path = "OpenGVLab/InternVideo2_5_Chat_8B"
+
+ # Load the tokenizer and model with remote code enabled.
+ # .half() converts the model to FP16 and .cuda() moves it to GPU (if available).
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
+
+ # Get the image processor from the vision tower (if needed by the model's implementation)
+ image_processor = model.get_vision_tower().image_processor
+
+ # Evaluation settings
+ max_num_frames = 512
+ generation_config = {
+     "do_sample": False,
+     "temperature": 0.0,
+     "max_new_tokens": 1024,
+     "top_p": 0.1,
+     "num_beams": 1,
+ }
+
+ # Define a chat function that performs either single-turn or multi-turn conversation.
+ def chat_interface(video_path, user_prompt, chat_history):
+     """
+     Performs a chat turn with the model. If no chat_history is provided,
+     it starts a new conversation.
+
+     Parameters:
+         video_path (str): The filepath of the uploaded video.
+         user_prompt (str): The user's question.
+         chat_history (list): The conversation history (empty list for a new conversation).
+
+     Returns:
+         A tuple containing the model's output (str) and the updated chat history (list).
+     """
+     if chat_history is None:
+         chat_history = []
+     # The model.chat() method returns output and updated history.
+     output, new_history = model.chat(
+         video_path=video_path,
+         tokenizer=tokenizer,
+         user_prompt=user_prompt,
+         chat_history=chat_history,
+         return_history=True,
+         max_num_frames=max_num_frames,
+         generation_config=generation_config
+     )
+     return output, new_history
+
+ # Build the Gradio interface.
+ with gr.Blocks() as demo:
+     gr.Markdown("## InternVideo2_5_Chat_8B Chat Interface")
+     with gr.Row():
+         video_input = gr.Video(label="Upload Video", type="filepath")
+         question_input = gr.Textbox(label="Enter your question", placeholder="Type your question here...")
+     # We'll use a hidden state to keep the conversation history.
+     chat_state = gr.State([])
+     output_text = gr.Textbox(label="Model Response")
+
+     def process_chat(video, question, history):
+         response, new_history = chat_interface(video, question, history)
+         return response, new_history
+
+     send_btn = gr.Button("Send")
+     send_btn.click(
+         process_chat,
+         inputs=[video_input, question_input, chat_state],
+         outputs=[output_text, chat_state]
+     )
+
+ # Launch the app.
  if __name__ == "__main__":
+     demo.launch()
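
Note: the new app.py loads the model with .half().cuda() unconditionally, which will fail on CPU-only hardware since .cuda() requires a CUDA device. A minimal sketch of a device-aware loading variant follows; the float32 CPU fallback is an illustrative assumption and is not part of this commit.

import torch
from transformers import AutoModel, AutoTokenizer

model_path = "OpenGVLab/InternVideo2_5_Chat_8B"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

if torch.cuda.is_available():
    model = model.half().cuda()   # FP16 on GPU, as in the committed code
else:
    model = model.float()         # assumption: fall back to FP32 on CPU (much slower)
model.eval()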