Spaces: Runtime error

Update app.py

app.py CHANGED
@@ -1,62 +1,77 @@
import gradio as gr
import torch
-import importlib
-from transformers import AutoConfig, AutoProcessor, AutoModelForCausalLM
-from transformers.models.llava.configuration_llava import LlavaConfig
-
-# --- Diagnostic: Load the configuration ---
-config = AutoConfig.from_pretrained("lmms-lab/LLaVA-Video-7B-Qwen2", trust_remote_code=True)
-print("Configuration type:", type(config))
-print("Configuration architectures:", config.architectures)
-
-# Expecting the architecture name to be "LlavaQwenForCausalLM"
-arch = config.architectures[0]  # This should be "LlavaQwenForCausalLM"
-
-# --- Dynamic Import: Retrieve the model class by name ---
-# Import the module that (should) contain the custom model class.
-module = importlib.import_module("transformers.models.llava.modeling_llava")
-try:
-    model_cls = getattr(module, arch)
-    print("Successfully imported model class:", model_cls)
-except AttributeError:
-    raise ImportError(f"Cannot find class {arch} in module transformers.models.llava.modeling_llava")
-
-# --- Register the Custom Model Class ---
-# This tells the auto loader that for LlavaConfig, use our dynamically imported model class.
-AutoModelForCausalLM.register(LlavaConfig, model_cls)
-
-# --- Load Processor and Model ---
-processor = AutoProcessor.from_pretrained(
-    "lmms-lab/LLaVA-Video-7B-Qwen2",
-    trust_remote_code=True
-)
-model = AutoModelForCausalLM.from_pretrained(
-    "lmms-lab/LLaVA-Video-7B-Qwen2",
-    trust_remote_code=True
-)
-
-# Move model to GPU if available
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
-
-def analyze_video(video_path):
-    prompt = "Analyze this video of a concert and determine the moment when the crowd is most engaged."
-    # Process the text and video input
-    inputs = processor(text=prompt, video=video_path, return_tensors="pt")
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-    # Generate output (assuming the custom model implements generate)
-    outputs = model.generate(**inputs, max_new_tokens=100)
-    answer = processor.decode(outputs[0], skip_special_tokens=True)
-    return answer
-
-# Create the Gradio Interface
-iface = gr.Interface(
-    fn=analyze_video,
-    inputs=gr.Video(label="Upload Concert/Event Video", type="filepath"),
-    outputs=gr.Textbox(label="Engagement Analysis"),
-    title="Crowd Engagement Analyzer",
-    description="Upload a video of a concert or event and the model will analyze the moment when the crowd is most engaged."
-)

if __name__ == "__main__":
-
import gradio as gr
import torch
+from transformers import AutoModel, AutoTokenizer

+# Set the model path (this is the repository/model ID on Hugging Face)
+model_path = "OpenGVLab/InternVideo2_5_Chat_8B"
+
+# Load the tokenizer and model with remote code enabled.
+# .half() converts the model to FP16 and .cuda() moves it to GPU (if available).
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
+
+# Get the image processor from the vision tower (if needed by the model's implementation)
+image_processor = model.get_vision_tower().image_processor
+
+# Evaluation settings
+max_num_frames = 512
+generation_config = {
+    "do_sample": False,
+    "temperature": 0.0,
+    "max_new_tokens": 1024,
+    "top_p": 0.1,
+    "num_beams": 1,
+}
+
+# Define a chat function that performs either single-turn or multi-turn conversation.
+def chat_interface(video_path, user_prompt, chat_history):
+    """
+    Performs a chat turn with the model. If no chat_history is provided,
+    it starts a new conversation.
+
+    Parameters:
+        video_path (str): The filepath of the uploaded video.
+        user_prompt (str): The user's question.
+        chat_history (list): The conversation history (empty list for a new conversation).
+
+    Returns:
+        A tuple containing the model's output (str) and the updated chat history (list).
+    """
+    if chat_history is None:
+        chat_history = []
+    # The model.chat() method returns output and updated history.
+    output, new_history = model.chat(
+        video_path=video_path,
+        tokenizer=tokenizer,
+        user_prompt=user_prompt,
+        chat_history=chat_history,
+        return_history=True,
+        max_num_frames=max_num_frames,
+        generation_config=generation_config
+    )
+    return output, new_history
+
+# Build the Gradio interface.
+with gr.Blocks() as demo:
+    gr.Markdown("## InternVideo2_5_Chat_8B Chat Interface")
+    with gr.Row():
+        video_input = gr.Video(label="Upload Video", type="filepath")
+        question_input = gr.Textbox(label="Enter your question", placeholder="Type your question here...")
+    # We'll use a hidden state to keep the conversation history.
+    chat_state = gr.State([])
+    output_text = gr.Textbox(label="Model Response")
+
+    def process_chat(video, question, history):
+        response, new_history = chat_interface(video, question, history)
+        return response, new_history
+
+    send_btn = gr.Button("Send")
+    send_btn.click(
+        process_chat,
+        inputs=[video_input, question_input, chat_state],
+        outputs=[output_text, chat_state]
+    )
+
+# Launch the app.
if __name__ == "__main__":
+    demo.launch()
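The removed version tried to resolve the LLaVA-Video checkpoint by dynamically importing a model class and registering it with the Auto loader via AutoModelForCausalLM.register(LlavaConfig, model_cls). For reference, a minimal sketch of that registration mechanism with placeholder classes; MyConfig and MyModel are illustrative, not classes from the LLaVA repo:

from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel

class MyConfig(PretrainedConfig):
    # model_type is the key the Auto classes use to look this config up
    model_type = "my-video-lm"

class MyModel(PreTrainedModel):
    config_class = MyConfig
    # ... layers and forward() omitted in this sketch ...

# Register the pair so AutoModelForCausalLM.from_pretrained() can resolve
# checkpoints whose config deserializes to MyConfig.
AutoConfig.register("my-video-lm", MyConfig)
AutoModelForCausalLM.register(MyConfig, MyModel)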
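The new script calls .half().cuda() unconditionally, which assumes the Space is running on CUDA hardware. A device-aware loading sketch, assuming the same model ID and trust_remote_code path; the torch_dtype selection and CPU fallback are editorial suggestions, not part of this commit:

import torch
from transformers import AutoModel, AutoTokenizer

model_path = "OpenGVLab/InternVideo2_5_Chat_8B"

# Pick a device and dtype that match the available hardware:
# FP16 on GPU, FP32 on CPU (half precision is poorly supported on CPU).
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = (
    AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=dtype)
    .to(device)
    .eval()
)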
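Because chat_interface returns both the answer and the updated history, multi-turn use is just a matter of feeding the returned history back into the next call. A usage sketch against the helper defined above; it relies on the model.chat() keyword signature shown in the diff, which comes from the model's remote code rather than a documented transformers API, and the video filename is hypothetical:

# Two turns over the same (hypothetical) video file.
history = []  # an empty list starts a fresh conversation

answer_1, history = chat_interface(
    "concert.mp4",
    "When does the crowd seem most engaged?",
    history,
)

answer_2, history = chat_interface(
    "concert.mp4",
    "Describe what happens on stage at that moment.",
    history,
)

print(answer_1)
print(answer_2)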