saadfarhad committed on
Commit 5dd0627 · verified · 1 Parent(s): 8118925

Update app.py

Files changed (1)
  1. app.py +73 -58
app.py CHANGED
@@ -1,62 +1,77 @@
  import gradio as gr
  import torch
- import importlib
- from transformers import AutoConfig, AutoProcessor, AutoModelForCausalLM
- from transformers.models.llava.configuration_llava import LlavaConfig
-
- # --- Diagnostic: Load the configuration ---
- config = AutoConfig.from_pretrained("lmms-lab/LLaVA-Video-7B-Qwen2", trust_remote_code=True)
- print("Configuration type:", type(config))
- print("Configuration architectures:", config.architectures)
-
- # Expecting the architecture name to be "LlavaQwenForCausalLM"
- arch = config.architectures[0]  # This should be "LlavaQwenForCausalLM"
-
- # --- Dynamic Import: Retrieve the model class by name ---
- # Import the module that (should) contain the custom model class.
- module = importlib.import_module("transformers.models.llava.modeling_llava")
- try:
-     model_cls = getattr(module, arch)
-     print("Successfully imported model class:", model_cls)
- except AttributeError:
-     raise ImportError(f"Cannot find class {arch} in module transformers.models.llava.modeling_llava")
-
- # --- Register the Custom Model Class ---
- # This tells the auto loader that for LlavaConfig, use our dynamically imported model class.
- AutoModelForCausalLM.register(LlavaConfig, model_cls)
-
- # --- Load Processor and Model ---
- processor = AutoProcessor.from_pretrained(
-     "lmms-lab/LLaVA-Video-7B-Qwen2",
-     trust_remote_code=True
- )
- model = AutoModelForCausalLM.from_pretrained(
-     "lmms-lab/LLaVA-Video-7B-Qwen2",
-     trust_remote_code=True
- )
-
- # Move model to GPU if available
- device = "cuda" if torch.cuda.is_available() else "cpu"
- model.to(device)
-
- def analyze_video(video_path):
-     prompt = "Analyze this video of a concert and determine the moment when the crowd is most engaged."
-     # Process the text and video input
-     inputs = processor(text=prompt, video=video_path, return_tensors="pt")
-     inputs = {k: v.to(device) for k, v in inputs.items()}
-     # Generate output (assuming the custom model implements generate)
-     outputs = model.generate(**inputs, max_new_tokens=100)
-     answer = processor.decode(outputs[0], skip_special_tokens=True)
-     return answer
-
- # Create the Gradio Interface
- iface = gr.Interface(
-     fn=analyze_video,
-     inputs=gr.Video(label="Upload Concert/Event Video", type="filepath"),
-     outputs=gr.Textbox(label="Engagement Analysis"),
-     title="Crowd Engagement Analyzer",
-     description="Upload a video of a concert or event and the model will analyze the moment when the crowd is most engaged."
- )
-
  if __name__ == "__main__":
-     iface.launch()
 
  import gradio as gr
  import torch
+ from transformers import AutoModel, AutoTokenizer
+
+ # Set the model path (this is the repository/model ID on Hugging Face)
+ model_path = "OpenGVLab/InternVideo2_5_Chat_8B"
+
+ # Load the tokenizer and model with remote code enabled.
+ # .half() converts the model to FP16 and .cuda() moves it to GPU (if available).
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
+
+ # Get the image processor from the vision tower (if needed by the model's implementation)
+ image_processor = model.get_vision_tower().image_processor
+
+ # Evaluation settings
+ max_num_frames = 512
+ generation_config = {
+     "do_sample": False,
+     "temperature": 0.0,
+     "max_new_tokens": 1024,
+     "top_p": 0.1,
+     "num_beams": 1,
+ }
+
+ # Define a chat function that performs either single-turn or multi-turn conversation.
+ def chat_interface(video_path, user_prompt, chat_history):
+     """
+     Performs a chat turn with the model. If no chat_history is provided,
+     it starts a new conversation.
+
+     Parameters:
+         video_path (str): The filepath of the uploaded video.
+         user_prompt (str): The user's question.
+         chat_history (list): The conversation history (empty list for a new conversation).
+
+     Returns:
+         A tuple containing the model's output (str) and the updated chat history (list).
+     """
+     if chat_history is None:
+         chat_history = []
+     # The model.chat() method returns output and updated history.
+     output, new_history = model.chat(
+         video_path=video_path,
+         tokenizer=tokenizer,
+         user_prompt=user_prompt,
+         chat_history=chat_history,
+         return_history=True,
+         max_num_frames=max_num_frames,
+         generation_config=generation_config
+     )
+     return output, new_history
+
+ # Build the Gradio interface.
+ with gr.Blocks() as demo:
+     gr.Markdown("## InternVideo2_5_Chat_8B Chat Interface")
+     with gr.Row():
+         video_input = gr.Video(label="Upload Video", type="filepath")
+         question_input = gr.Textbox(label="Enter your question", placeholder="Type your question here...")
+     # We'll use a hidden state to keep the conversation history.
+     chat_state = gr.State([])
+     output_text = gr.Textbox(label="Model Response")
+
+     def process_chat(video, question, history):
+         response, new_history = chat_interface(video, question, history)
+         return response, new_history
+
+     send_btn = gr.Button("Send")
+     send_btn.click(
+         process_chat,
+         inputs=[video_input, question_input, chat_state],
+         outputs=[output_text, chat_state]
+     )
+
+ # Launch the app.
  if __name__ == "__main__":
+     demo.launch()
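
Note: the new app.py loads the model with .half().cuda() unconditionally, which will fail on CPU-only hardware since .cuda() requires a CUDA device. A minimal sketch of a device-aware loading variant follows; the float32 CPU fallback is an illustrative assumption and is not part of this commit.

import torch
from transformers import AutoModel, AutoTokenizer

model_path = "OpenGVLab/InternVideo2_5_Chat_8B"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

if torch.cuda.is_available():
    model = model.half().cuda()   # FP16 on GPU, as in the committed code
else:
    model = model.float()         # assumption: fall back to FP32 on CPU (much slower)
model.eval()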