Update README.md
README.md
CHANGED
@@ -37,7 +37,7 @@ We have two ways of using our model for this application. The first is the recom
 import t2v_metrics
 
 ### For a single (video, text) pair:
-qwen_score = t2v_metrics.VQAScore(model='qwen2.5-vl-7b', checkpoint='chancharikm/qwen2.5-vl-7b-cam-motion
+qwen_score = t2v_metrics.VQAScore(model='qwen2.5-vl-7b', checkpoint='chancharikm/qwen2.5-vl-7b-cam-motion')
 video = "videos/baby.mp4" # a video path in string format
 text = "a baby crying"
 # Calculate probability of "Yes" response
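The scoring call itself falls outside this hunk; in the t2v_metrics API a VQAScore object is called directly on lists of media paths and texts. A minimal sketch of the step the snippet above leads into, assuming the usual t2v_metrics call signature (not shown in this diff):

scores = qwen_score(images=[video], texts=[text])  # probability of a "Yes" answer for this (video, text) pair
print(scores)  # one score for one video paired with one text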
@@ -55,7 +55,7 @@ import torch
 
 # Load the model
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    "chancharikm/qwen2.5-vl-7b-cam-motion
+    "chancharikm/qwen2.5-vl-7b-cam-motion", torch_dtype="auto", device_map="auto"
 )
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
 
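With the model and processor loaded as above, inference follows the standard Qwen2.5-VL chat-template flow. A minimal sketch, assuming the qwen_vl_utils helper imported later in this README; the video path and prompt here are placeholders:

from qwen_vl_utils import process_vision_info

messages = [
    {
        "role": "user",
        "content": [
            {"type": "video", "video": "videos/baby.mp4"},
            {"type": "text", "text": "Describe the camera motion in this video."},
        ],
    }
]

# Build the chat prompt and extract the video inputs the processor expects
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[prompt],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(model.device)

# Generate and decode only the newly produced tokens
generated_ids = model.generate(**inputs, max_new_tokens=128)
trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
print(processor.batch_decode(trimmed, skip_special_tokens=True)[0])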
@@ -128,7 +128,7 @@ We have two ways of using our model for this application. The first is the recom
 import t2v_metrics
 
 ### For a single (video, text) pair:
-qwen_score = t2v_metrics.VQAScore(model='qwen2.5-vl-7b', checkpoint='chancharikm/qwen2.5-vl-7b-cam-motion
+qwen_score = t2v_metrics.VQAScore(model='qwen2.5-vl-7b', checkpoint='chancharikm/qwen2.5-vl-7b-cam-motion')
 video = "videos/baby.mp4" # a video path in string format
 text = "Please describe this image: "
 # Calculate probability of "Yes" response
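The same VQAScore object can also score batches; a small sketch, assuming the usual t2v_metrics convention that the result is indexed as scores[i][j] for video i and text j (the second caption is an illustrative placeholder):

videos = ["videos/baby.mp4"]
texts = ["a baby crying", "a dog barking"]
scores = qwen_score(images=videos, texts=texts)  # len(videos) x len(texts) score matrix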
@@ -146,12 +146,12 @@ from qwen_vl_utils import process_vision_info
 
 # default: Load the model on the available device(s)
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    "chancharikm/qwen2.5-vl-7b-cam-motion
+    "chancharikm/qwen2.5-vl-7b-cam-motion", torch_dtype="auto", device_map="auto"
 )
 
 # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
 # model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-#     "chancharikm/qwen2.5-vl-7b-cam-motion
+#     "chancharikm/qwen2.5-vl-7b-cam-motion",
 #     torch_dtype=torch.bfloat16,
 #     attn_implementation="flash_attention_2",
 #     device_map="auto",
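The commented-out alternative loader is cut off at the hunk boundary; completed, and assuming the flash-attn package is installed, it would look roughly like this:

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "chancharikm/qwen2.5-vl-7b-cam-motion",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)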