import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image

# ✅ Define the model name from Hugging Face
MODEL_NAME = "deepseek-ai/deepseek-vl2-small"

# ✅ Pick the device once and reuse it for both the model and the inputs
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32  # fp16 is only reliable on GPU

# ✅ Load model and processor
# trust_remote_code allows the Hub repo to supply custom model/processor classes,
# which some vision-language checkpoints require
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_NAME, torch_dtype=DTYPE, trust_remote_code=True
).to(DEVICE)
model.eval()

# ✅ Test the model with an image
def predict(image_path):
    image = Image.open(image_path).convert("RGB")

    # Process input and move tensors to the same device as the model
    # NOTE: many vision-language processors also expect a text prompt alongside the image
    inputs = processor(images=image, return_tensors="pt").to(DEVICE)

    # Generate output (cap new tokens so generation terminates quickly)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=128)

    # Decode response token IDs back into text
    generated_text = processor.batch_decode(output, skip_special_tokens=True)[0]
    return generated_text

# ✅ Example Usage
if __name__ == "__main__":
    test_image_path = "test.jpg"  # Replace with an actual image path
    print("Generated Output:", predict(test_image_path))