import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer

# Model name
model_name = "OpenGVLab/InternVideo2_5_Chat_8B"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load model efficiently
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,  # Use float16 for lower memory usage
    device_map="auto",          # Automatically place model on available GPU(s)
)

# Define inference function
def chat_with_model(prompt):
    # Move inputs to wherever device_map placed the model, rather than
    # hard-coding "cuda" (device_map="auto" may shard or fall back to CPU)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_length=200)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Create Gradio UI
demo = gr.Interface(
    fn=chat_with_model,
    inputs=gr.Textbox(placeholder="Type your prompt here..."),
    outputs="text",
    title="InternVideo2.5 Chatbot",
    description="A chatbot powered by InternVideo2_5_Chat_8B.",
    theme="soft",  # "compact" is not a valid theme in current Gradio; "soft" is built in
)

# Run the Gradio app
if __name__ == "__main__":
    demo.launch()
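# --- Optional usage sketch ---
# A minimal way to query the running app programmatically, assuming it is
# served at Gradio's default local URL (http://127.0.0.1:7860) and exposes
# the default Interface endpoint name "/predict":
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")               # connect to the local app
#   reply = client.predict("Hello!", api_name="/predict")  # invokes chat_with_model
#   print(reply)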