# app_fast.py - Vintern-1B Fast Version
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
import time
import traceback

# Setup
device = "cpu"
model = None
tokenizer = None
transform = None


def build_transform(input_size=448):
    """Resize + normalize transform for the vision encoder."""
    IMAGENET_MEAN = (0.485, 0.456, 0.406)
    IMAGENET_STD = (0.229, 0.224, 0.225)
    return T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if hasattr(img, 'mode') and img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    ])


def load_model():
    """Load Vintern-1B (faster version)."""
    global model, tokenizer, transform
    try:
        print("🚀 Loading Vintern-1B (Fast Version)...")

        # Use the lighter model (v2 instead of v3.5)
        model_name = "5CD-AI/Vintern-1B-v2"

        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True
        )

        model = AutoModel.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )

        # Put the model in inference mode.
        # Note: torch.jit.optimize_for_inference() only accepts a frozen
        # ScriptModule, not an eager Hugging Face model, so it is not used here.
        model.eval()

        transform = build_transform()
        print("✅ Fast model loaded!")
        return True

    except Exception as e:
        print(f"❌ Error: {e}")
        traceback.print_exc()
        return False


def fast_analyze(image):
    """Optimized analysis function."""
    if model is None:
        return "❌ Model is not ready"

    try:
        start_time = time.time()

        # Quick image validation / conversion
        if image is None:
            return "❌ No image provided"
        if hasattr(image, 'mode') and image.mode != 'RGB':
            image = image.convert('RGB')

        # Fast transform
        image_tensor = transform(image).unsqueeze(0).to(device)

        with torch.no_grad():
            # Short prompt; Vintern is a Vietnamese model, so the prompt
            # stays in Vietnamese ("Mô tả ngắn gọn:" = "Describe briefly:").
            query = "Mô tả ngắn gọn:"
            try:
                result = model.chat(
                    tokenizer,
                    image_tensor,
                    query,
                    generation_config=dict(
                        max_new_tokens=100,  # shorter output -> faster
                        do_sample=False,     # greedy decoding -> faster
                        num_beams=1          # no beam search -> faster
                    )
                )
            except Exception:
                # Fast text-only fallback (note: this path ignores the image)
                inputs = tokenizer(query, return_tensors="pt").to(device)
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=80,
                    do_sample=False,
                    num_beams=1
                )
                result = tokenizer.decode(outputs[0], skip_special_tokens=True)
                result = result.replace(query, "").strip()

        processing_time = time.time() - start_time

        return f"""**📝 Quick description:**
{result}

**⚡ Time:** {processing_time:.1f}s
**🤖 Model:** Vintern-1B-v2 (Optimized)
**💨 Speed:** {1 / processing_time:.1f} images/s

---
*Model tuned for speed - suitable for near real-time use*
"""

    except Exception as e:
        return f"❌ Error: {str(e)}"


# Load model
print("🚀 Starting Fast Vintern Server...")
model_loaded = load_model()

# Lightweight Gradio interface
with gr.Blocks(
    title="Vintern-1B Fast",
    theme=gr.themes.Base(),
) as demo:
    gr.Markdown("# ⚡ Vintern-1B Fast - High Speed")

    if model_loaded:
        gr.Markdown("✅ **Model ready!** Optimized for speed and near real-time use.")
    else:
        gr.Markdown("❌ **Model failed to load.** Check the server logs.")

    with gr.Row():
        image_input = gr.Image(type="pil", label="📤 Upload Image")
        result_output = gr.Textbox(
            label="📋 Result",
            lines=8,
            show_copy_button=True
        )

    # Auto-analyze on upload
    image_input.change(
        fn=fast_analyze,
        inputs=image_input,
        outputs=result_output
    )

    gr.Markdown("""
### ⚡ Speed optimizations:
- **Light model**: Vintern-1B-v2 (~1.5 GB)
- **Fast generation**: greedy decoding, short output
- **Inference-only**: eval mode, no beam search
- **Near real-time**: ~2-5 s/image on CPU
""")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
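
# ---------------------------------------------------------------------------
# Usage sketch (assumptions, not part of the original script: package names
# are the usual PyPI ones, and Vintern's trust_remote_code may additionally
# pull in einops/timm):
#
#   pip install torch torchvision transformers gradio pillow einops timm
#   python app_fast.py
#
# Then open http://localhost:7860 (or http://<server-ip>:7860, since the app
# binds to 0.0.0.0). Uploading an image fires the image_input.change event,
# which runs fast_analyze() automatically.
#
# Minimal headless smoke test (a sketch; assumes a local file "test.jpg"):
#
#   from PIL import Image
#   if load_model():
#       print(fast_analyze(Image.open("test.jpg")))
# ---------------------------------------------------------------------------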