# app_fast.py - Vintern-1B Fast Version
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from PIL import Image
import time
import traceback

# Setup
device = "cpu"
model = None
tokenizer = None
transform = None

def build_transform(input_size=448):
    """Optimized transform"""
    IMAGENET_MEAN = (0.485, 0.456, 0.406)
    IMAGENET_STD = (0.229, 0.224, 0.225)
    
    return T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if hasattr(img, 'mode') and img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    ])
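
# Usage sketch (illustrative, not part of the app's flow): build_transform()
# maps any PIL image to a normalized float tensor of shape (3, 448, 448):
#
#   t = build_transform()
#   print(t(Image.new("RGB", (640, 480))).shape)  # torch.Size([3, 448, 448])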

def load_model():
    """Load Vintern-1B (faster version)"""
    global model, tokenizer, transform
    try:
        print("🚀 Loading Vintern-1B (Fast Version)...")
        
        # Use the lighter v2 checkpoint instead of v3.5 for faster CPU inference
        model_name = "5CD-AI/Vintern-1B-v2"
        
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True
        )
        
        model = AutoModel.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )
        
        # Inference mode (torch.jit.optimize_for_inference requires a scripted
        # module and would raise on this remote-code model, so eager eval is used)
        model.eval()
        
        transform = build_transform()
        
        print("✅ Fast model loaded!")
        return True
        
    except Exception as e:
        print(f"❌ Error: {e}")
        traceback.print_exc()
        return False

def fast_analyze(image):
    """Optimized analysis function"""
    if model is None:
        return "❌ Model chưa sẵn sàng"
    
    try:
        start_time = time.time()
        
        # Quick image processing
        if image is None:
            return "❌ Không có ảnh"
            
        if hasattr(image, 'mode') and image.mode != 'RGB':
            image = image.convert('RGB')
        
        # Single 448×448 patch → pixel_values of shape (1, 3, 448, 448)
        image_tensor = transform(image).unsqueeze(0).to(device)
        
        with torch.no_grad():
            # Short Vietnamese prompt ("Describe briefly:") -- kept in Vietnamese
            # because the model is tuned for Vietnamese input/output
            query = "Mô tả ngắn gọn:"
            
            try:
                result = model.chat(
                    tokenizer,
                    image_tensor,
                    query,
                    generation_config=dict(
                        max_new_tokens=100,  # shorter output → faster
                        do_sample=False,     # greedy decoding → faster
                        num_beams=1          # no beam search → faster
                        # temperature omitted: it is ignored when do_sample=False
                    )
                )
            except Exception:
                # Text-only fallback (note: this path ignores the image)
                inputs = tokenizer(query, return_tensors="pt").to(device)
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=80,
                    do_sample=False,
                    num_beams=1
                )
                result = tokenizer.decode(outputs[0], skip_special_tokens=True)
                result = result.replace(query, "").strip()
            
            processing_time = time.time() - start_time
            
            return f"""**📝 Mô tả nhanh:**
{result}

**⚡ Thời gian:** {processing_time:.1f}s
**🤖 Model:** Vintern-1B-v2 (Optimized)
**💨 Tốc độ:** {1/processing_time:.1f} FPS

---
*Model được tối ưu cho tốc độ - phù hợp real-time*
"""
    
    except Exception as e:
        return f"❌ Lỗi: {str(e)}"

# Load model
print("🚀 Starting Fast Vintern Server...")
model_loaded = load_model()
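
# Warm-up sketch (an assumption, not in the original flow): one dummy inference
# right after loading can absorb first-request latency on CPU:
#
#   if model_loaded:
#       _ = fast_analyze(Image.new("RGB", (448, 448)))  # discard the output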

# Lightweight Gradio interface
with gr.Blocks(
    title="Vintern-1B Fast",
    theme=gr.themes.Base(),
) as demo:
    
    gr.Markdown("# ⚡ Vintern-1B Fast - Tốc Độ Cao")
    
    if model_loaded:
        gr.Markdown("✅ **Model ready!** Optimized for speed and real-time use.")
    else:
        gr.Markdown("❌ **Model failed to load.** Check the server logs for details.")
    
    with gr.Row():
        image_input = gr.Image(type="pil", label="📤 Upload Image")
        result_output = gr.Textbox(
            label="📋 Kết Quả", 
            lines=8,
            show_copy_button=True
        )
    
    # Auto-analyze on upload
    image_input.change(
        fn=fast_analyze,
        inputs=image_input,
        outputs=result_output
    )
    
    gr.Markdown("""
    ### ⚡ Tối ưu cho tốc độ:
    - **Model nhẹ**: Vintern-1B-v2 (~1.5GB)
    - **Fast generation**: Greedy decode, short output
    - **Optimized**: JIT compilation, no beam search
    - **Real-time ready**: ~2-5 giây/ảnh
    """)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
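
# Run: python app_fast.py, then open http://localhost:7860 in a browser.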