import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from struct_caption import StructCaptioner
from fusion_caption import FusionCaptioner

# Initialize the captioning models.
struct_captioner = StructCaptioner("Skywork/SkyCaptioner-V1")
fusion_captioner = FusionCaptioner("Qwen/Qwen3-8B")

# Load the English -> Vietnamese translation model and its tokenizer.
translation_model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/envit5-translation")
translation_tokenizer = AutoTokenizer.from_pretrained("VietAI/envit5-translation")


def translate_to_vietnamese(text):
    """Translate *text* with the ``VietAI/envit5-translation`` model.

    The input is prefixed with ``"en: "`` before tokenization and the first
    generated sequence is decoded with special tokens stripped. Generation is
    capped at 512 tokens.

    Args:
        text: Source text to translate (presumably English; the ``"en: "``
            prefix is what the code sends to the model).

    Returns:
        The decoded model output, or *text* unchanged when it is empty or
        whitespace-only (there is nothing to translate).
    """
    if not text or not text.strip():
        return text
    inputs = translation_tokenizer(f"en: {text}", return_tensors="pt", padding=True)
    outputs = translation_model.generate(**inputs, max_length=512)
    return translation_tokenizer.decode(outputs[0], skip_special_tokens=True)


def generate_struct_caption(video):
    """Run the structural captioner on *video* and return its result.

    Args:
        video: Value delivered by the ``gr.Video`` input component.

    Returns:
        Whatever ``struct_captioner`` returns; it is displayed in a JSON
        ``gr.Code`` component, so it is presumably a JSON string
        (TODO confirm against ``StructCaptioner``).
    """
    return struct_captioner(video)


def generate_fusion_caption(struct_caption_str, task):
    """Produce the fusion caption for a structural caption and task type.

    Args:
        struct_caption_str: Contents of the "Struct Caption" component.
        task: Selected task type, ``"t2v"`` or ``"i2v"``.

    Returns:
        The fusion caption as produced by ``fusion_captioner`` for ``"t2v"``;
        for any other task the caption is passed through
        ``translate_to_vietnamese`` first.
    """
    fusion_caption = fusion_captioner(struct_caption_str, task)
    # NOTE(review): only the non-"t2v" result is translated; this mirrors the
    # original branching -- confirm that the asymmetry is intended.
    if task == "t2v":
        return fusion_caption
    return translate_to_vietnamese(fusion_caption)


# Gradio user interface.
with gr.Blocks() as demo:
    gr.Markdown("\n\nSkyCaptioner-V1\n\n")

    with gr.Row():
        # Gradio expects integer scales; 1 vs. 2 keeps the original
        # 0.5 : 1 width ratio between the two columns.
        with gr.Column(scale=1):
            video_input = gr.Video(label="Upload Video", interactive=True, format="mp4")
            btn_struct = gr.Button("Generate Struct Caption")
        with gr.Column(scale=2):
            struct_caption_output = gr.Code(
                label="Struct Caption",
                language="json",
                lines=25,
                interactive=False,
            )

    with gr.Row():
        with gr.Column(scale=1):
            task_input = gr.Radio(
                label="Task Type",
                choices=["t2v", "i2v"],
                value="t2v",
                interactive=True,
            )
            btn_fusion = gr.Button("Generate Fusion Caption")
        with gr.Column(scale=2):
            fusion_caption_output = gr.Textbox(
                label="Fusion Caption",
                value="",
                interactive=False,
            )

    # Inside Blocks, handlers are attached through component event listeners;
    # gr.Interface is a standalone app builder and cannot be used as a
    # decorator on the handler functions.
    btn_struct.click(
        fn=generate_struct_caption,
        inputs=video_input,
        outputs=struct_caption_output,
    )
    btn_fusion.click(
        fn=generate_fusion_caption,
        inputs=[struct_caption_output, task_input],
        outputs=fusion_caption_output,
    )

    gr.Examples(
        examples=[
            ["./examples/1.mp4"],
            ["./examples/2.mp4"],
            ["./examples/3.mp4"],
            ["./examples/4.mp4"],
        ],
        inputs=video_input,
        label="Example Videos",
    )

demo.launch()