Files changed (1) hide show
  1. app.py +30 -60
app.py CHANGED
@@ -1,86 +1,56 @@
1
- import spaces
2
  import gradio as gr
3
-
4
  from struct_caption import StructCaptioner
5
  from fusion_caption import FusionCaptioner
6
 
 
7
  struct_captioner = StructCaptioner("Skywork/SkyCaptioner-V1")
8
  fusion_captioner = FusionCaptioner("Qwen/Qwen3-8B")
9
 
10
- with gr.Blocks() as demo:
11
- gr.Markdown(
12
- """
13
- <h1 style="text-align: center; font-size: 2em;">SkyCaptioner-V1</h1>
14
- """,
15
- elem_id="header"
16
- )
 
17
 
 
 
 
 
18
  with gr.Row():
19
  with gr.Column(scale=0.5):
20
- video_input = gr.Video(
21
- label="Upload Video",
22
- interactive=True,
23
- format="mp4",
24
- )
25
-
26
  btn_struct = gr.Button("Generate Struct Caption")
27
-
28
  with gr.Column():
29
- struct_caption_output = gr.Code(
30
- label="Struct Caption",
31
- language="json",
32
- lines=25,
33
- interactive=False
34
- )
35
 
36
  with gr.Row():
37
  with gr.Column(scale=0.5):
38
- with gr.Row():
39
- task_input = gr.Radio(
40
- label="Task Type",
41
- choices=["t2v", "i2v"],
42
- value="t2v",
43
- interactive=True
44
- )
45
- btn_fusion = gr.Button("Generate Fusion Caption")
46
-
47
- with gr.Column():
48
- fusion_caption_output = gr.Textbox(
49
- label="Fusion Caption",
50
- value="",
51
- interactive=False
52
- )
53
 
54
- @spaces.GPU(duration=120)
55
  def generate_struct_caption(video):
56
  struct_caption = struct_captioner(video)
57
- return struct_caption
58
 
59
- @spaces.GPU(duration=120)
60
  def generate_fusion_caption(struct_caption_str, task):
61
- return fusion_captioner(struct_caption_str, task)
62
-
63
- btn_struct.click(
64
- fn=generate_struct_caption,
65
- inputs=video_input,
66
- outputs=struct_caption_output
67
- )
68
-
69
- btn_fusion.click(
70
- fn=generate_fusion_caption,
71
- inputs=[struct_caption_output, task_input],
72
- outputs=fusion_caption_output
73
- )
74
 
75
  gr.Examples(
76
- examples=[
77
- ["./examples/1.mp4"],
78
- ["./examples/2.mp4"],
79
- ["./examples/3.mp4"],
80
- ["./examples/4.mp4"],
81
- ],
82
  inputs=video_input,
83
  label="Example Videos"
84
  )
85
 
86
- demo.launch()
 
 
1
  import gradio as gr
2
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
  from struct_caption import StructCaptioner
4
  from fusion_caption import FusionCaptioner
5
 
# Initialise the captioning models.
struct_captioner = StructCaptioner("Skywork/SkyCaptioner-V1")
fusion_captioner = FusionCaptioner("Qwen/Qwen3-8B")

# Load the Vietnamese translation model and its tokenizer; both come from
# the same checkpoint, so the identifier is spelled out only once.
_TRANSLATION_CHECKPOINT = "VietAI/envit5-translation"
translation_model = AutoModelForSeq2SeqLM.from_pretrained(_TRANSLATION_CHECKPOINT)
translation_tokenizer = AutoTokenizer.from_pretrained(_TRANSLATION_CHECKPOINT)
13
+
def translate_to_vietnamese(text):
    """Translate English ``text`` into Vietnamese.

    Uses the module-level ``translation_tokenizer`` / ``translation_model``
    pair. The input is prefixed with ``"en: "`` to mark the source language
    before it is tokenized, and generation is capped at 512 tokens.

    Args:
        text: English text to translate.

    Returns:
        The decoded translation with any leading ``"vi:"`` language tag
        removed. Empty, whitespace-only, or ``None`` input is returned
        unchanged without invoking the model.
    """
    # Nothing to translate: avoid feeding a bare "en: " prompt to the model.
    if not text or not text.strip():
        return text

    inputs = translation_tokenizer(f"en: {text}", return_tensors="pt", padding=True)
    outputs = translation_model.generate(**inputs, max_length=512)
    translated = translation_tokenizer.decode(outputs[0], skip_special_tokens=True)

    # The model card's examples show outputs that start with a target-language
    # tag ("vi: ..."); drop it if present so only the translation is shown.
    language_tag = "vi:"
    if translated.startswith(language_tag):
        translated = translated[len(language_tag):].lstrip()
    return translated
18
 
# Gradio user interface
with gr.Blocks() as demo:
    gr.Markdown("<h1 style='text-align: center;'>SkyCaptioner-V1</h1>")

    # Row 1: video upload + trigger on the left, structured caption on the right.
    with gr.Row():
        with gr.Column(scale=0.5):
            video_input = gr.Video(label="Upload Video", interactive=True, format="mp4")
            btn_struct = gr.Button("Generate Struct Caption")
        with gr.Column():
            struct_caption_output = gr.Code(label="Struct Caption", language="json", lines=25, interactive=False)

    # Row 2: task selector + trigger on the left, fused caption on the right.
    with gr.Row():
        with gr.Column(scale=0.5):
            task_input = gr.Radio(label="Task Type", choices=["t2v", "i2v"], value="t2v", interactive=True)
            btn_fusion = gr.Button("Generate Fusion Caption")
        with gr.Column():
            fusion_caption_output = gr.Textbox(label="Fusion Caption", value="", interactive=False)

    def generate_struct_caption(video):
        """Run the struct captioner on the uploaded video and return its result."""
        struct_caption = struct_captioner(video)
        return struct_caption

    def generate_fusion_caption(struct_caption_str, task):
        """Fuse a struct caption into a single caption for the selected task.

        Args:
            struct_caption_str: Contents of the "Struct Caption" component.
            task: Selected task type, ``"t2v"`` or ``"i2v"``.

        Returns:
            The fusion caption as produced for ``"t2v"``; for any other task
            the caption is passed through ``translate_to_vietnamese`` first.
        """
        fusion_caption = fusion_captioner(struct_caption_str, task)
        # NOTE(review): only the non-"t2v" path is translated to Vietnamese;
        # confirm this asymmetry is intended.
        if task == "t2v":
            return fusion_caption
        return translate_to_vietnamese(fusion_caption)

    # Wire the buttons to their handlers. The handlers must be attached as
    # event listeners: using gr.Interface(...) as a decorator references each
    # function before it is defined (NameError on startup) and leaves the
    # buttons without any action.
    btn_struct.click(
        fn=generate_struct_caption,
        inputs=video_input,
        outputs=struct_caption_output,
    )
    btn_fusion.click(
        fn=generate_fusion_caption,
        inputs=[struct_caption_output, task_input],
        outputs=fusion_caption_output,
    )

    gr.Examples(
        examples=[["./examples/1.mp4"], ["./examples/2.mp4"], ["./examples/3.mp4"], ["./examples/4.mp4"]],
        inputs=video_input,
        label="Example Videos"
    )

demo.launch()