Harshit0414 committed on
Commit 995b558 · 1 Parent(s): cf3d408
Files changed (2):
  1. app.py +67 -90
  2. app1.py +103 -0
app.py CHANGED
@@ -1,103 +1,80 @@
- """
- Gradio demo for UI‑TARS 1.5‑7B (image‑text‑to‑text) on Hugging Face Spaces.
- Save this file as **app.py** and add a *requirements.txt* with the packages
- listed below. Then create a new **Python** Space, upload both files and
- commit — the Space will build and serve the app automatically.
-
- requirements.txt (suggested versions)
- -------------------------------------
- transformers==4.41.0
- accelerate>=0.29.0
- torch>=2.2
- sentencepiece # needed for many multilingual models
- bitsandbytes # optional: enables 4‑bit quantization if Space has GPU
- pillow
- gradio>=4.33
- """
-
- from __future__ import annotations
-
- from typing import List, Dict, Any
-
  import gradio as gr
- from PIL import Image
- from transformers import pipeline
- import base64

- def load_model():
-     """Load the UI‑TARS multimodal pipeline once at startup."""
-     print("Loading UI‑TARS 1.5‑7B… this may take a while the first time.")
-     return pipeline(
-         "image-text-to-text",
-         model="ByteDance-Seed/UI-TARS-1.5-7B",
-         device_map="auto", # automatically use GPU if available
-     )
-
-
- pipe = load_model()


- def answer_question(image: Image.Image, question: str) -> str:
-     """Run the model on the provided image & question and return its answer."""
-     if image is None or not question.strip():
-         return "Please supply **both** an image and a question."
-
-     base64_image = base64.b64encode(image.tobytes()).decode('utf-8')

-     # Compose a messages list in the expected multimodal chat format.
-     messages: List[Dict[str, Any]] = [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "text", "text": f"You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1, y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\nright_single(start_box='<|box_start|>(x1, y1)<|box_end|>')\ndrag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use Chinese in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n## User Instruction\n{question.strip()}"},
-             ],
-         },
-         {
-             "role":"user",
-             "content": [
-                 {"type": "image_url",
-                  "image_url": base64_image},
-             ],
-         }
      ]
-
-     # The pipeline returns a list with one dict when `messages` is passed via
-     # the `text` keyword. We extract the generated text robustly.
-     outputs = pipe(text=messages)
-
-     if isinstance(outputs, list):
-         first = outputs[0]
-         if isinstance(first, dict) and "generated_text" in first:
-             return first["generated_text"].strip()
-         return str(first)
-
-     return str(outputs)


  demo = gr.Interface(
-     fn=answer_question,
      inputs=[
-         gr.Image(type="pil", label="Upload image"),
-         gr.Textbox(label="Ask a question about the image", placeholder="e.g. What animal is on the candy?"),
-     ],
-     outputs=gr.Textbox(label="UI‑TARS answer"),
-     title="UI‑TARS 1.5‑7B – Visual Q&A",
-     description=(
-         "Upload an image and ask a question. The **UI‑TARS 1.5‑7B** model will "
-         "answer based on the visual content. Runs completely on‑device in this Space."
-     ),
-     examples=[
-         [
-             "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
-             "What animal is on the candy?",
-         ]
      ],
-     cache_examples=True,
-     allow_flagging="never",
  )

-
- if __name__ == "__main__":
-     # Spaces automatically call `demo.launch()`, but running locally this
-     # guard lets you execute `python app.py` for quick tests.
-     demo.launch()
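One note on the code being removed above: `base64.b64encode(image.tobytes())` encodes raw pixel bytes with no image header, so nothing downstream can reconstruct the picture (or even its size) from that string. If a base64 payload is ever needed again, the usual pattern is to serialize the PIL image to PNG first. A minimal sketch; the helper name `pil_to_data_uri` is illustrative, not part of this commit:

```python
import base64
import io

from PIL import Image


def pil_to_data_uri(image: Image.Image) -> str:
    """Encode a PIL image as a base64 PNG data URI."""
    buf = io.BytesIO()
    image.save(buf, format="PNG")  # PNG keeps width/height/mode in the payload
    b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{b64}"
```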
 
 
+ # app.py
+ import spaces
+ import ast
+ import torch
+ from PIL import Image, ImageDraw
  import gradio as gr

+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+ from qwen_vl_utils import process_vision_info  # include this file in your repo if not pip-installable

+ # ---- model & processor loaded on CPU ----
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
+     "ByteDance-Seed/UI-TARS-1.5-7B",
+     device_map="auto",
+     torch_dtype=torch.float16, # CPU-friendly
+ )
+ processor = AutoProcessor.from_pretrained(
+     "ByteDance-Seed/UI-TARS-1.5-7B",
+     size={"shortest_edge": 256 * 28 * 28, "longest_edge": 1344 * 28 * 28},
+     use_fast=True,

+ )

+ def draw_point(image: Image.Image, point=None, radius=5):
+     img = image.copy()
+     if point:
+         x, y = point[0] * img.width, point[1] * img.height
+         ImageDraw.Draw(img).ellipse(
+             (x - radius, y - radius, x + radius, y + radius), fill='red'
+         )
+     return img
+
+ @spaces.GPU
+ def navigate(image, task, platform):
+     messages = [
+         {"role": "user", "content": f"You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1, y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\nright_single(start_box='<|box_start|>(x1, y1)<|box_end|>')\ndrag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use Chinese in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n## User Instruction\n{task}"},
+         {"role":"user", "content": [
+             {"type": "image_url", "image_url": {"url":image}}
+         ]}
      ]
+     # prepare inputs
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     images, videos = process_vision_info(messages)
+     inputs = processor(text=[text], images=images, videos=videos, padding=True, return_tensors="pt")
+     inputs = inputs.to("cuda")
+
+     # generate
+     generated = model.generate(**inputs, max_new_tokens=128)
+     trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated)]
+     out = processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+
+     # optionally parse JSON and draw point
+     try:
+         actions = ast.literal_eval(out)
+         for act in actions if isinstance(actions, list) else [actions]:
+             pos = act.get('position')
+             if pos and isinstance(pos, list) and len(pos)==2:
+                 image = draw_point(image, pos)
+         return image, out
+     except:
+         return image, out


  demo = gr.Interface(
+     fn=navigate,
      inputs=[
+         gr.Image(type="pil", label="Screenshot"),
+         gr.Textbox(lines=1, placeholder="e.g. Search the weather for New York", label="Task"),
+         gr.Dropdown(choices=["web", "phone"], value="web", label="Platform"),
      ],
+     outputs=[gr.Image(label="With Click Point"), gr.Textbox(label="Raw Action JSON")],
+     title="ShowUI-2B Navigation Demo",
  )

+ demo.launch(
+     server_name="0.0.0.0",
+     server_port=7860,
+     share=False, # or True if you need a public link
+     ssr_mode=False, # turn off experimental SSR so the process blocks
+ )
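In the new `navigate` function, `process_vision_info` receives a message whose image sits under `{"type": "image_url", "image_url": {"url": image}}` with a PIL image as the "url" value; whether that form is accepted depends on the installed `qwen_vl_utils` version. The layout documented for Qwen2-VL keeps the text and the image in a single user turn, with the image passed directly under an `image` entry. A hedged sketch of that layout; the helper name `build_messages` is illustrative:

```python
from typing import Any, Dict, List


def build_messages(image, prompt: str) -> List[Dict[str, Any]]:
    """Compose one user turn in the Qwen2-VL chat format."""
    return [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},  # PIL.Image, local path, or URL
                {"type": "text", "text": prompt},
            ],
        }
    ]
```

Relatedly, `inputs.to("cuda")` assumes a GPU is attached for the call; `inputs.to(model.device)` is the safer spelling if the same file is ever run on CPU.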
app1.py ADDED
@@ -0,0 +1,103 @@
+ """
+ Gradio demo for UI‑TARS 1.5‑7B (image‑text‑to‑text) on Hugging Face Spaces.
+ Save this file as **app.py** and add a *requirements.txt* with the packages
+ listed below. Then create a new **Python** Space, upload both files and
+ commit — the Space will build and serve the app automatically.
+
+ requirements.txt (suggested versions)
+ -------------------------------------
+ transformers==4.41.0
+ accelerate>=0.29.0
+ torch>=2.2
+ sentencepiece # needed for many multilingual models
+ bitsandbytes # optional: enables 4‑bit quantization if Space has GPU
+ pillow
+ gradio>=4.33
+ """
+
+ from __future__ import annotations
+
+ from typing import List, Dict, Any
+
+ import gradio as gr
+ from PIL import Image
+ from transformers import pipeline
+ import base64
+
+ def load_model():
+     """Load the UI‑TARS multimodal pipeline once at startup."""
+     print("Loading UI‑TARS 1.5‑7B… this may take a while the first time.")
+     return pipeline(
+         "image-text-to-text",
+         model="ByteDance-Seed/UI-TARS-1.5-7B",
+         device_map="auto", # automatically use GPU if available
+     )
+
+
+ pipe = load_model()
+
+
+ def answer_question(image: Image.Image, question: str) -> str:
+     """Run the model on the provided image & question and return its answer."""
+     if image is None or not question.strip():
+         return "Please supply **both** an image and a question."
+
+     base64_image = base64.b64encode(image.tobytes()).decode('utf-8')
+
+     # Compose a messages list in the expected multimodal chat format.
+     messages: List[Dict[str, Any]] = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": f"You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1, y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\nright_single(start_box='<|box_start|>(x1, y1)<|box_end|>')\ndrag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use Chinese in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n## User Instruction\n{question.strip()}"},
+             ],
+         },
+         {
+             "role":"user",
+             "content": [
+                 {"type": "image_url",
+                  "image_url": base64_image},
+             ],
+         }
+     ]
+
+     # The pipeline returns a list with one dict when `messages` is passed via
+     # the `text` keyword. We extract the generated text robustly.
+     outputs = pipe(text=messages)
+
+     if isinstance(outputs, list):
+         first = outputs[0]
+         if isinstance(first, dict) and "generated_text" in first:
+             return first["generated_text"].strip()
+         return str(first)
+
+     return str(outputs)
+
+
+ demo = gr.Interface(
+     fn=answer_question,
+     inputs=[
+         gr.Image(type="pil", label="Upload image"),
+         gr.Textbox(label="Ask a question about the image", placeholder="e.g. What animal is on the candy?"),
+     ],
+     outputs=gr.Textbox(label="UI‑TARS answer"),
+     title="UI‑TARS 1.5‑7B – Visual Q&A",
+     description=(
+         "Upload an image and ask a question. The **UI‑TARS 1.5‑7B** model will "
+         "answer based on the visual content. Runs completely on‑device in this Space."
+     ),
+     examples=[
+         [
+             "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
+             "What animal is on the candy?",
+         ]
+     ],
+     cache_examples=True,
+     allow_flagging="never",
+ )
+
+
+ if __name__ == "__main__":
+     # Spaces automatically call `demo.launch()`, but running locally this
+     # guard lets you execute `python app.py` for quick tests.
+     demo.launch()
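Finally, the requirements list quoted in app1.py's docstring was written for the old pipeline-based app and does not cover the rewritten app.py's imports. One plausible requirements.txt for the new code, with version hints that are suggestions rather than anything taken from the commit:

```
torch>=2.2
transformers>=4.41
accelerate>=0.29
qwen-vl-utils   # or vendor qwen_vl_utils.py in the repo, as the import comment notes
gradio>=4.33
pillow
spaces          # pre-installed on ZeroGPU Spaces; listed here for local runs
```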