wenhu committed · Commit d5d2872 · verified · 1 Parent(s): 31d3f3e

Update app.py

Files changed (1)
  1. app.py +14 -22
app.py CHANGED
@@ -14,14 +14,12 @@ from serve_constants import html_header, bibtext, learn_more_markdown, tos_markd
 cur_dir = os.path.dirname(os.path.abspath(__file__))
 
 MODEL_ID = "TIGER-Lab/PixelReasoner-RL-v1"
-# MODEL_ID = "/home/ma-user/work/haozhe/workspace/lmm-r1/toolckpts/pix17K0506wt-NormalizedPenalizedFixedReweightCont-256-lossvernone-samplevernone-fmtnone-group-n8-ml10000-lr10-sysvcot-8node/global_step24_hf_evalbest"
-example_image = f"{cur_dir}/example_images/1.jpg" # /home/ma-user/work/haozhe/workspace/vlspaces/
-# example_image = "/home/ma-user/work/haozhe/workspace/vlspaces/example_images/1.jpg"
+example_image = f"{cur_dir}/example_images/1.jpg"
+
+print(example_image)
 example_text = "What kind of restaurant is it?"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True,
-                                          # min_pixels=min_pixels,
-                                          max_pixels=512*28*28,
-                                          )
+                                          max_pixels=512*28*28)
 model = AutoModelForImageTextToText.from_pretrained(
     MODEL_ID,
     trust_remote_code=True,
@@ -172,6 +170,7 @@ def model_inference(input_dict, history):
     # Create the full path to the folder
     folder_path = os.path.join(current_path, folder_to_find)
     print('files', files)
+
     imagelist = rawimagelist = current_message_images = [load_image(image) for image in files]
     all_images += current_message_images
     messages.append({
@@ -183,7 +182,7 @@ def model_inference(input_dict, history):
     })
 
     print(messages)
-    # complete_assistant_response_for_gradio = ""
+
     complete_assistant_response_for_gradio = []
     while True:
         """
@@ -199,15 +198,9 @@ def model_inference(input_dict, history):
 
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=False)
         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024, temperature=0.1, top_p=0.95, top_k=50)
-        # import pdb; pdb.set_trace()
         thread = Thread(target=model.generate, kwargs=generation_kwargs)
         thread.start()
 
-        # buffer = ""
-        # for new_text in streamer:
-        #     buffer += new_text
-        #     yield buffer
-        # print(buffer)
         current_model_output_segment = "" # Text generated in this specific model call
         toolflag = False
         for new_text_chunk in streamer:
@@ -226,18 +219,14 @@ def model_inference(input_dict, history):
         processed_segment = current_model_output_segment.split("<|im_end|>", 1)[0] if "<|im_end|>" in current_model_output_segment else current_model_output_segment
 
         # Append this processed segment to the cumulative display string for Gradio
-        # complete_assistant_response_for_gradio += processed_segment + "\n\n"
         complete_assistant_response_for_gradio += [processed_segment + "\n\n"]
-        # print(f"this one: {complete_assistant_response_for_gradio}")
         yield complete_assistant_response_for_gradio # Ensure the fully processed segment is yielded to Gradio
 
 
         # Check for tool call in the *just generated* segment
         qatext_for_tool_check = processed_segment
         require_tool = tool_end in qatext_for_tool_check and tool_start in qatext_for_tool_check
-
-        # print(f"Segment from model: \"{qatext_for_tool_check[:200]}...\", Requires tool: {require_tool}")
-
+
         if require_tool:
 
             tool_params = parse_last_tool(qatext_for_tool_check)
@@ -252,8 +241,6 @@ def model_inference(input_dict, history):
             print(raw_result)
             proc_img = raw_result
             all_images += [proc_img]
-            # complete_assistant_response_for_gradio += [(proc_img, "Visual Operation Result")]
-            # yield complete_assistant_response_for_gradio # Update Gradio display
 
             new_piece = dict(role='user', content=[
                 dict(type='text', text="\nHere is the cropped image (Image Size: {}x{}):".format(proc_img.size[0], proc_img.size[1])),
@@ -261,7 +248,6 @@ def model_inference(input_dict, history):
                 ]
             )
             messages.append(new_piece)
-            # print(messages)
             # complete_assistant_response_for_gradio += f"\n<b>Analyzing Operation Result ...</b> @region(size={proc_img.size[0]}x{proc_img.size[1]})\n\n"
             complete_assistant_response_for_gradio += [f"\n<b>Analyzing Operation Result ...</b> @region(size={proc_img.size[0]}x{proc_img.size[1]})\n\n"]
             yield complete_assistant_response_for_gradio # Update Gradio display
@@ -272,7 +258,13 @@
 
 with gr.Blocks() as demo:
     examples = [
-        [{"text": example_text, "files": [example_image]}]
+        [
+            {"text": example_text,
+             "files": [
+                 example_image
+             ]
+            }
+        ]
     ]
 
     gr.HTML(html_header)
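
Note on the streaming path the hunks above touch: app.py follows the usual transformers pattern of running model.generate in a background thread while a TextIteratorStreamer hands decoded chunks back to the UI loop. Below is a minimal self-contained sketch of that pattern; the model ID, processor kwargs, and generation parameters are taken from the diff, while the stream_reply helper and its inputs argument are illustrative names, not part of app.py.

    # Minimal sketch (not part of the commit) of the streaming setup used above.
    from threading import Thread

    from transformers import (
        AutoModelForImageTextToText,
        AutoProcessor,
        TextIteratorStreamer,
    )

    MODEL_ID = "TIGER-Lab/PixelReasoner-RL-v1"

    # max_pixels bounds the per-image pixel budget after preprocessing
    # (512*28*28 pixels), matching the value set in this commit.
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True,
                                              max_pixels=512*28*28)
    model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, trust_remote_code=True)

    def stream_reply(inputs):
        # Hypothetical helper: "inputs" is the dict the processor produced.
        # Generation runs in a worker thread; the streamer yields text chunks
        # as they are produced, so the caller can update the display live.
        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=False)
        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024,
                                 temperature=0.1, top_p=0.95, top_k=50)
        Thread(target=model.generate, kwargs=generation_kwargs).start()
        buffer = ""
        for new_text_chunk in streamer:
            buffer += new_text_chunk
            yield buffer

Keeping generate off the main thread is what lets the while-loop in model_inference interleave streamed text with visual tool calls: each yield refreshes the Gradio view without blocking generation.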
 