Emrys-Hong committed
Commit: 1b2ebf2
1 Parent(s): ae16192
Update
modeling_prismatic.py (+5 -8)
@@ -541,7 +541,7 @@ class EmmaxForActionPrediction(PrismaticForConditionalGeneration):
         return actions, generated_ids
 
     @torch.inference_mode()
-    def generate_actions(self,
+    def generate_actions(self, inputs, tokenizer, **kwargs: str) -> str:
         # For now, only support generation with a batch size of 1 for simplicity
         # image_transform, tokenizer = self.vision_backbone.image_transform, self.llm_backbone.tokenizer
 
@@ -557,18 +557,15 @@ class EmmaxForActionPrediction(PrismaticForConditionalGeneration):
         # raise ValueError(f"Unsupported `pixel_values` type = {type(pixel_values)}")
 
         # Invoke super().generate --> taps into `GenerationMixin` which (redirects) to `forward()`
-        autocast_dtype = self.llm_backbone.half_precision_dtype
         # with torch.autocast("cuda", dtype=autocast_dtype, enabled=self.enable_mixed_precision_training):
-        with torch.autocast("cuda", dtype=torch.
+        with torch.autocast("cuda", dtype=torch.bfloat16):
             # fmt: off
             generated_ids = self.generate(
-                input_ids=input_ids,
-                pixel_values=pixel_values,  # Shape: [1, 3, res, res] or Dict[str, Shape[1, 3, res, res]]
-                **kwargs
+                **inputs, **kwargs
             )
             # fmt: on
 
-        generated_text = tokenizer.decode(generated_ids[0, input_ids.shape[1] :], skip_special_tokens=True).strip()
+        generated_text = tokenizer.decode(generated_ids[0, inputs['input_ids'].shape[1] :], skip_special_tokens=True).strip()
 
         s = solver
         actions, reasoning = s.extract_action_policies(generated_text)
@@ -586,7 +583,7 @@ class EmmaxForActionPrediction(PrismaticForConditionalGeneration):
             )
             _actions.append(action_norm)
 
-        return _actions, generated_text
+        return _actions[0], generated_text
 
 
     @staticmethod
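For context, the new signature replaces separate `pixel_values`/`input_ids` arguments with a single processor-built `inputs` dict (unpacked straight into `self.generate(**inputs, **kwargs)`) plus an explicit `tokenizer`, and now returns only the first action since generation is restricted to batch size 1. Below is a minimal caller sketch under those assumptions; the checkpoint id, prompt format, and processor behavior are illustrative, not taken from this commit:

import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

MODEL_ID = "declare-lab/Emma-X"  # hypothetical repo id; substitute the real checkpoint

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True
).to("cuda")

image = Image.open("frame.png")  # current camera observation
prompt = "In: What action should the robot take to pick up the cup?\nOut:"

# `inputs` must carry at least `input_ids` and `pixel_values`: the method
# unpacks it into `self.generate(**inputs, **kwargs)` and reuses
# `inputs['input_ids'].shape[1]` to strip the prompt before decoding.
inputs = processor(prompt, image, return_tensors="pt").to("cuda", dtype=torch.bfloat16)

# Returns the single action (batch size 1) and the raw generated reasoning text.
action, generated_text = model.generate_actions(
    inputs, processor.tokenizer, do_sample=False, max_new_tokens=512
)

Returning `_actions[0]` rather than the list matches the stated batch-size-1 restriction: the caller gets one action per generation step instead of a singleton list.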