berkamphoon committed (verified)
Commit 5913fc2 · 1 Parent(s): 5ca4b90

Training in progress, epoch 1

README.md CHANGED
@@ -4,8 +4,8 @@ library_name: transformers
 model_name: medgemma-27b-it-dr5
 tags:
 - generated_from_trainer
-- trl
 - sft
+- trl
 licence: license
 ---
 
@@ -27,18 +27,18 @@ print(output["generated_text"])
 
 ## Training procedure
 
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/yoon307-kaist/medgemma-27b-it-dr5-Project/runs/mbxoj7k5)
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/yoon307-kaist/medgemma-27b-it-dr5-Project/runs/6argv9kb)
 
 
 This model was trained with SFT.
 
 ### Framework versions
 
-- TRL: 0.19.0
-- Transformers: 4.51.3
-- Pytorch: 2.5.0
-- Datasets: 3.6.0
-- Tokenizers: 0.21.1
+- TRL: 0.19.1
+- Transformers: 4.53.2
+- Pytorch: 2.6.0+cu124
+- Datasets: 4.0.0
+- Tokenizers: 0.21.2
 
 ## Citations

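For quick reference, a minimal inference sketch against this checkpoint might look as follows. The base model id `google/medgemma-27b-it` is taken from the training script in this commit; the adapter repo id `berkamphoon/medgemma-27b-it-dr5` is an assumption inferred from `model_name`, and `AutoModelForImageTextToText` mirrors the (now commented-out) loader in the script.

```python
# Hedged sketch, not part of the model card. Assumptions: the adapter repo id
# below is inferred from model_name; the base model matches the training script.
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from peft import PeftModel

base_id = "google/medgemma-27b-it"
adapter_id = "berkamphoon/medgemma-27b-it-dr5"  # assumed Hub repo id

model = AutoModelForImageTextToText.from_pretrained(
    base_id, torch_dtype=torch.bfloat16, device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_id)  # attach the LoRA adapter
processor = AutoProcessor.from_pretrained(base_id)
```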
adapter_config.json CHANGED
@@ -29,15 +29,15 @@
     "revision": null,
     "target_modules": [
         "v_proj",
-        "gate_proj",
+        "down_proj",
+        "o_proj",
+        "fc1",
         "fc2",
-        "k_proj",
-        "out_proj",
         "q_proj",
-        "o_proj",
-        "down_proj",
+        "k_proj",
+        "gate_proj",
         "up_proj",
-        "fc1"
+        "out_proj"
     ],
     "task_type": "CAUSAL_LM",
     "trainable_token_indices": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:193d5bc56b229dcd16a327b58b3d06056ba3d4a25c915706b577c5185a762759
-size 11766077184
+oid sha256:f6b8752afa62eaf145b3ab7bcd63788ad169ed8f26b3a901c59c47ac67134b7b
+size 6127553104
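The adapter shrinks from about 11.8 GB to about 6.1 GB (6127553104 / 11766077184 ≈ 0.52), consistent with halving the LoRA rank from r=32 to r=16 in this commit, since LoRA parameter count scales linearly with r.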
chat_template.jinja ADDED
@@ -0,0 +1,47 @@
+{{ bos_token }}
+{%- if messages[0]['role'] == 'system' -%}
+    {%- if messages[0]['content'] is string -%}
+        {%- set first_user_prefix = messages[0]['content'] + '
+
+' -%}
+    {%- else -%}
+        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
+
+' -%}
+    {%- endif -%}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+    {%- set first_user_prefix = "" -%}
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{%- for message in loop_messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+    {%- endif -%}
+    {%- if (message['role'] == 'assistant') -%}
+        {%- set role = "model" -%}
+    {%- else -%}
+        {%- set role = message['role'] -%}
+    {%- endif -%}
+    {{ '<start_of_turn>' + role + '
+' + (first_user_prefix if loop.first else "") }}
+    {%- if message['content'] is string -%}
+        {{ message['content'] | trim }}
+    {%- elif message['content'] is iterable -%}
+        {%- for item in message['content'] -%}
+            {%- if item['type'] == 'image' -%}
+                {{ '<start_of_image>' }}
+            {%- elif item['type'] == 'text' -%}
+                {{ item['text'] | trim }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{ raise_exception("Invalid content type") }}
+    {%- endif -%}
+    {{ '<end_of_turn>
+' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{'<start_of_turn>model
+'}}
+{%- endif -%}
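This is the standard Gemma-style multimodal template: a leading system message is folded into the first user turn as a prefix, assistant turns render under the role name `model`, and image items become `<start_of_image>` placeholders. A small usage sketch, mirroring how the training script calls `apply_chat_template` (the message text here is an illustrative example, not from the repo):

```python
# Sketch: render the template above into a prompt string.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("google/medgemma-27b-it")
messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are an expert ophthalmologist."}]},
    {"role": "user", "content": [
        {"type": "image"},  # rendered as '<start_of_image>' by the template
        {"type": "text", "text": "Is Diabetic Retinopathy present?"},
    ]},
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
# -> "<bos><start_of_turn>user\nYou are an expert ophthalmologist.\n\n<start_of_image>Is Diabetic Retinopathy present?<end_of_turn>\n<start_of_turn>model\n"
```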
runs/Jul20_17-33-00_meedgxh100a/events.out.tfevents.1753047182.meedgxh100a.2023753.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c56c796f26a94c0ecce8c68b316405d52cda87acf40d7bc948259609714e558
+size 9269
tokenizer_config.json CHANGED
@@ -51325,7 +51325,6 @@
     },
     "boi_token": "<start_of_image>",
     "bos_token": "<bos>",
-    "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n    {%- if messages[0]['content'] is string -%}\n        {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n    {%- else -%}\n        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n    {%- endif -%}\n    {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n    {%- set first_user_prefix = \"\" -%}\n    {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n        {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n    {%- endif -%}\n    {%- if (message['role'] == 'assistant') -%}\n        {%- set role = \"model\" -%}\n    {%- else -%}\n        {%- set role = message['role'] -%}\n    {%- endif -%}\n    {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n    {%- if message['content'] is string -%}\n        {{ message['content'] | trim }}\n    {%- elif message['content'] is iterable -%}\n        {%- for item in message['content'] -%}\n            {%- if item['type'] == 'image' -%}\n                {{ '<start_of_image>' }}\n            {%- elif item['type'] == 'text' -%}\n                {{ item['text'] | trim }}\n            {%- endif -%}\n        {%- endfor -%}\n    {%- else -%}\n        {{ raise_exception(\"Invalid content type\") }}\n    {%- endif -%}\n    {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n    {{'<start_of_turn>model\n'}}\n{%- endif -%}\n",
     "clean_up_tokenization_spaces": false,
     "eoi_token": "<end_of_image>",
     "eos_token": "<eos>",
train_medgemma_ft_copy.py CHANGED
@@ -24,17 +24,17 @@ from torch.utils.data import DataLoader
 from torch.utils.tensorboard import SummaryWriter
 
 # === Custom ===
-import tools.imutils as imutils
-import tools.utils as utils
-import tools.pyutils as pyutils
-from tools.utils import compute_es_auc, compute_group_auc, ImprovedBalancedBatchSampler, compute_es_auc_multi
+# import tools.imutils as imutils
+# import tools.utils as utils
+# import tools.pyutils as pyutils
+# from tools.utils import compute_es_auc, compute_group_auc, ImprovedBalancedBatchSampler, compute_es_auc_multi
 
 # === Evaluation ===
 from sklearn.metrics import roc_curve, accuracy_score, roc_auc_score
 
 # === Transformers ===
-from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig, pipeline
-from peft import LoraConfig, get_peft_model
+from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig, pipeline, AutoModelForCausalLM
+from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 from trl import SFTTrainer, SFTConfig
 import wandb
 
@@ -54,7 +54,7 @@ def collate_fn(examples):
     images = []
     for example in examples:
         image = example["image"].convert("RGB")
-        image = image.resize((512,512))
+        image = image.resize((IM_SIZE,IM_SIZE))
         images.append([image])
         texts.append(processor.apply_chat_template(
             example["messages"], add_generation_prompt=False, tokenize=False
@@ -121,14 +121,7 @@ def format_data_for_inference(sample):
         ]},
        # {"role": "assistant", "content": [{"type": "text", "text": str(label)}]}
    ]
-    # prompt = f"Please diagnose whether the {disease_name} exist or not based on the given image."
-    # return [
-    #     {"role": "system", "content": [{"type": "text", "text": system_message}]},
-    #     {"role": "user", "content": [
-    #         {"type": "image", "image": os.path.join(img_root_path, sample[1])},
-    #         {"type": "text", "text": prompt}
-    #     ]}
-    # ]
+
    return example
 
 # === Logit Preprocessing ===
@@ -191,8 +184,10 @@ if __name__ == '__main__':
    parser.add_argument("--name", required=True)
    parser.add_argument("--use_subset", action='store_true')
    args = parser.parse_args()
+
+    random.seed(42)
 
-    pyutils.same_seeds(0)
+    # pyutils.same_seeds(0)
 
    task_map = {'dr': (-3, 'Diabetic Retinopathy'), 'amd': (-2, 'Aged Macular Degeneration'), 'glaucoma': (-1, 'Glaucoma')}
    task_idx, disease_name = task_map[args.task]
@@ -204,13 +199,13 @@ if __name__ == '__main__':
    3. Avoid overexplaining unless requested.\n
    4. Tone: confident, professional, precise.\n
    Do not include any explanation or thought.\n
+    Diabetic Retinopathy (DR) is a diabetes-related eye disease that affects the retina — the light-sensitive tissue at the back of the eye. It occurs when chronically high blood sugar levels damage the small blood vessels in the retina, leading to leakage, blockage, or abnormal blood vessel growth.\n
    If {disease_name} is present, answer exactly 'positive'. Otherwise answer 'negative'."""
-    # Diabetic Retinopathy (DR) is a diabetes-related eye disease that affects the retina — the light-sensitive tissue at the back of the eye. It occurs when chronically high blood sugar levels damage the small blood vessels in the retina, leading to leakage, blockage, or abnormal blood vessel growth.\n
 
    cudnn.benchmark = True
-    img_root_path = '/shared/ssd_30T/yoon/exEYE/Eyeproject/data'
-    train_dataset = np.load('/shared/ssd_30T/yoon/exEYE/datasplit/train_final.npy')
-    val_dataset_raw = np.load('/shared/ssd_30T/yoon/exEYE/datasplit/val_final.npy')
+    img_root_path = '/PHShome/sy1081/exeye/data'
+    train_dataset = np.load('/PHShome/sy1081/exeye/data/train_final.npy')
+    val_dataset_raw = np.load('/PHShome/sy1081/exeye/data/val_final.npy')
 
    if args.use_subset:
        def subset(data,train=True):
@@ -218,11 +213,11 @@ if __name__ == '__main__':
            pos = [s for s in data if s[task_idx] != '0.0']
            num_sample = len(pos)
            if train:
-                return random.sample(neg, 7*num_sample), random.sample(pos, num_sample)
+                return random.sample(neg, 5*num_sample), random.sample(pos, num_sample)
            else:
-                return random.sample(neg, 3*num_sample), random.sample(pos, num_sample)
+                return random.sample(neg, num_sample), pos
            # return random.sample(neg, 15), random.sample(pos, 15)
-            # return neg, random.sample(pos, num_sample)
+            # return neg, pos
        train_dataset = sum(subset(train_dataset,train=True), [])
        val_dataset_raw = sum(subset(val_dataset_raw,train=False), [])
 
@@ -235,7 +230,8 @@ if __name__ == '__main__':
    print(f"Total number of Data| Train: {len(train_dataset)} | Val : {len(val_dataset)}")
    print("="*50)
 
-    model_id = "google/medgemma-4b-it"
+    # model_id = "google/medgemma-4b-it"
+    model_id = "google/medgemma-27b-it"
    model_kwargs = dict(
        attn_implementation="eager",
        torch_dtype=torch.bfloat16,
@@ -250,23 +246,29 @@ if __name__ == '__main__':
        bnb_4bit_quant_storage=model_kwargs["torch_dtype"],
    )
 
-    model = AutoModelForImageTextToText.from_pretrained(model_id, **model_kwargs)
+    # model = AutoModelForImageTextToText.from_pretrained(model_id, **model_kwargs)
+
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        **model_kwargs
+        # torch_dtype=torch.bfloat16,
+        # device_map="auto",
+    )
    processor = AutoProcessor.from_pretrained(model_id)
 
    # Use right padding to avoid issues during training
    processor.tokenizer.padding_side = "right"
-    # processor.image_processor.size = {"height": 512, "width": 512}
-    # processor.image_processor.crop_size = {"height": 512, "width": 512}
 
    POS_ID = processor.tokenizer.convert_tokens_to_ids(processor.tokenizer.tokenize("positive")) #30558
    NEG_ID = processor.tokenizer.convert_tokens_to_ids(processor.tokenizer.tokenize("negative")) #27851
    ASST_ID = processor.tokenizer.convert_tokens_to_ids(processor.tokenizer.tokenize("model\n"))
 
+    IM_SIZE = 1024
 
    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
-        r=32,
+        r=16,
        bias="none",
        target_modules="all-linear",
        # target_modules=["q_proj", "v_proj"],
@@ -284,11 +286,12 @@ if __name__ == '__main__':
        from peft import PeftModel
        print("🔁 Loading trained PEFT weights...")
        model = PeftModel.from_pretrained(model, exp_name)
-        # model = PeftModel.from_pretrained(model, exp_name+"/checkpoint-242")
+        # model = PeftModel.from_pretrained(model, exp_name+"/checkpoint-690")
        # model = PeftModel.from_pretrained(model, "llava-1.5-7b-hf-dr-all/checkpoint-80")
        phase= "val"
    else:
        print("🚀 Initializing new LoRA model...")
+        # model = prepare_model_for_kbit_training(model)
        model = get_peft_model(model, peft_config)
        model.print_trainable_parameters()
        phase= "train"
@@ -296,7 +299,7 @@ if __name__ == '__main__':
 
    training_args = SFTConfig(
        output_dir=exp_name,
-        num_train_epochs= 16, # Number of training epochs
+        num_train_epochs= 20, # Number of training epochs
        per_device_train_batch_size=4, # Batch size per device during training
        per_device_eval_batch_size=4, # Batch size per device during evaluation
        gradient_accumulation_steps=8, # Number of steps before performing a backward/update pass
@@ -306,11 +309,12 @@ if __name__ == '__main__':
        save_strategy="epoch", # Save checkpoint every epoch
        eval_strategy="steps", # Evaluate every `eval_steps`
        eval_steps=10000, # Number of steps between evaluations
-        learning_rate=8e-4, # Learning rate based on QLoRA paper
+        learning_rate=5e-4, # Learning rate based on QLoRA paper
        bf16=True, # Use bfloat16 precision
        max_grad_norm=0.3, # Max gradient norm based on QLoRA paper
        warmup_ratio=0.03, # Warmup ratio based on QLoRA paper
        lr_scheduler_type="linear", # Use linear learning rate scheduler
+        # lr_scheduler_type="constant", # Use linear learning rate scheduler
        push_to_hub=True, # Push model to Hub
        report_to="tensorboard", # Report metrics to tensorboard
        gradient_checkpointing_kwargs={"use_reentrant": False}, # Set gradient checkpointing to non-reentrant to avoid issues
@@ -334,47 +338,13 @@ if __name__ == '__main__':
        # preprocess_logits_for_metrics=slice_logits,
    )
 
-    if not os.path.exists(exp_name):
-        shutil.copy("/shared/ssd_30T/yoon/exEYE/Eyeproject/train_medgemma_ft.py",os.path.join(".",exp_name,"train_medgemma_ft_copy.py"))
+    # if not os.path.exists(exp_name):
+    shutil.copy("/PHShome/sy1081/exeye/train_medgemma_ft.py",os.path.join(".",exp_name,"train_medgemma_ft_copy.py"))
 
    if phase == 'train':
        trainer.train()
        trainer.save_model(training_args.output_dir)
 
-        # custom_eval_metrics = run_custom_evaluation(trainer, val_dataset, val_labels)
-    # else:
-    #     ft_pipe = pipeline(
-    #         "image-text-to-text",
-    #         model=exp_name,
-    #         processor=processor,
-    #         torch_dtype=torch.bfloat16,
-    #     )
-
-    #     # Set `do_sample = False` for deterministic responses
-    #     ft_pipe.model.generation_config.do_sample = False
-    #     ft_pipe.model.generation_config.pad_token_id = processor.tokenizer.eos_token_id
-    #     # Use left padding during inference
-    #     processor.tokenizer.padding_side = "left"
-
-    #     texts = []
-    #     images = []
-
-    #     for example in val_dataset:
-    #         text = processor.apply_chat_template(
-    #             example["messages"], add_generation_prompt=True, tokenize=False
-    #         ).strip()
-    #         texts.append(text)
-    #         image = example["image"].convert("RGB").resize((512, 512))
-    #         images.append([image])  # wrap in a list: the batched format MedGEMMA expects
-
-    #     # pdb.set_trace()
-    #     ft_outputs = ft_pipe(
-    #         text=texts,
-    #         images=images,
-    #         max_new_tokens=5,
-    #         batch_size=1,
-    #         return_full_text=False,
-    #     )
 
    batch_size = 1
    model.eval()
@@ -391,7 +361,7 @@ if __name__ == '__main__':
            example["messages"], add_generation_prompt=True, tokenize=False
        ).strip()
        texts.append(text)
-        image = example["image"].convert("RGB").resize((512, 512))
+        image = example["image"].convert("RGB").resize((IM_SIZE, IM_SIZE))
        images.append([image])
 
    # tokenizer & image processor
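Not shown in this diff is how POS_ID / NEG_ID are consumed downstream. A hedged sketch of the scoring those constants suggest: pair the next-token logits for "positive" and "negative" into a probability suitable for roc_auc_score. The helper name and the use of the first sub-token id are assumptions, not the author's code.

```python
# Hypothetical helper, not from this repo: score one rendered prompt by
# comparing the logits of the "positive" vs "negative" answer tokens.
import torch

@torch.no_grad()
def positive_probability(model, inputs, pos_id, neg_id):
    # inputs: processor(text=..., images=..., return_tensors="pt") for a prompt
    # ending in "<start_of_turn>model\n"; pos_id/neg_id: first sub-token ids
    # from POS_ID / NEG_ID in the training script.
    logits = model(**inputs).logits[0, -1]       # next-token logits
    pair = torch.stack([logits[pos_id], logits[neg_id]])
    return torch.softmax(pair, dim=0)[0].item()  # P("positive" over "negative")
```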
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:61713d7b70980b1dac1979fbf4fa512bed3f7bbc0fa63cf78beb8efa0e918976
-size 5752
+oid sha256:2c0ab1f9caf759796d310240a8f917319ddf8b52bbe1f0b2c42b4b965b668b1c
+size 5816