syntheticbot committed
Commit 9a5479a · verified · 1 Parent(s): 99d6217

Upload 5 files

uploading weights

Files changed (5)
  1. evaluate.py +150 -0
  2. model.safetensors +3 -0
  3. preprocessor_config.json +27 -0
  4. requirements.txt +35 -0
  5. train_clip.py +232 -0
evaluate.py ADDED
@@ -0,0 +1,150 @@
+ import pandas as pd
+ import torch
+ from PIL import Image
+ from sklearn.metrics import classification_report, accuracy_score
+ from transformers import CLIPImageProcessor
+ import os
+ from tqdm import tqdm
+
+ # IMPORTANT: This line imports your custom model class from the training script.
+ # Ensure 'train_clip.py' is in the same directory.
+ from train_clip import MultiTaskClipVisionModel
+
+ # --- 1. Configuration ---
+ # Verify this path is correct. It should point to the directory where the
+ # 'pytorch_model.bin' and 'preprocessor_config.json' files for your best model are located.
+ MODEL_PATH = "./clip-fairface-finetuned/best_model"  # Or "./clip-fairface-finetuned/checkpoint-XXXX"
+
+ VAL_CSV = './fairface_label_val.csv'
+ BASE_PATH = './'
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ print(f"Using device: {DEVICE}")
+ print(f"Loading model from: {MODEL_PATH}")
+
+ # --- 2. Load Label Mappings (must be identical to training) ---
+ # We load the TRAIN csv to ensure the label mappings are consistent with what the model was trained on.
+ train_df = pd.read_csv('./fairface_label_train.csv')
+ age_labels = sorted(train_df['age'].unique())
+ gender_labels = sorted(train_df['gender'].unique())
+ race_labels = sorted(train_df['race'].unique())
+
+ label_mappings = {
+     'age': {label: i for i, label in enumerate(age_labels)},
+     'gender': {label: i for i, label in enumerate(gender_labels)},
+     'race': {label: i for i, label in enumerate(race_labels)},
+ }
+
+ # Create reverse mappings from ID back to human-readable label
+ id_mappings = {
+     'age': {i: label for label, i in label_mappings['age'].items()},
+     'gender': {i: label for label, i in label_mappings['gender'].items()},
+     'race': {i: label for label, i in label_mappings['race'].items()},
+ }
+
+ NUM_LABELS = {
+     'age': len(age_labels),
+     'gender': len(gender_labels),
+     'race': len(race_labels),
+ }
+
+
+ # --- 3. Load Model and Processor ---
+ print("Loading processor and model...")
+ processor = CLIPImageProcessor.from_pretrained(MODEL_PATH)
+ model = MultiTaskClipVisionModel(num_labels=NUM_LABELS)
+
+ # Load the saved model weights. `map_location` ensures it works even if you trained on GPU and now use CPU.
+ model.load_state_dict(torch.load(os.path.join(MODEL_PATH, 'pytorch_model.bin'), map_location=torch.device(DEVICE)))
+ model.to(DEVICE)
+ model.eval()  # Set the model to evaluation mode
+ print("Model loaded successfully.")
+
+
+ # --- 4. Evaluation on Validation Set ---
+ def evaluate_on_dataset():
+     print(f"\nEvaluating on validation data from: {VAL_CSV}")
+     val_df = pd.read_csv(VAL_CSV)
+
+     # Lists to store all predictions and true labels
+     all_preds = {'age': [], 'gender': [], 'race': []}
+     all_true = {'age': [], 'gender': [], 'race': []}
+
+     # Disable gradient calculations for efficiency
+     with torch.no_grad():
+         # Use tqdm for a nice progress bar
+         for index, row in tqdm(val_df.iterrows(), total=val_df.shape[0], desc="Evaluating"):
+             image_path = os.path.join(BASE_PATH, row['file'])
+             image = Image.open(image_path).convert("RGB")
+
+             # Process the image and move to the correct device
+             inputs = processor(images=image, return_tensors="pt").to(DEVICE)
+
+             # Get model predictions
+             outputs = model(pixel_values=inputs['pixel_values'])
+             logits = outputs['logits']
+
+             # Process predictions for each task
+             for task in ['age', 'gender', 'race']:
+                 pred_id = torch.argmax(logits[task], dim=-1).item()
+                 true_label = row[task]
+                 true_id = label_mappings[task][true_label]
+
+                 all_preds[task].append(pred_id)
+                 all_true[task].append(true_id)
+
+     # --- Print Reports ---
+     print("\n--- Evaluation Results ---")
+     for task in ['age', 'gender', 'race']:
+         task_preds = all_preds[task]
+         task_true = all_true[task]
+         task_labels = list(label_mappings[task].keys())
+         task_target_names = [id_mappings[task][i] for i in range(len(task_labels))]
+
+         accuracy = accuracy_score(task_true, task_preds)
+         report = classification_report(
+             task_true,
+             task_preds,
+             target_names=task_target_names,
+             zero_division=0
+         )
+
+         print(f"\n--- {task.upper()} CLASSIFICATION REPORT ---")
+         print(f"Overall Accuracy: {accuracy:.4f}")
+         print(report)
+
+
+ # --- 5. Function for Single Image Prediction ---
+ def predict_single_image(image_path):
+     print(f"\n--- Predicting for single image: {image_path} ---")
+     if not os.path.exists(image_path):
+         print(f"Error: Image path not found at '{image_path}'")
+         return
+
+     image = Image.open(image_path).convert("RGB")
+     inputs = processor(images=image, return_tensors="pt").to(DEVICE)
+
+     with torch.no_grad():
+         outputs = model(pixel_values=inputs['pixel_values'])
+         logits = outputs['logits']
+
+     predictions = {}
+     for task in ['age', 'gender', 'race']:
+         pred_id = torch.argmax(logits[task], dim=-1).item()
+         pred_label = id_mappings[task][pred_id]
+         predictions[task] = pred_label
+
+     print("Predictions:")
+     for task, label in predictions.items():
+         print(f" - {task.capitalize()}: {label}")
+     return predictions
+
+
+ if __name__ == "__main__":
+     # Run the full evaluation on the validation dataset
+     evaluate_on_dataset()
+
+     # --- Example of single image prediction ---
+     # IMPORTANT: Change this path to an image you want to test
+     sample_image_path = 'val/1.jpg'
+     predict_single_image(sample_image_path)
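Note on the weights format: evaluate.py loads 'pytorch_model.bin', but this commit ships the checkpoint as 'model.safetensors'. If only the safetensors file is present in MODEL_PATH, the state dict can be loaded with the pinned safetensors package instead; the following is a minimal sketch that reuses MODEL_PATH, DEVICE and model from evaluate.py, not part of the committed script.

# Sketch: load the uploaded safetensors checkpoint instead of pytorch_model.bin.
# Assumes MODEL_PATH, DEVICE and model are defined as in evaluate.py above.
from safetensors.torch import load_file

state_dict = load_file(os.path.join(MODEL_PATH, "model.safetensors"), device=DEVICE)
model.load_state_dict(state_dict)
model.to(DEVICE)
model.eval()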
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e811883e6f247acc61a869a938b9523d1eb1d34fa3c1e882b3f033a49b8cb72d
+ size 1212846240
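The lines above are a Git LFS pointer, not the weights themselves; the actual checkpoint is about 1.2 GB (1212846240 bytes). Cloning with git-lfs installed, or fetching the file through huggingface_hub, retrieves the real binary. A brief sketch follows; the repo_id is an assumption inferred from the committing user and project name and should be replaced with the actual repository id.

# Sketch: download the full model.safetensors from the Hub instead of the LFS pointer.
from huggingface_hub import hf_hub_download

weights_path = hf_hub_download(
    repo_id="syntheticbot/clip-face-attribute-classifier",  # assumed repo id, replace as needed
    filename="model.safetensors",
)
print(weights_path)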
preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "crop_size": {
+     "height": 224,
+     "width": 224
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "CLIPImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": 224
+   }
+ }
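This config records the standard CLIP preprocessing (resize to shortest edge 224, 224x224 center crop, rescale by 1/255, CLIP image mean/std). With it stored next to the weights, the image processor used in evaluate.py can be rebuilt straight from the checkpoint directory; a small sketch, assuming the ./clip-fairface-finetuned/best_model path used in the scripts above.

# Sketch: rebuild the image processor from the saved config and inspect its settings.
from transformers import CLIPImageProcessor

processor = CLIPImageProcessor.from_pretrained("./clip-fairface-finetuned/best_model")
print(processor.crop_size)   # {'height': 224, 'width': 224}
print(processor.image_mean)  # [0.48145466, 0.4578275, 0.40821073]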
requirements.txt ADDED
@@ -0,0 +1,35 @@
+ # This file lists the required packages for the clip-face-attribute-classifier project.
+ # Install them using: pip install -r requirements.txt
+
+ # --- Hugging Face Libraries ---
+ # Core library for models, Trainer, TrainingArguments, and processors
+ transformers==4.38.2
+ # Used for data handling and creating Dataset objects
+ datasets==2.18.0
+ # For efficient training and hardware acceleration with the Trainer
+ accelerate==0.27.2
+ # For interacting with the Hugging Face Hub (login, upload, etc.)
+ huggingface_hub==0.21.4
+
+
+ # --- Core Deep Learning Framework ---
+ # The fundamental deep learning library
+ torch==2.2.1
+ # Companion library for computer vision tasks in PyTorch
+ torchvision==0.17.1
+
+
+ # --- Data Handling and Metrics ---
+ # For reading and manipulating the .csv label files
+ pandas==2.2.1
+ # For calculating evaluation metrics like accuracy, precision, recall, and F1-score
+ scikit-learn==1.4.1.post1
+
+
+ # --- Utilities ---
+ # For opening and handling image files
+ Pillow==10.2.0
+ # For creating progress bars during evaluation
+ tqdm==4.66.2
+ # For loading the safer .safetensors model format
+ safetensors==0.4.2
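A quick way to confirm an environment matches these pins is to print the installed versions; a small sketch (the torch version string may carry a CUDA suffix such as '+cu121').

# Sketch: check that the key pinned packages from requirements.txt are installed.
import transformers, torch, safetensors, sklearn, pandas

print(transformers.__version__)  # expected 4.38.2
print(torch.__version__)         # expected 2.2.1 (possibly with a CUDA suffix)
print(safetensors.__version__)   # expected 0.4.2
print(sklearn.__version__)       # expected 1.4.1.post1
print(pandas.__version__)        # expected 2.2.1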
train_clip.py ADDED
@@ -0,0 +1,232 @@
+ import pandas as pd
+ import torch
+ import torch.nn as nn
+ from PIL import Image
+ from sklearn.metrics import accuracy_score
+ from transformers import (
+     Trainer,
+     TrainingArguments,
+     CLIPVisionModel,
+     CLIPImageProcessor,
+ )
+ from torch.utils.data import Dataset
+ import os
+ os.environ["WANDB_DISABLED"] = "true"
+ # --- 1. Configuration ---
+ # Define paths and model name
+ BASE_PATH = './'  # Assumes the script is run from the 'fairface' directory
+ TRAIN_CSV = os.path.join(BASE_PATH, 'fairface_label_train.csv')
+ VAL_CSV = os.path.join(BASE_PATH, 'fairface_label_val.csv')
+ MODEL_NAME = "openai/clip-vit-large-patch14"
+ OUTPUT_DIR = "./clip-fairface-finetuned"
+
+ # --- 2. Load and Prepare Label Mappings ---
+ # Load training data to create consistent label-to-ID mappings
+ train_df = pd.read_csv(TRAIN_CSV)
+
+ # Create sorted unique label lists to ensure consistent mapping
+ age_labels = sorted(train_df['age'].unique())
+ gender_labels = sorted(train_df['gender'].unique())
+ race_labels = sorted(train_df['race'].unique())
+
+ # Create label-to-ID mappings for each task
+ label_mappings = {
+     'age': {label: i for i, label in enumerate(age_labels)},
+     'gender': {label: i for i, label in enumerate(gender_labels)},
+     'race': {label: i for i, label in enumerate(race_labels)},
+ }
+
+ NUM_LABELS = {
+     'age': len(age_labels),
+     'gender': len(gender_labels),
+     'race': len(race_labels),
+ }
+
+ print(f"Number of labels: Age={NUM_LABELS['age']}, Gender={NUM_LABELS['gender']}, Race={NUM_LABELS['race']}")
+
+ # --- 3. Custom Dataset ---
+ class FairFaceDataset(Dataset):
+     def __init__(self, csv_file, image_processor, label_maps, base_path):
+         self.df = pd.read_csv(csv_file)
+         self.image_processor = image_processor
+         self.label_maps = label_maps
+         self.base_path = base_path
+
+     def __len__(self):
+         return len(self.df)
+
+     def __getitem__(self, idx):
+         row = self.df.iloc[idx]
+         # Construct the full path to the image
+         image_path = os.path.join(self.base_path, row['file'])
+         image = Image.open(image_path).convert("RGB")
+
+         # Process the image
+         inputs = {}
+         inputs['pixel_values'] = self.image_processor(images=image, return_tensors="pt").pixel_values.squeeze(0)
+
+         # Process labels into a dictionary of tensors
+         inputs['labels'] = {
+             'age': torch.tensor(self.label_maps['age'][row['age']], dtype=torch.long),
+             'gender': torch.tensor(self.label_maps['gender'][row['gender']], dtype=torch.long),
+             'race': torch.tensor(self.label_maps['race'][row['race']], dtype=torch.long),
+         }
+         return inputs
+
+ # --- 4. Custom Model Definition ---
+ # (supports gradient checkpointing, which the Trainer enables through the hook below)
+ class MultiTaskClipVisionModel(nn.Module):
+     # Class attribute signalling to the Trainer that gradient checkpointing is supported
+     supports_gradient_checkpointing = True
+
+     def __init__(self, num_labels):
+         super(MultiTaskClipVisionModel, self).__init__()
+         self.vision_model = CLIPVisionModel.from_pretrained(MODEL_NAME)
+
+         # Freeze all parameters of the vision model first
+         for param in self.vision_model.parameters():
+             param.requires_grad = False
+
+         # Unfreeze the last few layers for fine-tuning.
+         for layer in self.vision_model.vision_model.encoder.layers[-3:]:  # Unfreeze last 3 transformer layers
+             for param in layer.parameters():
+                 param.requires_grad = True
+
+         # Define classification heads for each task
+         hidden_size = self.vision_model.config.hidden_size
+         self.age_head = nn.Linear(hidden_size, num_labels['age'])
+         self.gender_head = nn.Linear(hidden_size, num_labels['gender'])
+         self.race_head = nn.Linear(hidden_size, num_labels['race'])
+
+     # Called by the Trainer when gradient_checkpointing=True is set
+     def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
+         """Activates gradient checkpointing for the underlying vision model."""
+         self.vision_model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs)
+
+     def forward(self, pixel_values, labels=None):
+         # The forward pass works seamlessly with gradient checkpointing enabled
+         outputs = self.vision_model(pixel_values=pixel_values)
+         pooled_output = outputs.pooler_output
+
+         age_logits = self.age_head(pooled_output)
+         gender_logits = self.gender_head(pooled_output)
+         race_logits = self.race_head(pooled_output)
+
+         loss = None
+         # If labels are provided, calculate the combined loss
+         if labels is not None:
+             loss_fct = nn.CrossEntropyLoss()
+             age_loss = loss_fct(age_logits, labels['age'])
+             gender_loss = loss_fct(gender_logits, labels['gender'])
+             race_loss = loss_fct(race_logits, labels['race'])
+             # Total loss is the sum of individual task losses
+             loss = age_loss + gender_loss + race_loss
+
+         return {
+             'loss': loss,
+             'logits': {
+                 'age': age_logits,
+                 'gender': gender_logits,
+                 'race': race_logits,
+             },
+         }
+
+ # --- 5. Data Collator and Metrics ---
+ def collate_fn(batch):
+     # Stacks pixel values and organizes labels into a dictionary of tensors
+     pixel_values = torch.stack([item['pixel_values'] for item in batch])
+     labels = {
+         'age': torch.tensor([item['labels']['age'] for item in batch], dtype=torch.long),
+         'gender': torch.tensor([item['labels']['gender'] for item in batch], dtype=torch.long),
+         'race': torch.tensor([item['labels']['race'] for item in batch], dtype=torch.long),
+     }
+     return {'pixel_values': pixel_values, 'labels': labels}
+
+ def compute_metrics(p):
+     # p is an EvalPrediction object containing predictions and label_ids
+     logits = p.predictions
+     labels = p.label_ids
+
+     # Extract predictions and labels for each task
+     age_preds = logits['age'].argmax(-1)
+     gender_preds = logits['gender'].argmax(-1)
+     race_preds = logits['race'].argmax(-1)
+
+     age_labels = labels['age']
+     gender_labels = labels['gender']
+     race_labels = labels['race']
+
+     # Calculate accuracy for each task
+     return {
+         'age_accuracy': accuracy_score(age_labels, age_preds),
+         'gender_accuracy': accuracy_score(gender_labels, gender_preds),
+         'race_accuracy': accuracy_score(race_labels, race_preds),
+     }
+
+ # --- 6. Trainer Setup and Execution ---
+ def main():
+     # Initialize the image processor and our custom model
+     image_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
+     model = MultiTaskClipVisionModel(num_labels=NUM_LABELS)
+
+     # Initialize the training and validation datasets
+     train_dataset = FairFaceDataset(
+         csv_file=TRAIN_CSV, image_processor=image_processor, label_maps=label_mappings, base_path=BASE_PATH
+     )
+     val_dataset = FairFaceDataset(
+         csv_file=VAL_CSV, image_processor=image_processor, label_maps=label_mappings, base_path=BASE_PATH
+     )
+
+     # Define the training arguments.
+     # Batch size, accumulation, checkpointing and fp16 are tuned so that
+     # CLIP ViT-L/14 fine-tuning fits in GPU memory while still reaching
+     # a large effective batch size.
+     training_args = TrainingArguments(
+         output_dir=OUTPUT_DIR,
+         num_train_epochs=5,
+         # Set a batch size that fits in memory
+         per_device_train_batch_size=24,
+         per_device_eval_batch_size=32,  # Evaluation does not need accumulation and can use a larger batch size
+         # Set accumulation steps to reach the desired effective batch size (24 * 22 = 528)
+         gradient_accumulation_steps=22,
+         # Enable gradient checkpointing to save more memory
+         gradient_checkpointing=True,
+         warmup_steps=500,
+         weight_decay=0.01,
+         logging_dir='./logs',
+         logging_steps=10,  # Log more frequently to see progress within a large effective batch
+         evaluation_strategy="steps",
+         eval_steps=250,  # You might want to evaluate less frequently with larger batches
+         save_strategy="steps",
+         save_steps=250,
+         load_best_model_at_end=True,
+         metric_for_best_model='gender_accuracy',
+         save_total_limit=3,
+         fp16=True,  # Mixed-precision training is essential for large models
+         remove_unused_columns=False,
+         report_to="none",  # Disables wandb logging
+     )
+
+     # Initialize the Trainer
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_dataset,
+         eval_dataset=val_dataset,
+         data_collator=collate_fn,
+         compute_metrics=compute_metrics,
+     )
+
+     # Start training
+     print("Starting model training...")
+     trainer.train()
+
+     # Save the final model and processor
+     print("Saving the best model...")
+     trainer.save_model(os.path.join(OUTPUT_DIR, "best_model"))
+     image_processor.save_pretrained(os.path.join(OUTPUT_DIR, "best_model"))
+
+     print("Training complete!")
+
+ if __name__ == "__main__":
+     main()
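Before launching a full run, the multi-task heads and the summed cross-entropy loss can be exercised on a random batch. A minimal smoke-test sketch follows; it assumes fairface_label_train.csv is present in the working directory, since importing from train_clip.py executes the label-mapping code at module level, and it downloads the CLIP backbone on first use.

# Sketch: one forward pass on random data to check logits shapes and the combined loss.
# Requires fairface_label_train.csv alongside the script, because importing train_clip
# builds NUM_LABELS from it at import time.
import torch
from train_clip import MultiTaskClipVisionModel, NUM_LABELS

model = MultiTaskClipVisionModel(num_labels=NUM_LABELS)
model.eval()

pixel_values = torch.randn(2, 3, 224, 224)  # CLIP ViT-L/14 expects 224x224 RGB inputs
labels = {task: torch.zeros(2, dtype=torch.long) for task in ['age', 'gender', 'race']}

with torch.no_grad():
    out = model(pixel_values=pixel_values, labels=labels)

print({task: logit.shape for task, logit in out['logits'].items()})
print("combined loss:", out['loss'].item())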