|
|
import os |
|
|
import argparse |
|
|
import torch |
|
|
from torch.utils.data import Dataset, DataLoader |
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig |
|
|
from tqdm import tqdm |
|
|
import pandas as pd |
|
|
import torch.nn.functional as F |
|
|
|
|
|
class CSVDataset(Dataset):
    """Packs consecutive CSV text rows into fixed-length token sequences.

    Each sample concatenates up to ``rows_per_sample`` rows from the CSV's
    'Text' column, tokenized with an EOS token appended after every row,
    then truncated or padded to exactly ``seq_length`` tokens.
    """

    def __init__(self, filepath, tokenizer, seq_length, rows_per_sample):
        """
        Args:
            filepath: Path to a CSV file containing a 'Text' column.
            tokenizer: A HuggingFace-style tokenizer (callable on a list of
                strings, exposes eos/pad token attributes).
            seq_length: Exact token length of every returned sample.
            rows_per_sample: How many CSV rows are merged into one sample.
        """
        self.data = pd.read_csv(filepath)
        # Coerce to string and replace NaN with '' — pandas yields float NaN
        # for empty cells, which would crash len()/tokenization later.
        self.text_data = self.data['Text'].fillna('').astype(str).tolist()
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.rows_per_sample = rows_per_sample

        # Character cap applied before tokenization to bound tokenizer cost
        # on pathologically long rows.
        self.CAP_SAMPLE_LEN = 17500

        # Ensure the tokenizer has the special tokens this dataset relies on.
        # NOTE(review): adding tokens here grows the vocabulary; the caller
        # must resize the model's embeddings to match (see
        # evaluate_single_model).
        if self.tokenizer.eos_token is None:
            self.tokenizer.add_special_tokens({'eos_token': '<|endoftext|>'})

        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

        self.eos_token_id = self.tokenizer.eos_token_id
        self.pad_token_id = self.tokenizer.pad_token_id

    def __len__(self):
        # Ceiling division: the final sample may hold fewer rows.
        return (len(self.text_data) + self.rows_per_sample - 1) // self.rows_per_sample

    def __getitem__(self, idx):
        """Return one sample as a LongTensor of exactly ``seq_length`` ids."""
        start_idx = idx * self.rows_per_sample
        end_idx = min(start_idx + self.rows_per_sample, len(self.text_data))

        lines = self.text_data[start_idx:end_idx]

        # Cap each row's character length, cutting at the last space before
        # the cap when possible so a word is not split mid-token.
        truncated_lines = []
        for text in lines:
            if len(text) > self.CAP_SAMPLE_LEN:
                cut = text.rfind(' ', 0, self.CAP_SAMPLE_LEN)
                if cut < 0:
                    cut = self.CAP_SAMPLE_LEN
                text = text[:cut]
            truncated_lines.append(text)

        batch_encodings = self.tokenizer(
            truncated_lines,
            add_special_tokens=False,
            truncation=True,
            max_length=self.seq_length - 2,  # leave room for EOS separators
            return_tensors=None
        )

        # Concatenate the rows, separating each with an EOS token.
        input_ids_list = []
        for tokens in batch_encodings["input_ids"]:
            tokens.append(self.eos_token_id)
            input_ids_list.extend(tokens)

        # Guard: if every row tokenized to nothing (e.g. all-empty cells),
        # the [-1] checks below would raise IndexError on an empty list.
        if not input_ids_list:
            input_ids_list = [self.eos_token_id]

        # Ensure the stream ends with EOS before length adjustment.
        if input_ids_list[-1] != self.eos_token_id:
            input_ids_list.append(self.eos_token_id)

        if len(input_ids_list) > self.seq_length:
            # Trim from the end, then force the sequence to close with EOS.
            input_ids_list = input_ids_list[:self.seq_length]

            if input_ids_list[-1] != self.eos_token_id:
                input_ids_list[-1] = self.eos_token_id
        elif len(input_ids_list) < self.seq_length:
            # Pad to the fixed length; the final position stays EOS.
            padding_length = self.seq_length - len(input_ids_list)
            input_ids_list.extend([self.pad_token_id] * padding_length)

            input_ids_list[-1] = self.eos_token_id

        input_ids = torch.tensor(input_ids_list, dtype=torch.long)
        return input_ids
|
|
|
|
|
|
|
|
def evaluate_model(model, dataloader, device):
    """
    Evaluate the model batch by batch and print the losses for each batch.

    Args:
        model: Causal LM whose forward accepts ``labels`` and returns an
            object with a scalar ``.loss``.
        dataloader: Yields LongTensor batches of token ids.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Mean per-batch loss, or ``float('nan')`` when the dataloader is empty.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch_idx, input_ids in enumerate(tqdm(dataloader, desc="Evaluating Model")):
            input_ids = input_ids.to(device)

            # NOTE(review): labels == input_ids means padding positions also
            # contribute to the loss; mask pads with -100 in the labels if
            # that is not intended.
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss.item()
            total_loss += loss
            num_batches += 1

            print(f"Batch {batch_idx + 1} Loss: {loss:.4f}")

    # Guard against an empty dataloader, which would previously raise
    # ZeroDivisionError.
    if num_batches == 0:
        return float('nan')

    avg_loss = total_loss / num_batches
    return avg_loss
|
|
|
|
|
|
|
|
def evaluate_single_model(model_path, tokenizer_path, csv_path, seq_length, batch_size, device):
    """
    Evaluate a single model on the dataset and print losses for each batch.

    Args:
        model_path: Path or hub id of the causal LM to load (fp16).
        tokenizer_path: Path or hub id of the tokenizer.
        csv_path: CSV file with a 'Text' column.
        seq_length: Fixed token length of each sample.
        batch_size: Evaluation batch size.
        device: Device string, e.g. "cuda" or "cpu".

    Returns:
        Average loss over all batches.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    # CSVDataset may add eos/pad special tokens to the tokenizer, growing
    # its vocabulary — create it before sizing the model's embeddings.
    dataset = CSVDataset(csv_path, tokenizer, seq_length, rows_per_sample=50)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=4)

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
    ).to(device)

    # If the dataset added special tokens (e.g. '<|pad|>'), their ids lie
    # past the end of the pretrained embedding table and would index out of
    # range during the forward pass — grow the embeddings to match.
    if len(tokenizer) > model.get_input_embeddings().num_embeddings:
        model.resize_token_embeddings(len(tokenizer))

    print(model.dtype)

    print("Evaluating Model...")
    avg_loss = evaluate_model(model, dataloader, device)
    print(f"Average Loss: {avg_loss:.4f}")

    return avg_loss
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: evaluate one causal LM on a CSV file of text rows.
    cli = argparse.ArgumentParser()
    cli.add_argument("--model_path", type=str, required=True, help="Path to the model.")
    cli.add_argument("--tokenizer_path", type=str, required=True, help="Path to the tokenizer.")
    cli.add_argument("--csv_path", type=str, required=True, help="Path to the CSV file with 'Text' column.")
    cli.add_argument("--seq_length", type=int, default=4096, help="Maximum sequence length.")
    cli.add_argument("--batch_size", type=int, default=2, help="Batch size for evaluation.")
    # Prefer GPU when one is visible to torch.
    default_device = "cuda" if torch.cuda.is_available() else "cpu"
    cli.add_argument("--device", type=str, default=default_device, help="Device to use.")

    opts = cli.parse_args()

    evaluate_single_model(
        opts.model_path,
        opts.tokenizer_path,
        opts.csv_path,
        opts.seq_length,
        opts.batch_size,
        opts.device,
    )
|
|
|