|
|
import os |
|
|
import argparse |
|
|
import torch |
|
|
from torch.utils.data import Dataset, DataLoader |
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig |
|
|
from tqdm import tqdm |
|
|
import pandas as pd |
|
|
import torch.nn.functional as F |
|
|
|
|
|
class CSVDataset(Dataset):
    """Packs consecutive CSV text rows into fixed-length token sequences.

    Each sample concatenates up to ``rows_per_sample`` rows from the CSV's
    'Text' column, tokenized with an EOS token appended after every row,
    then truncated or padded to exactly ``seq_length`` tokens.
    """

    def __init__(self, filepath, tokenizer, seq_length, rows_per_sample):
        """
        Args:
            filepath: Path to a CSV file containing a 'Text' column.
            tokenizer: A HuggingFace-style tokenizer (callable on a list of
                strings, exposes eos/pad token attributes).
            seq_length: Exact token length of every returned sample.
            rows_per_sample: How many CSV rows are merged into one sample.
        """
        self.data = pd.read_csv(filepath)
        # Coerce to string and replace NaN with '' — pandas yields float NaN
        # for empty cells, which would crash len()/tokenization later.
        self.text_data = self.data['Text'].fillna('').astype(str).tolist()
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.rows_per_sample = rows_per_sample

        # Character cap applied before tokenization to bound tokenizer cost
        # on pathologically long rows.
        self.CAP_SAMPLE_LEN = 17500

        # Ensure the tokenizer has the special tokens this dataset relies on.
        # NOTE(review): adding tokens here grows the vocabulary; the caller
        # must resize the model's embeddings to match (see
        # evaluate_single_model).
        if self.tokenizer.eos_token is None:
            self.tokenizer.add_special_tokens({'eos_token': '<|endoftext|>'})

        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

        self.eos_token_id = self.tokenizer.eos_token_id
        self.pad_token_id = self.tokenizer.pad_token_id

    def __len__(self):
        # Ceiling division: the final sample may hold fewer rows.
        return (len(self.text_data) + self.rows_per_sample - 1) // self.rows_per_sample

    def __getitem__(self, idx):
        """Return one sample as a LongTensor of exactly ``seq_length`` ids."""
        start_idx = idx * self.rows_per_sample
        end_idx = min(start_idx + self.rows_per_sample, len(self.text_data))

        lines = self.text_data[start_idx:end_idx]

        # Cap each row's character length, cutting at the last space before
        # the cap when possible so a word is not split mid-token.
        truncated_lines = []
        for text in lines:
            if len(text) > self.CAP_SAMPLE_LEN:
                cut = text.rfind(' ', 0, self.CAP_SAMPLE_LEN)
                if cut < 0:
                    cut = self.CAP_SAMPLE_LEN
                text = text[:cut]
            truncated_lines.append(text)

        batch_encodings = self.tokenizer(
            truncated_lines,
            add_special_tokens=False,
            truncation=True,
            max_length=self.seq_length - 2,  # leave room for EOS separators
            return_tensors=None
        )

        # Concatenate the rows, separating each with an EOS token.
        input_ids_list = []
        for tokens in batch_encodings["input_ids"]:
            tokens.append(self.eos_token_id)
            input_ids_list.extend(tokens)

        # Guard: if every row tokenized to nothing (e.g. all-empty cells),
        # the [-1] checks below would raise IndexError on an empty list.
        if not input_ids_list:
            input_ids_list = [self.eos_token_id]

        # Ensure the stream ends with EOS before length adjustment.
        if input_ids_list[-1] != self.eos_token_id:
            input_ids_list.append(self.eos_token_id)

        if len(input_ids_list) > self.seq_length:
            # Trim from the end, then force the sequence to close with EOS.
            input_ids_list = input_ids_list[:self.seq_length]

            if input_ids_list[-1] != self.eos_token_id:
                input_ids_list[-1] = self.eos_token_id
        elif len(input_ids_list) < self.seq_length:
            # Pad to the fixed length; the final position stays EOS.
            padding_length = self.seq_length - len(input_ids_list)
            input_ids_list.extend([self.pad_token_id] * padding_length)

            input_ids_list[-1] = self.eos_token_id

        input_ids = torch.tensor(input_ids_list, dtype=torch.long)
        return input_ids
|
|
|
|
|
|
|
|
def evaluate_model(model, dataloader, device):
    """
    Evaluate the model batch by batch and print the losses for each batch.

    Args:
        model: Causal LM whose forward accepts ``labels`` and returns an
            object with a scalar ``.loss``.
        dataloader: Yields LongTensor batches of token ids.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Mean per-batch loss, or ``float('nan')`` when the dataloader is empty.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch_idx, input_ids in enumerate(tqdm(dataloader, desc="Evaluating Model")):
            input_ids = input_ids.to(device)

            # NOTE(review): labels == input_ids means padding positions also
            # contribute to the loss; mask pads with -100 in the labels if
            # that is not intended.
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss.item()
            total_loss += loss
            num_batches += 1

            print(f"Batch {batch_idx + 1} Loss: {loss:.4f}")

    # Guard against an empty dataloader, which would previously raise
    # ZeroDivisionError.
    if num_batches == 0:
        return float('nan')

    avg_loss = total_loss / num_batches
    return avg_loss
|
|
|
|
|
|
|
|
def evaluate_single_model(model_path, tokenizer_path, csv_path, seq_length, batch_size, device):
    """
    Evaluate a single model on the dataset and print losses for each batch.

    Args:
        model_path: Path or hub id of the causal LM to load (fp16).
        tokenizer_path: Path or hub id of the tokenizer.
        csv_path: CSV file with a 'Text' column.
        seq_length: Fixed token length of each sample.
        batch_size: Evaluation batch size.
        device: Device string, e.g. "cuda" or "cpu".

    Returns:
        Average loss over all batches.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    # CSVDataset may add eos/pad special tokens to the tokenizer, growing
    # its vocabulary — create it before sizing the model's embeddings.
    dataset = CSVDataset(csv_path, tokenizer, seq_length, rows_per_sample=50)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=4)

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
    ).to(device)

    # If the dataset added special tokens (e.g. '<|pad|>'), their ids lie
    # past the end of the pretrained embedding table and would index out of
    # range during the forward pass — grow the embeddings to match.
    if len(tokenizer) > model.get_input_embeddings().num_embeddings:
        model.resize_token_embeddings(len(tokenizer))

    print(model.dtype)

    print("Evaluating Model...")
    avg_loss = evaluate_model(model, dataloader, device)
    print(f"Average Loss: {avg_loss:.4f}")

    return avg_loss
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: evaluate one causal LM on a CSV file of text rows.
    cli = argparse.ArgumentParser()
    cli.add_argument("--model_path", type=str, required=True, help="Path to the model.")
    cli.add_argument("--tokenizer_path", type=str, required=True, help="Path to the tokenizer.")
    cli.add_argument("--csv_path", type=str, required=True, help="Path to the CSV file with 'Text' column.")
    cli.add_argument("--seq_length", type=int, default=4096, help="Maximum sequence length.")
    cli.add_argument("--batch_size", type=int, default=2, help="Batch size for evaluation.")
    # Prefer GPU when one is visible to torch.
    default_device = "cuda" if torch.cuda.is_available() else "cpu"
    cli.add_argument("--device", type=str, default=default_device, help="Device to use.")

    opts = cli.parse_args()

    evaluate_single_model(
        opts.model_path,
        opts.tokenizer_path,
        opts.csv_path,
        opts.seq_length,
        opts.batch_size,
        opts.device,
    )
|
|
|