""" Evaluation Script for TinyLlama Models This script helps evaluate the performance of a trained TinyLlama model. """ import os import argparse import torch import numpy as np from transformers import AutoModelForCausalLM, AutoTokenizer from datasets import load_dataset import logging from tqdm import tqdm logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) def parse_args(): parser = argparse.ArgumentParser(description="Evaluate a TinyLlama model") parser.add_argument( "--model_path", type=str, required=True, help="Path to the fine-tuned model directory" ) parser.add_argument( "--eval_data", type=str, required=True, help="Path to evaluation data file (JSON or CSV)" ) parser.add_argument( "--prompt_column", type=str, default="prompt", help="Column name containing the prompts" ) parser.add_argument( "--completion_column", type=str, default="completion", help="Column name containing the expected completions (optional)" ) parser.add_argument( "--max_new_tokens", type=int, default=256, help="Maximum number of tokens to generate" ) parser.add_argument( "--temperature", type=float, default=0.7, help="Sampling temperature" ) parser.add_argument( "--top_p", type=float, default=0.9, help="Top-p sampling parameter" ) parser.add_argument( "--output_file", type=str, default="evaluation_results.json", help="Path to save evaluation results" ) parser.add_argument( "--batch_size", type=int, default=4, help="Batch size for evaluation" ) return parser.parse_args() def generate_responses(model, tokenizer, prompts, args): """Generate responses from the model for a list of prompts""" responses = [] # Process in batches for i in tqdm(range(0, len(prompts), args.batch_size), desc="Generating responses"): batch_prompts = prompts[i:i+args.batch_size] inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True) inputs = {k: v.to(model.device) for k, v in inputs.items()} with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=args.max_new_tokens, temperature=args.temperature, top_p=args.top_p, do_sample=True, pad_token_id=tokenizer.eos_token_id ) # Decode and extract only the newly generated text for j, output in enumerate(outputs): input_length = inputs.input_ids[j].size(0) generated_text = tokenizer.decode(output[input_length:], skip_special_tokens=True) responses.append(generated_text) return responses def calculate_metrics(generated_texts, reference_texts): """Calculate evaluation metrics if reference texts are available""" metrics = {} try: from rouge import Rouge from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction import nltk nltk.download('punkt', quiet=True) # Calculate ROUGE scores rouge = Rouge() rouge_scores = rouge.get_scores(generated_texts, reference_texts, avg=True) # Calculate BLEU scores bleu_scores = [] for gen, ref in zip(generated_texts, reference_texts): gen_tokens = nltk.word_tokenize(gen.lower()) ref_tokens = [nltk.word_tokenize(ref.lower())] if gen_tokens and ref_tokens[0]: score = sentence_bleu(ref_tokens, gen_tokens, smoothing_function=SmoothingFunction().method1) bleu_scores.append(score) avg_bleu = np.mean(bleu_scores) if bleu_scores else 0 metrics = { "rouge": rouge_scores, "bleu": avg_bleu } except ImportError: logger.warning("Rouge or NLTK not installed. Skipping metric calculation.") metrics = {"note": "Metrics calculation skipped due to missing dependencies"} return metrics def main(): args = parse_args() # Load model and tokenizer logger.info(f"Loading model from: {args.model_path}") model = AutoModelForCausalLM.from_pretrained(args.model_path) tokenizer = AutoTokenizer.from_pretrained(args.model_path) # Move model to GPU if available device = "cuda" if torch.cuda.is_available() else "cpu" model = model.to(device) model.eval() # Load evaluation data logger.info(f"Loading evaluation data from: {args.eval_data}") data_extension = os.path.splitext(args.eval_data)[1].replace(".", "") eval_dataset = load_dataset(data_extension, data_files=args.eval_data)["train"] # Get prompts and expected completions prompts = eval_dataset[args.prompt_column] has_completions = args.completion_column in eval_dataset.column_names completions = eval_dataset[args.completion_column] if has_completions else None # Generate responses logger.info("Generating responses...") generated_texts = generate_responses(model, tokenizer, prompts, args) # Calculate metrics if completions are available metrics = {} if has_completions: logger.info("Calculating evaluation metrics...") metrics = calculate_metrics(generated_texts, completions) # Prepare results results = { "model_path": args.model_path, "samples": [ {"prompt": prompt, "generated": generated, "reference": ref} for prompt, generated, ref in zip( prompts, generated_texts, completions if has_completions else [""] * len(prompts) ) ], "metrics": metrics } # Save results import json with open(args.output_file, "w") as f: json.dump(results, f, indent=2) logger.info(f"Evaluation complete. Results saved to {args.output_file}") # Print summary if metrics and "rouge" in metrics: logger.info(f"ROUGE-1: {metrics['rouge']['rouge-1']['f']:.4f}") logger.info(f"ROUGE-2: {metrics['rouge']['rouge-2']['f']:.4f}") logger.info(f"ROUGE-L: {metrics['rouge']['rouge-l']['f']:.4f}") logger.info(f"BLEU: {metrics['bleu']:.4f}") if __name__ == "__main__": main()