import json
import os
import sys
from datetime import datetime

from src.evaluation.perplexity_eval import evaluate_perplexity, create_perplexity_result
from src.envs import EVAL_RESULTS_PATH, API, RESULTS_REPO


def run_dynamic_perplexity_eval(model_name, revision="main", precision="float16"):
    """
    Run a perplexity evaluation for the given model, save the result locally,
    and attempt to upload it to the results dataset.

    Returns a tuple: (True, perplexity_score) on success, or (False, error_message) on failure.
    """
    try:
        sys.stderr.write(f"Starting dynamic evaluation for {model_name}\n")
        sys.stderr.flush()

        # Run evaluation
        sys.stderr.write("Running perplexity evaluation...\n")
        sys.stderr.flush()
        perplexity_score = evaluate_perplexity(model_name, revision)
        sys.stderr.write(f"Perplexity evaluation completed: {perplexity_score}\n")
        sys.stderr.flush()

        # Create result structure
        result = create_perplexity_result(model_name, revision, precision, perplexity_score)
        sys.stderr.write(f"Created result structure: {result}\n")
        sys.stderr.flush()

        # Save result file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        result_filename = f"results_{model_name.replace('/', '_')}_{timestamp}.json"

        # Create directory structure, mirroring the "org/model" layout of Hub model ids
        org, model = model_name.split("/") if "/" in model_name else ("", model_name)
        result_dir = os.path.join(EVAL_RESULTS_PATH, org) if org else EVAL_RESULTS_PATH
        os.makedirs(result_dir, exist_ok=True)

        result_path = os.path.join(result_dir, result_filename)
        sys.stderr.write(f"Saving result to: {result_path}\n")
        sys.stderr.flush()

        with open(result_path, "w") as f:
            json.dump(result, f, indent=2)

        sys.stderr.write("Result file saved locally\n")
        sys.stderr.flush()

        # Upload to Hugging Face dataset
        try:
            sys.stderr.write(f"Uploading to HF dataset: {RESULTS_REPO}\n")
            sys.stderr.flush()
            API.upload_file(
                path_or_fileobj=result_path,
                # Assumes EVAL_RESULTS_PATH contains "eval-results/"; everything after it
                # becomes the path inside the dataset repo.
                path_in_repo=result_path.split("eval-results/")[1],
                repo_id=RESULTS_REPO,
                repo_type="dataset",
                commit_message=f"Add perplexity results for {model_name}",
            )
            sys.stderr.write("Upload completed successfully\n")
            sys.stderr.flush()
        except Exception as upload_error:
            sys.stderr.write(f"Upload failed: {upload_error}\n")
            sys.stderr.flush()
            # Don't fail the whole process if upload fails

        return True, perplexity_score

    except Exception as e:
        import traceback

        sys.stderr.write(f"Error in run_dynamic_perplexity_eval: {e}\n")
        sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
        sys.stderr.flush()
        return False, str(e)
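

# --- Hypothetical usage sketch (not part of the original module) ---
# A minimal command-line entry point, assuming this file is run from the project
# root so the `src.*` imports resolve. The argument names below are illustrative,
# not an established interface of the project.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Run a dynamic perplexity evaluation for a single model."
    )
    parser.add_argument("model_name", help="Hub model id, e.g. 'org/model'")
    parser.add_argument("--revision", default="main", help="Model revision to evaluate")
    parser.add_argument("--precision", default="float16", help="Precision label recorded in the result")
    args = parser.parse_args()

    ok, payload = run_dynamic_perplexity_eval(args.model_name, args.revision, args.precision)
    if ok:
        print(f"Perplexity: {payload}")
    else:
        print(f"Evaluation failed: {payload}", file=sys.stderr)
        sys.exit(1)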