Spaces:

ahmedsqrd
/

model_trace

Runtime error

File size: 10,325 Bytes

de071e9

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import itertools
import os
from datasets import load_dataset
from tqdm import tqdm
import math
import matplotlib.pyplot as plt
import csv
from utils import interpolate_models
import time
import argparse
import glob
import gc

block_size = 2048
"""
Script for running ablation of tests on m2d2 dataset rather
than simply wikitext
"""


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


def load_model(model_name):
    return AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)


def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    os.environ["WANDB_MODE"] = "disabled"
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    model_arch = args.model_arch
    if model_arch == "llama":
        model_list = [
            "meta-llama/Llama-2-7b-hf",
            "meta-llama/Llama-2-7b-chat-hf",
            "meta-llama/CodeLlama-7b-Python-hf",
            "meta-llama/CodeLlama-7b-Instruct-hf",
            "codellama/CodeLlama-7b-hf",
            "lmsys/vicuna-7b-v1.5",
            "lmsys/vicuna-7b-v1.1",
            "EleutherAI/llemma_7b",
            "LLM360/Amber",
        ]
    elif model_arch == "olmo":
        model_list = [
            "/scr/ahmedah/olmo/step1000_4B_tokens/seed_0_4B",
            "/scr/ahmedah/olmo/step1000_4B_tokens/seed_42_4B",
        ]

    tokenizer = AutoTokenizer.from_pretrained(model_list[0])
    tokenizer.pad_token = tokenizer.eos_token

    test_cases = [
        {
            "test_name": folder_name,
            "json_dir": f"/juice4/scr4/nlp/model-tracing/m2d2_s2orc/{folder_name}",
            "save_dir": f"/juice4/scr4/nlp/model-tracing/m2d2_s2orc/results_{folder_name}",
            "columns_ignored": ["text", "added", "id", "source", "timestamp", "subdomain"],
        }
        for folder_name in [
            "AI",
            "CV",
            "ET",
            "IM",
            "mtrl-sci",
            "stat-mech",
            "AR",
            "CY",
            "IR",
            "NA",
            "str-el",
            "art",
            "DB",
            "FL",
            "supr-con",
            "CC",
            "DC",
            "GA",
            "LG",
            "phil",
            "CE",
            "dis-nn",
            "GL",
            "LO",
            "CG",
            "DL",
            "GR",
            "MA",
            "quant-gas",
            "CL",
            "DM",
            "GT",
            "mes-hall",
            "CO",
            "DS",
            "HC",
            "MM",
            "soft",
            "CR",
            "EP",
            "HE",
            "MS",
            "SR",
        ]
    ]

    for test_case in test_cases:
        test_name = test_case["test_name"]
        json_dir = test_case["json_dir"]
        save_dir = test_case["save_dir"]
        columns_ignored = ["text", "added", "id", "source", "subdomain"]

        json_files = glob.glob(f"{json_dir}/*.json")
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        for json_file in json_files:
            print(f"Processing {json_file}")

            eval_dataset = load_dataset("json", data_files=json_file)

            def tokenize_function(examples):
                return tokenizer(examples["text"])

            tokenized_datasets = eval_dataset.map(
                tokenize_function, batched=True, num_proc=4, remove_columns=columns_ignored
            )
            lm_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
                batch_size=1000,
                num_proc=8,
            )

            training_args = TrainingArguments(
                output_dir="./hf_results",
                per_device_eval_batch_size=15,
                do_eval=True,
                report_to=None,
                dataloader_num_workers=8,
                use_cpu=True,
            )
            alphas = [0.0, 0.3, 0.5, 0.7, 1.0]
            initial_model = load_model(model_list[0])
            trainer = Trainer(model=initial_model, args=training_args, eval_dataset=lm_datasets)
            eval_dataloader = trainer.get_test_dataloader(lm_datasets["train"])
            del initial_model

            model_pairs = list(itertools.combinations(enumerate(model_list), 2))

            base_dir = f"{save_dir}/{test_name}"
            os.makedirs(base_dir, exist_ok=True)
            imgs_dir = os.path.join(base_dir, "imgs")
            os.makedirs(imgs_dir, exist_ok=True)
            csv_dir = os.path.join(base_dir, "csv")
            os.makedirs(csv_dir, exist_ok=True)

            current_model_a, current_model_b = None, None
            current_model_a_name, current_model_b_name = None, None

            for (idx_a, model_a_name), (idx_b, model_b_name) in tqdm(
                model_pairs, desc="Model Interpolation"
            ):
                if idx_a < idx_b:
                    perplexities = []

                    if current_model_a is None or current_model_a_name != model_a_name:
                        if current_model_a is not None:
                            del current_model_a
                            torch.cuda.empty_cache()
                        current_model_a = load_model(model_a_name).to("cpu")
                        current_model_a_name = model_a_name

                    if current_model_b is None or current_model_b_name != model_b_name:
                        if current_model_b is not None:
                            del current_model_b
                            torch.cuda.empty_cache()
                        current_model_b = load_model(model_b_name).to("cpu")
                        current_model_b_name = model_b_name

                    with torch.no_grad():
                        for alpha in tqdm(
                            alphas,
                            desc=f" \n Alpha Perplexities for {model_a_name} and {model_b_name}",
                        ):
                            interpolated_model = interpolate_models(
                                current_model_a, current_model_b, alpha, model_arch=model_arch
                            )
                            interpolated_model = interpolated_model.half().to(device)

                            start_time = time.time()
                            losses = []

                            for batch in tqdm(eval_dataloader, desc=f"\n Evaluating {alpha}"):
                                input_ids = batch["input_ids"].to(device)
                                attention_mask = batch["attention_mask"].to(device)
                                labels = batch["labels"].to(device)

                                outputs = interpolated_model(
                                    input_ids=input_ids,
                                    attention_mask=attention_mask,
                                    labels=labels,
                                )
                                loss = outputs.loss
                                losses.append(loss.item())

                            loss_mean = sum(losses) / len(losses)
                            print(f"Loss mean: {loss_mean}")
                            end_time = time.time()
                            execution_time = end_time - start_time
                            print(f"Execution time base: {execution_time} seconds")

                            perplexity = math.exp(loss_mean)
                            perplexities.append(perplexity)

                            interpolated_model.to("cpu")
                            del interpolated_model, input_ids, attention_mask, labels, outputs, loss
                            torch.cuda.empty_cache()
                            gc.collect()

                    model_a_name = model_a_name.split("/")[-1]
                    model_b_name = model_b_name.split("/")[-1]
                    json_filename = os.path.splitext(os.path.basename(json_file))[0]
                    csv_filename = f"{csv_dir}/perplexities_{json_filename}.csv"
                    csv_header = ["Model Pair"] + [f"Alpha {alpha}" for alpha in alphas]

                    if not os.path.exists(csv_filename):
                        with open(csv_filename, "w", newline="") as csvfile:
                            writer = csv.writer(csvfile)
                            writer.writerow(csv_header)

                    with open(csv_filename, "a", newline="") as csvfile:
                        writer = csv.writer(csvfile)
                        model_pair = f"{model_a_name} vs {model_b_name}"
                        row = [model_pair] + perplexities
                        writer.writerow(row)

                    plt.figure(figsize=(8, 6))
                    plt.plot(alphas, perplexities)
                    plt.xlabel("Alpha")
                    plt.ylabel("Perplexity")
                    plt.title(f"{model_a_name} (Left) vs {model_b_name} (Right)")

                    plot_filename = (
                        f"alpha_vs_perplexity_{model_a_name}_vs_{model_b_name}_{json_filename}.png"
                    )
                    plot_path = f"{imgs_dir}/{plot_filename}"
                    plt.savefig(plot_path, dpi=300, bbox_inches="tight")
                    plt.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Model Interpolation")
    parser.add_argument(
        "--model_arch",
        choices=["llama", "olmo"],
        default="llama",
        help="default model architecture to use",
    )
    args = parser.parse_args()
    main(args)