#!/bin/bash
# sft_save/run_1024_binary.sh
# --- initialize conda ---
source /root/miniconda3/etc/profile.d/conda.sh
# --- activate env ---
conda activate bpe_v2
# Limit visible GPUs
# export CUDA_VISIBLE_DEVICES=6
export CUDA_VISIBLE_DEVICES=0
# Positional args:
# 1) base directory containing per-class folders with train.csv/dev.csv/test.csv
# 2) learning rate
# 3) base output directory
# 4) wandb project name
# 5) optional comma-separated seeds (default: 42)
# 6) optional num_train_epochs (default: 8)
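#
# Example invocation (all paths and names below are illustrative):
#   bash run_1024_binary.sh /root/data/binary_tasks 3e-5 /root/outputs/sft_binary my-wandb-project 42,1337 8
#
# Expected data layout (one sub-folder per binary classification task):
#   ${data_root}/<task_name>/{train.csv,dev.csv,test.csv}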
data_root=$1
lr=$2
output_root=$3
project_name=$4
seeds=${5:-42}   # fall back to the documented default seed
epochs=${6:-8}   # honor positional arg 6; defaults to the previously hard-coded value
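# Fail fast on missing required arguments (guard added here; not part of the original run flow)
if [ -z "${data_root}" ] || [ -z "${lr}" ] || [ -z "${output_root}" ] || [ -z "${project_name}" ]; then
  echo "Usage: $0 <data_root> <lr> <output_root> <wandb_project> [seeds] [num_train_epochs]" >&2
  exit 1
fi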
#vocab=117M
# Model / tokenizer pairs to sweep
MODELS=(
  # "/root/NaN/dna-tokenizer/pretrain/models/base_2048/checkpoint-100000"
  # "/root/NaN/dna-tokenizer/pretrain/models/len2_2048/checkpoint-100000"
  # "/root/NaN/dna-tokenizer/pretrain/models/len2_3072/checkpoint-100000"
  # "/root/NaN/dna-tokenizer/pretrain/models/base_3072/checkpoint-100000"
  "/root/NaN/dna-tokenizer/pretrain/models/base_4096/checkpoint-100000"
  # "/root/NaN/dna-tokenizer/pretrain/models/model_len2_4096/checkpoint-100000"
)
TOKENIZERS=(
  # "/root/NaN/dna-tokenizer/baseline_bpe/vocab_2048/2048_tokenizer.json"
  # "/root/NaN/dna-tokenizer/merge_bpe/vocab_2048/merge_tokenizer_unigram_len2.json"
  # "/root/NaN/dna-tokenizer/merge_bpe/vocab_3072/merge_tokenizer_unigram_len2.json"
  # "/root/NaN/dna-tokenizer/baseline_bpe/vocab_3072/3072_tokenizer.json"
  "/root/NaN/dna-tokenizer/baseline_bpe/vocab_4096/4096_tokenizer.json"
  # "/root/NaN/dna-tokenizer/merge_bpe/vocab_4096/merge_tokenizer_unigram_len2.json"
)
# MODEL_NAMES=("base_3072" "base_4096")
MODEL_NAMES=("base_4096")
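# Sanity check (added guard): the three arrays are consumed by a shared index below,
# so a length mismatch would silently pair a model with the wrong tokenizer
if [ "${#MODELS[@]}" -ne "${#TOKENIZERS[@]}" ] || [ "${#MODELS[@]}" -ne "${#MODEL_NAMES[@]}" ]; then
  echo "MODELS, TOKENIZERS, and MODEL_NAMES must have the same length" >&2
  exit 1
fi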
IFS=',' read -ra SEED_LIST <<< "${seeds}"
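# e.g. seeds="42,1337" yields SEED_LIST=(42 1337); each seed launches an independent run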
for dataset_path in "${data_root}"/*; do
  [ -d "${dataset_path}" ] || continue
  dataset_name=$(basename "${dataset_path}")
  echo "Running fine-tune for ${dataset_name} from ${dataset_path}"
  for idx in "${!MODELS[@]}"; do
    model=${MODELS[$idx]}
    tokenizer=${TOKENIZERS[$idx]}
    model_name=${MODEL_NAMES[$idx]}
    for seed in "${SEED_LIST[@]}"; do
      run_name="hg38_${model_name}_binary_${dataset_name}_${lr}_seed${seed}"
      # Note: output_dir gets a per-seed suffix so --overwrite_output_dir True
      # cannot clobber the results of an earlier seed in the sweep
      torchrun --nproc_per_node=1 \
        --master_port=${MASTER_PORT:-29500} \
        /root/NaN/dna-tokenizer/SFT/train.py \
        --model_name_or_path "${model}" \
        --tokenizer_path "${tokenizer}" \
        --trust_remote_code True \
        --data_path "${dataset_path}" \
        --kmer -1 \
        --run_name "${run_name}" \
        --model_max_length 200 \
        --per_device_train_batch_size 128 \
        --per_device_eval_batch_size 128 \
        --gradient_accumulation_steps 1 \
        --learning_rate "${lr}" \
        --num_train_epochs "${epochs}" \
        --fp16 \
        --save_steps 2000 \
        --output_dir "${output_root}/${dataset_name}/${model_name}/${lr}/seed${seed}" \
        --evaluation_strategy steps \
        --eval_steps 2000 \
        --warmup_steps 30 \
        --logging_steps 100000 \
        --overwrite_output_dir True \
        --log_level info \
        --seed "${seed}" \
        --find_unused_parameters False \
        --project_name "${project_name}"
    done
  done
done