#!/bin/bash
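
# Fine-tune the pretrained DNA BPE model on every dataset directory under a
# data root, sweeping over one or more random seeds.
#
# Usage: <this script> <data_root> <lr> <output_root> <project_name> <seeds>
#   <seeds> is a comma-separated list, e.g. "42,43,44".
#
# Optional strict mode (a suggested addition, not in the original script):
# set -euo pipefail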

# Activate the conda environment used for training.
source /root/miniconda3/etc/profile.d/conda.sh
conda activate bpe_v2

# Run on a single GPU.
export CUDA_VISIBLE_DEVICES=0
|
# Positional arguments.
data_root=$1       # directory containing one sub-directory per dataset
lr=$2              # learning rate
output_root=$3     # root directory for checkpoints and logs
project_name=$4    # experiment-tracking project name
seeds=$5           # comma-separated random seeds, e.g. "42,43,44"
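
# Minimal argument guard (an addition, not in the original script): fail fast
# with a usage message when any of the five arguments is missing.
if [ "$#" -ne 5 ]; then
    echo "Usage: $0 <data_root> <lr> <output_root> <project_name> <seeds>" >&2
    exit 1
fi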

# Parallel arrays: MODELS[i] is fine-tuned with TOKENIZERS[i] and reported
# under MODEL_NAMES[i].
MODELS=(
    "/root/NaN/dna-tokenizer/pretrain/models/base_4096/checkpoint-100000"
)
TOKENIZERS=(
    "/root/NaN/dna-tokenizer/baseline_bpe/vocab_4096/4096_tokenizer.json"
)
MODEL_NAMES=("base_4096")

# Split the comma-separated seed list into an array.
IFS=',' read -ra SEED_LIST <<< "${seeds}"
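
# Optional sanity check (an addition, not in the original script): the three
# arrays are indexed together below, so their lengths must match.
if [ "${#MODELS[@]}" -ne "${#TOKENIZERS[@]}" ] || [ "${#MODELS[@]}" -ne "${#MODEL_NAMES[@]}" ]; then
    echo "MODELS, TOKENIZERS and MODEL_NAMES must have the same length" >&2
    exit 1
fi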
|
|
# Loop over every dataset directory under data_root.
for dataset_path in "${data_root}"/*; do
    [ -d "${dataset_path}" ] || continue
    dataset_name=$(basename "${dataset_path}")
    echo "Running fine-tune for ${dataset_name} from ${dataset_path}"
|
|
    # Walk the model/tokenizer/name triples in lockstep.
    for idx in "${!MODELS[@]}"; do
        model=${MODELS[$idx]}
        tokenizer=${TOKENIZERS[$idx]}
        model_name=${MODEL_NAMES[$idx]}
|
|
        # Launch one single-GPU fine-tuning run per seed.
        for seed in "${SEED_LIST[@]}"; do
            run_name="hg38_${model_name}_binary_${dataset_name}_${lr}_seed${seed}"
            torchrun --nproc_per_node=1 \
                --master_port=${MASTER_PORT:-29500} \
                /root/NaN/dna-tokenizer/SFT/train.py \
                --model_name_or_path "${model}" \
                --tokenizer_path "${tokenizer}" \
                --trust_remote_code True \
                --data_path "${dataset_path}" \
                --kmer -1 \
                --run_name "${run_name}" \
                --model_max_length 200 \
                --per_device_train_batch_size 128 \
                --per_device_eval_batch_size 128 \
                --gradient_accumulation_steps 1 \
                --learning_rate "${lr}" \
                --num_train_epochs 8 \
                --fp16 \
                --save_steps 2000 \
                --output_dir "${output_root}/${dataset_name}/${model_name}/${lr}" \
                --evaluation_strategy steps \
                --eval_steps 2000 \
                --warmup_steps 30 \
                --logging_steps 100000 \
                --overwrite_output_dir True \
                --log_level info \
                --seed "${seed}" \
                --find_unused_parameters False \
                --project_name "${project_name}"
        done
    done
done
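
# Example invocation (illustrative script name and values, not from the
# original script):
#   bash finetune_sweep.sh /data/benchmarks 3e-5 /root/outputs dna-bpe "42,43,44"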
|
|