# quick-tokenizer-accuracy / serve_on_killarney.sh
# Author: Gül Sena Altıntaş
# Fixed hf issue, fixed OOM (commit 6383574)
#!/bin/bash
# Submit a Gradio app as a SLURM GPU job on the Killarney cluster, wait for it
# to reach RUNNING, and optionally forward its port to this machine.
#
# Cluster connection configuration
CLUSTER_HOST="killarney"
CLUSTER_USER="gsa"
# Job configuration
ACCOUNT="aip-craffel"            # SLURM account; also part of the project paths below
SCRIPT_NAME="gradio_job.slurm"   # job script generated locally, then scp'd to the cluster
APP_DIR="/project/$ACCOUNT/$CLUSTER_USER/quick-tokenizer-accuracy"
APP_PATH="app.py"                # relative to APP_DIR (sbatch is run from APP_DIR)
JOB_NAME="gradio-app"
GPU_TYPE="l40s"
NUM_GPUS=1
NODES=1
NTASKS_PER_NODE=1
CPUS_PER_TASK=4
### request more memory to run on more models
MEM="16G"
TIME="02:00:00"
GRADIO_PORT=7861                 # port the app listens on, on the compute node
script_location="$APP_DIR/$SCRIPT_NAME"   # where the job script lands on the cluster
ENV_PATH="/home/$CLUSTER_USER/tokenizers/.venv/bin/activate"   # venv on the cluster
OUTPUT_DIR="/project/$ACCOUNT/$CLUSTER_USER/.slurm"            # SLURM stdout dir (%j.out)
# Remove the locally generated job script when the script exits.
# BUG FIX: the original ended with `exit 0`, and calling `exit` inside an EXIT
# trap replaces the script's exit status — every `exit 1` in this script was
# silently turned into success. Capture and re-raise the real status instead.
cleanup() {
  local rc=$?
  echo "Cleaning up..."
  if [ -f "$SCRIPT_NAME" ]; then
    rm -f -- "$SCRIPT_NAME"
  fi
  exit "$rc"
}
# Cleanup on every exit path; convert INT/TERM into an exit so the EXIT
# trap fires once (trapping cleanup on INT/TERM directly would run it and
# then let the script continue).
trap cleanup EXIT
trap 'exit 130' INT TERM
# Generate SLURM job script locally
cat > "$SCRIPT_NAME" << EOF
#!/bin/bash
#SBATCH --job-name=$JOB_NAME
#SBATCH --gres=gpu:$GPU_TYPE:$NUM_GPUS
#SBATCH --nodes=$NODES
#SBATCH --ntasks-per-node=$NTASKS_PER_NODE
#SBATCH --cpus-per-task=$CPUS_PER_TASK
#SBATCH --mem=$MEM
#SBATCH --time=$TIME
#SBATCH --account=$ACCOUNT
#SBATCH --output=$OUTPUT_DIR/%j.out
# Print job info
echo "Job started on node: \$(hostname)"
echo "Job ID: \$SLURM_JOB_ID"
echo "Allocated nodes: \$SLURM_JOB_NODELIST"
echo "Working directory: \$(pwd)"
echo "Starting time: \$(date)"
source /home/$CLUSTER_USER/.bashrc
# Load necessary modules
module load slurm/killarney/24.05.7 StdEnv/2023 gcc/13.3 openmpi/5.0.3 cuda/12.6 python/3.10.13
# Activate virtual environment
source "${ENV_PATH}"
echo $HF_TOKEN
hf auth login --token $HF_TOKEN
hf auth whoami
# Set up environment
export GRADIO_SERVER_NAME="0.0.0.0"
export GRADIO_SERVER_PORT=$GRADIO_PORT
# Start Gradio app
echo "Starting Gradio app on port ${GRADIO_PORT}..."
gradio "${APP_PATH}" --watch-dirs "${APP_DIR}"
# python "${APP_PATH}" --watch-dirs "${APP_DIR}"
# Keep the job alive
echo "Gradio app finished at: \$(date)"
EOF
echo "Generated SLURM job script: $SCRIPT_NAME"
# Transfer the job script to the cluster and submit it.
if ! scp "$SCRIPT_NAME" "$CLUSTER_USER@$CLUSTER_HOST:$script_location"; then
  echo "Error: Failed to transfer job script to cluster" >&2
  exit 1
fi
echo "Submitting job to cluster..."
# BUG FIX: the original piped ssh into `tr` and then tested $?, which only
# reflects tr's status — an ssh/sbatch failure was invisible. Capture the
# output first, then strip CR/LF with parameter expansion. `ssh -t` was also
# dropped: sbatch needs no tty, and -t is what injected the \r in the first
# place (besides merging stderr into the captured output).
if ! JOB_ID=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" \
    "bash -l -c 'cd \"$APP_DIR\" && sbatch --parsable \"$script_location\"'"); then
  echo "Error: Failed to submit job to cluster" >&2
  exit 1
fi
JOB_ID=${JOB_ID//[$'\r\n']/}
if [ -z "$JOB_ID" ]; then
  echo "Error: sbatch returned no job ID" >&2
  exit 1
fi
echo "Job submitted with ID: $JOB_ID"
# Poll squeue until the job is RUNNING; bail out with diagnostics if it
# disappears or terminates.
echo "Monitoring job status from local machine..."
while true; do
  JOB_STATUS=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%T\" 2>/dev/null'")
  echo "Job status: $JOB_STATUS"
  if [ -z "$JOB_STATUS" ]; then
    echo "Error: Job $JOB_ID not found. It may have failed to start."
    echo "Checking job output..."
    # BUG FIX: output lives at $OUTPUT_DIR/<jobid>.out (see #SBATCH --output),
    # not ./<jobid>.out, and there is no separate .err file because --error
    # is never set (stderr is merged into the .out file).
    ssh "$CLUSTER_USER@$CLUSTER_HOST" "cat '${OUTPUT_DIR}/${JOB_ID}.out' 2>/dev/null"
    exit 1
  elif [ "$JOB_STATUS" = "RUNNING" ]; then
    echo "Job is now running!"
    break
  elif [ "$JOB_STATUS" = "PENDING" ]; then
    echo "Job is pending... (waiting for resources)"
    sleep 5
  else
    # Any terminal state (including COMPLETED — a server job that finished
    # this early did not serve anything) is treated as failure.
    if [[ "$JOB_STATUS" =~ ^(FAILED|CANCELLED|TIMEOUT|COMPLETED)$ ]]; then
      echo "Job ended with status: $JOB_STATUS"
      echo "Checking job output files..."
      ssh "$CLUSTER_USER@$CLUSTER_HOST" "cat '${OUTPUT_DIR}/${JOB_ID}.out' 2>/dev/null"
      exit 1
    fi
    sleep 5
  fi
done
# Get the allocated node and verify the Gradio process is up.
# BUG FIX: the original performed both the squeue node lookup and the remote
# ps check twice back to back (copy-paste duplication); each is now done once.
NODE=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%N\"'")
echo "Job (${JOB_ID}) is running on node: ${NODE}"
# Wait a moment for the Gradio app to start
echo "Waiting for Gradio app to initialize..."
sleep 10
# Check if Gradio is actually running on the allocated node (hop via the
# login node; compute nodes are not reachable directly).
echo "Checking if Gradio app started successfully..."
GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" \
  "bash -l -c 'ssh $NODE \"ps aux | grep gradio | grep -v grep\"' 2>/dev/null")
if [ -n "$GRADIO_CHECK" ]; then
  echo "✓ Gradio app appears to be running"
else
  echo "⚠ Warning: Gradio app may not have started properly"
  echo "Check the job output:"
  ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'tail \"${OUTPUT_DIR}/${JOB_ID}.out\"'"
fi
# Ask interactively whether to cancel the job; on "y", scancel it remotely.
# FIX: `read -n 1` leaves the cursor on the prompt line, so the next output
# ran onto it; emit a newline first (consistent with the prompt further down).
cancel_job() {
  read -p "Would you like to cancel the job? (y/n): " -n 1 -r
  echo ""
  if [[ $REPLY =~ ^[Yy]$ ]]; then
    ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'scancel ${JOB_ID}'"
  else
    echo "Job $JOB_ID is still running on $CLUSTER_HOST:$NODE"
  fi
}
# Optional port forwarding
read -p "Would you like to set up port forwarding now? (y/n): " -n 1 -r
echo ""
if [[ $REPLY =~ ^[Yy]$ ]]; then
# If GRADIO_PORT is in use locally, pick a random free port
if lsof -iTCP:"$GRADIO_PORT" -sTCP:LISTEN >/dev/null 2>&1; then
echo "Port $GRADIO_PORT is already in use locally — selecting a free one..."
# Free-port picker: comm -23 emits lines unique to the first input, i.e.
# candidate ports (1024-65535) minus locally LISTENing ports extracted from
# lsof ($9 is the NAME column, last ':'-field is the port). Both inputs are
# sorted lexically — consistent ordering is all comm requires. The final awk
# buffers the survivors and prints one at random.
LOCAL_PORT=$(comm -23 \
<(seq 1024 65535 | sort) \
<(lsof -nP -iTCP -sTCP:LISTEN | awk 'NR>1 {print $9}' | awk -F: '{print $NF}' | sort -u) \
| awk 'BEGIN{srand()} {ports[NR]=$0} END{print ports[int(rand()*NR)+1]}')
else
LOCAL_PORT="$GRADIO_PORT"
fi
echo "Using local port: $LOCAL_PORT"
echo "Setting up port forwarding... Open http://localhost:${LOCAL_PORT} in your browser to access the app."
# Foreground tunnel: local LOCAL_PORT -> compute node's GRADIO_PORT via the
# login node; -t holds an interactive shell open so the tunnel stays up
# until the user exits it.
ssh -L "${LOCAL_PORT}:${NODE}:${GRADIO_PORT}" "$CLUSTER_USER@$CLUSTER_HOST" \
-t "echo 'Port forwarding active: localhost:${LOCAL_PORT} -> ${NODE}:${GRADIO_PORT}'; bash"
echo ""
echo "Port forwarding ended."
cancel_job
else
echo "Skipping port forwarding."
# Connection info (unquoted delimiter: all $vars expand now)
cat <<EOF
=========================================
Gradio app should be running on:
Cluster: $CLUSTER_HOST
Node: $NODE
Port: $GRADIO_PORT
To access from your local machine:
ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST
Then open: http://localhost:$GRADIO_PORT
Alternative direct SSH with forwarding:
ssh -L $GRADIO_PORT:localhost:$GRADIO_PORT $CLUSTER_USER@$NODE.$CLUSTER_HOST
Check job status:
ssh $CLUSTER_USER@$CLUSTER_HOST \"'squeue -j $JOB_ID '\"
Cancel job:
ssh $CLUSTER_USER@$CLUSTER_HOST \"'scancel $JOB_ID '\"
=========================================
EOF
echo "Later you can run: ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST"
fi