#!/bin/bash
#
# Submit a Gradio app as a SLURM job on the Killarney cluster and
# optionally forward its port to the local machine.

# Cluster connection configuration
CLUSTER_HOST="killarney"
CLUSTER_USER="gsa"

# Job configuration
ACCOUNT="aip-craffel"
SCRIPT_NAME="gradio_job.slurm"
APP_DIR="/project/$ACCOUNT/$CLUSTER_USER/quick-tokenizer-accuracy"
APP_PATH="app.py"
JOB_NAME="gradio-app"
GPU_TYPE="l40s"
NUM_GPUS=1
NODES=1
NTASKS_PER_NODE=1
CPUS_PER_TASK=4
### Request more memory to run more models
MEM="16G"
TIME="02:00:00"
GRADIO_PORT=7861
script_location="$APP_DIR/$SCRIPT_NAME"
ENV_PATH="/home/$CLUSTER_USER/tokenizers/.venv/bin/activate"
OUTPUT_DIR="/project/$ACCOUNT/$CLUSTER_USER/.slurm"

# Clean up the locally generated job script on exit.
# Note: no `exit` here -- this also runs in the EXIT trap, where an explicit
# exit would overwrite the script's real exit status.
cleanup() {
    echo "Cleaning up..."
    if [ -f "$SCRIPT_NAME" ]; then
        rm "$SCRIPT_NAME"
    fi
}

# Set trap for cleanup on script exit
trap cleanup EXIT INT TERM

# Generate the SLURM job script locally.
# The heredoc delimiter is unquoted, so $VARIABLES are expanded now (on the
# local machine); \$VARIABLES are escaped and expanded on the compute node.
cat > "$SCRIPT_NAME" << EOF
#!/bin/bash
#SBATCH --job-name=$JOB_NAME
#SBATCH --gres=gpu:$GPU_TYPE:$NUM_GPUS
#SBATCH --nodes=$NODES
#SBATCH --ntasks-per-node=$NTASKS_PER_NODE
#SBATCH --cpus-per-task=$CPUS_PER_TASK
#SBATCH --mem=$MEM
#SBATCH --time=$TIME
#SBATCH --account=$ACCOUNT
#SBATCH --output=$OUTPUT_DIR/%j.out

# Print job info
echo "Job started on node: \$(hostname)"
echo "Job ID: \$SLURM_JOB_ID"
echo "Allocated nodes: \$SLURM_JOB_NODELIST"
echo "Working directory: \$(pwd)"
echo "Starting time: \$(date)"

source /home/$CLUSTER_USER/.bashrc

# Load necessary modules
module load slurm/killarney/24.05.7 StdEnv/2023 gcc/13.3 openmpi/5.0.3 cuda/12.6 python/3.10.13

# Activate virtual environment
source "${ENV_PATH}"

# Authenticate with Hugging Face (HF_TOKEN is taken from the local
# environment at generation time; it is not echoed to the job log)
hf auth login --token $HF_TOKEN
hf auth whoami

# Set up environment
export GRADIO_SERVER_NAME="0.0.0.0"
export GRADIO_SERVER_PORT=$GRADIO_PORT

# Start Gradio app in reload mode, watching the app directory
echo "Starting Gradio app on port ${GRADIO_PORT}..."
gradio "${APP_PATH}" --watch-dirs "${APP_DIR}"
# python "${APP_PATH}" --watch-dirs "${APP_DIR}"

echo "Gradio app finished at: \$(date)"
EOF

echo "Generated SLURM job script: $SCRIPT_NAME"

# Transfer the job script to the cluster
scp "$SCRIPT_NAME" "$CLUSTER_USER@$CLUSTER_HOST:$script_location"
if [ $? -ne 0 ]; then
    echo "Error: Failed to transfer job script to cluster"
    exit 1
fi

# Submit the job; ssh -t allocates a pty, so strip the trailing \r from the ID
echo "Submitting job to cluster..."
JOB_ID=$(ssh -t "$CLUSTER_USER@$CLUSTER_HOST" \
    "bash -l -c 'cd \"$APP_DIR\" && sbatch --parsable \"$script_location\"'" \
    | tr -d '\r\n')
if [ -z "$JOB_ID" ]; then
    echo "Error: Failed to submit job to cluster"
    exit 1
fi

echo "Job submitted with ID: $JOB_ID"

# Monitor job status from the local machine
echo "Monitoring job status from local machine..."
while true; do
    JOB_STATUS=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%T\" 2>/dev/null'")
    echo "Job status: $JOB_STATUS"
    if [ -z "$JOB_STATUS" ]; then
        echo "Error: Job $JOB_ID not found. It may have failed to start."
        echo "Checking job output..."
        ssh "$CLUSTER_USER@$CLUSTER_HOST" "cat \"$OUTPUT_DIR/${JOB_ID}.out\" 2>/dev/null"
        exit 1
    elif [ "$JOB_STATUS" = "RUNNING" ]; then
        echo "Job is now running!"
        break
    elif [ "$JOB_STATUS" = "PENDING" ]; then
        echo "Job is pending... (waiting for resources)"
        sleep 5
    else
        if [[ "$JOB_STATUS" =~ ^(FAILED|CANCELLED|TIMEOUT|COMPLETED)$ ]]; then
            echo "Job ended with status: $JOB_STATUS"
            echo "Checking job output files..."
ssh "$CLUSTER_USER@$CLUSTER_HOST" "cat ${JOB_ID}.out ${JOB_ID}.err 2>/dev/null" exit 1 fi sleep 5 fi done # Get the allocated node NODE=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%N\"'") echo "Job (${JOB_ID}) is running on node: ${NODE}" # Wait a moment for the Gradio app to start echo "Waiting for Gradio app to initialize..." sleep 10 # Check if Gradio is actually running echo "Checking if Gradio app started successfully..." GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'ssh $NODE \"ps aux | grep gradio | grep -v grep\"' 2>/dev/null") # Get NODE locally NODE=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" \ "bash -l -c 'squeue -j $JOB_ID -h -o \"%N\"'") # Check Gradio process on that node GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" \ "bash -l -c 'ssh $NODE \"ps aux | grep gradio | grep -v grep\"' 2>/dev/null") # Handle process check if [ -n "$GRADIO_CHECK" ]; then echo "✓ Gradio app appears to be running" else echo "⚠ Warning: Gradio app may not have started properly" echo "Check the job output:" ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'tail \"${OUTPUT_DIR}/${JOB_ID}.out\"'" fi cancel_job() { read -p "Would you like to cancel the job? (y/n): " -n 1 -r if [[ $REPLY =~ ^[Yy]$ ]]; then ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'scancel ${JOB_ID} '" else echo "Job $JOB_ID is still running on $CLUSTER_HOST:$NODE" fi } # Optional port forwarding read -p "Would you like to set up port forwarding now? (y/n): " -n 1 -r echo "" if [[ $REPLY =~ ^[Yy]$ ]]; then # If GRADIO_PORT is in use locally, pick a random free port if lsof -iTCP:"$GRADIO_PORT" -sTCP:LISTEN >/dev/null 2>&1; then echo "Port $GRADIO_PORT is already in use locally — selecting a free one..." LOCAL_PORT=$(comm -23 \ <(seq 1024 65535 | sort) \ <(lsof -nP -iTCP -sTCP:LISTEN | awk 'NR>1 {print $9}' | awk -F: '{print $NF}' | sort -u) \ | awk 'BEGIN{srand()} {ports[NR]=$0} END{print ports[int(rand()*NR)+1]}') else LOCAL_PORT="$GRADIO_PORT" fi echo "Using local port: $LOCAL_PORT" echo "Setting up port forwarding... Open http://localhost:${LOCAL_PORT} in your browser to access the app." ssh -L "${LOCAL_PORT}:${NODE}:${GRADIO_PORT}" "$CLUSTER_USER@$CLUSTER_HOST" \ -t "echo 'Port forwarding active: localhost:${LOCAL_PORT} -> ${NODE}:${GRADIO_PORT}'; bash" echo "" echo "Port forwarding ended." cancel_job else echo "Skipping port forwarding." # Connection info cat <