#!/bin/bash
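#
# Submit a Gradio app as a SLURM job on the Killarney cluster, wait until the
# job is running, and optionally forward the app's port to this machine.
#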
# Cluster connection configuration
CLUSTER_HOST="killarney"
CLUSTER_USER="gsa"
# Job configuration
ACCOUNT="aip-craffel"
SCRIPT_NAME="gradio_job.slurm"
APP_DIR="/project/$ACCOUNT/$CLUSTER_USER/quick-tokenizer-accuracy"
APP_PATH="app.py"
JOB_NAME="gradio-app"
GPU_TYPE="l40s"
NUM_GPUS=1
NODES=1
NTASKS_PER_NODE=1
CPUS_PER_TASK=4
# Request more memory if you need to run more models
MEM="16G"
TIME="02:00:00"
GRADIO_PORT=7861
script_location="$APP_DIR/$SCRIPT_NAME"
ENV_PATH="/home/$CLUSTER_USER/tokenizers/.venv/bin/activate"
OUTPUT_DIR="/project/$ACCOUNT/$CLUSTER_USER/.slurm"
# Function to cleanup temporary files
cleanup() {
    echo "Cleaning up..."
    if [ -f "$SCRIPT_NAME" ]; then
        rm "$SCRIPT_NAME"
    fi
}
# Run cleanup on exit; exit explicitly on Ctrl-C / termination so the EXIT trap
# fires without the handler masking the script's real exit code
trap cleanup EXIT
trap 'exit 130' INT TERM
# Generate SLURM job script locally
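# Note: the heredoc delimiter is unquoted, so $VARS below expand now on this
# machine, while \$VARS are written literally and expand when the job runs on
# the cluster.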
cat > "$SCRIPT_NAME" << EOF
#!/bin/bash
#SBATCH --job-name=$JOB_NAME
#SBATCH --gres=gpu:$GPU_TYPE:$NUM_GPUS
#SBATCH --nodes=$NODES
#SBATCH --ntasks-per-node=$NTASKS_PER_NODE
#SBATCH --cpus-per-task=$CPUS_PER_TASK
#SBATCH --mem=$MEM
#SBATCH --time=$TIME
#SBATCH --account=$ACCOUNT
#SBATCH --output=$OUTPUT_DIR/%j.out
# Print job info
echo "Job started on node: \$(hostname)"
echo "Job ID: \$SLURM_JOB_ID"
echo "Allocated nodes: \$SLURM_JOB_NODELIST"
echo "Working directory: \$(pwd)"
echo "Starting time: \$(date)"
source /home/$CLUSTER_USER/.bashrc
# Load necessary modules
module load slurm/killarney/24.05.7 StdEnv/2023 gcc/13.3 openmpi/5.0.3 cuda/12.6 python/3.10.13
# Activate virtual environment
source "${ENV_PATH}"
# Authenticate with Hugging Face; the token is expected to be exported in
# ~/.bashrc on the cluster (escaped so it expands on the compute node and is
# never written into the generated script or echoed to the log)
hf auth login --token "\$HF_TOKEN"
hf auth whoami
# Set up environment
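# GRADIO_SERVER_NAME=0.0.0.0 makes the app listen on all interfaces so it is
# reachable from outside the compute node (e.g. through the SSH tunnel)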
export GRADIO_SERVER_NAME="0.0.0.0"
export GRADIO_SERVER_PORT=$GRADIO_PORT
# Start Gradio app
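# The gradio CLI runs the app in hot-reload mode, restarting it when files
# under --watch-dirs change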
echo "Starting Gradio app on port ${GRADIO_PORT}..."
gradio "${APP_PATH}" --watch-dirs "${APP_DIR}"
# python "${APP_PATH}" --watch-dirs "${APP_DIR}"
# The foreground gradio process keeps the job alive; this runs once it exits
echo "Gradio app finished at: \$(date)"
EOF
echo "Generated SLURM job script: $SCRIPT_NAME"
# Transfer the job script to the cluster and submit it
scp "$SCRIPT_NAME" "$CLUSTER_USER@$CLUSTER_HOST:$script_location"
if [ $? -ne 0 ]; then
    echo "Error: Failed to transfer job script to cluster"
    exit 1
fi
echo "Submitting job to cluster..."
JOB_ID=$(ssh -t "$CLUSTER_USER@$CLUSTER_HOST" \
    "bash -l -c 'cd \"$APP_DIR\" && sbatch --parsable \"$script_location\"'" \
    | tr -d '\r\n')
# $? after a pipeline only reflects tr, so check the captured ID instead
if [ -z "$JOB_ID" ]; then
    echo "Error: Failed to submit job to cluster"
    exit 1
fi
echo "Job submitted with ID: $JOB_ID"
# Monitor job status from local machine
echo "Monitoring job status from local machine..."
while true; do
    JOB_STATUS=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%T\" 2>/dev/null'")
    echo "Job status: $JOB_STATUS"
    if [ -z "$JOB_STATUS" ]; then
        echo "Error: Job $JOB_ID not found. It may have failed to start."
        echo "Checking job output..."
        ssh "$CLUSTER_USER@$CLUSTER_HOST" "cat ${OUTPUT_DIR}/${JOB_ID}.out 2>/dev/null"
        exit 1
    elif [ "$JOB_STATUS" = "RUNNING" ]; then
        echo "Job is now running!"
        break
    elif [ "$JOB_STATUS" = "PENDING" ]; then
        echo "Job is pending... (waiting for resources)"
        sleep 5
    elif [[ "$JOB_STATUS" =~ ^(FAILED|CANCELLED|TIMEOUT|COMPLETED)$ ]]; then
        echo "Job ended with status: $JOB_STATUS"
        echo "Checking job output file..."
        ssh "$CLUSTER_USER@$CLUSTER_HOST" "cat ${OUTPUT_DIR}/${JOB_ID}.out 2>/dev/null"
        exit 1
    else
        sleep 5
    fi
done
# Get the allocated node
NODE=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%N\"'")
echo "Job (${JOB_ID}) is running on node: ${NODE}"
# Wait a moment for the Gradio app to start
echo "Waiting for Gradio app to initialize..."
sleep 10
# Check if Gradio is actually running
echo "Checking if Gradio app started successfully..."
GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'ssh $NODE \"ps aux | grep gradio | grep -v grep\"' 2>/dev/null")
# Handle process check
if [ -n "$GRADIO_CHECK" ]; then
    echo "✓ Gradio app appears to be running"
else
    echo "⚠ Warning: Gradio app may not have started properly"
    echo "Check the job output:"
    ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'tail \"${OUTPUT_DIR}/${JOB_ID}.out\"'"
fi
cancel_job() {
    read -p "Would you like to cancel the job? (y/n): " -n 1 -r
    echo ""
    if [[ $REPLY =~ ^[Yy]$ ]]; then
        ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'scancel ${JOB_ID}'"
    else
        echo "Job $JOB_ID is still running on $CLUSTER_HOST:$NODE"
    fi
}
# Optional port forwarding
read -p "Would you like to set up port forwarding now? (y/n): " -n 1 -r
echo ""
if [[ $REPLY =~ ^[Yy]$ ]]; then
    # If GRADIO_PORT is in use locally, pick a random free port
    if lsof -iTCP:"$GRADIO_PORT" -sTCP:LISTEN >/dev/null 2>&1; then
        echo "Port $GRADIO_PORT is already in use locally - selecting a free one..."
        LOCAL_PORT=$(comm -23 \
            <(seq 1024 65535 | sort) \
            <(lsof -nP -iTCP -sTCP:LISTEN | awk 'NR>1 {print $9}' | awk -F: '{print $NF}' | sort -u) \
            | awk 'BEGIN{srand()} {ports[NR]=$0} END{print ports[int(rand()*NR)+1]}')
    else
        LOCAL_PORT="$GRADIO_PORT"
    fi
echo "Using local port: $LOCAL_PORT"
echo "Setting up port forwarding... Open http://localhost:${LOCAL_PORT} in your browser to access the app."
ssh -L "${LOCAL_PORT}:${NODE}:${GRADIO_PORT}" "$CLUSTER_USER@$CLUSTER_HOST" \
-t "echo 'Port forwarding active: localhost:${LOCAL_PORT} -> ${NODE}:${GRADIO_PORT}'; bash"
echo ""
echo "Port forwarding ended."
cancel_job
else
echo "Skipping port forwarding."
# Connection info
cat <<EOF
=========================================
Gradio app should be running on:
  Cluster: $CLUSTER_HOST
  Node:    $NODE
  Port:    $GRADIO_PORT

To access it from your local machine:
  ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST
  Then open: http://localhost:$GRADIO_PORT

Alternative direct SSH with forwarding:
  ssh -L $GRADIO_PORT:localhost:$GRADIO_PORT $CLUSTER_USER@$NODE.$CLUSTER_HOST

Check job status:
  ssh $CLUSTER_USER@$CLUSTER_HOST "squeue -j $JOB_ID"

Cancel job:
  ssh $CLUSTER_USER@$CLUSTER_HOST "scancel $JOB_ID"
=========================================
EOF
echo "Later you can run: ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST"
fi