#!/bin/bash
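
# Generate a SLURM batch script for a Gradio app, copy it to the cluster,
# submit it, wait for the job to start running, and optionally forward the
# Gradio port from the compute node back to this machine.

# Configuration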
CLUSTER_HOST="killarney"
CLUSTER_USER="gsa"

ACCOUNT="aip-craffel"
SCRIPT_NAME="gradio_job.slurm"
APP_DIR="/project/$ACCOUNT/$CLUSTER_USER/quick-tokenizer-accuracy"
APP_PATH="app.py"
JOB_NAME="gradio-app"
GPU_TYPE="l40s"
NUM_GPUS=1
NODES=1
NTASKS_PER_NODE=1
CPUS_PER_TASK=4
MEM="16G"
TIME="02:00:00"
GRADIO_PORT=7861
script_location="$APP_DIR/$SCRIPT_NAME"

ENV_PATH="/home/$CLUSTER_USER/tokenizers/.venv/bin/activate"
OUTPUT_DIR="/project/$ACCOUNT/$CLUSTER_USER/.slurm"

# Remove the locally generated job script on exit.
cleanup() {
    echo "Cleaning up..."
    if [ -f "$SCRIPT_NAME" ]; then
        rm "$SCRIPT_NAME"
    fi
}

# INT/TERM exit explicitly so the EXIT trap runs cleanup without
# masking the script's real exit status.
trap cleanup EXIT
trap 'exit 130' INT TERM
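
# Generate the SLURM batch script. The heredoc delimiter is unquoted, so plain
# $VARS are filled in locally right now, while \$-escaped variables and
# \$(...) substitutions are left for the compute node to evaluate at run time.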
cat > "$SCRIPT_NAME" << EOF
#!/bin/bash
#SBATCH --job-name=$JOB_NAME
#SBATCH --gres=gpu:$GPU_TYPE:$NUM_GPUS
#SBATCH --nodes=$NODES
#SBATCH --ntasks-per-node=$NTASKS_PER_NODE
#SBATCH --cpus-per-task=$CPUS_PER_TASK
#SBATCH --mem=$MEM
#SBATCH --time=$TIME
#SBATCH --account=$ACCOUNT
#SBATCH --output=$OUTPUT_DIR/%j.out

# Print job info
echo "Job started on node: \$(hostname)"
echo "Job ID: \$SLURM_JOB_ID"
echo "Allocated nodes: \$SLURM_JOB_NODELIST"
echo "Working directory: \$(pwd)"
echo "Starting time: \$(date)"

source /home/$CLUSTER_USER/.bashrc

# Load necessary modules
module load slurm/killarney/24.05.7 StdEnv/2023 gcc/13.3 openmpi/5.0.3 cuda/12.6 python/3.10.13

# Activate virtual environment
source "${ENV_PATH}"

# Log in to Hugging Face. \$HF_TOKEN is expanded from the local environment
# when this script is generated; do not echo it into the job log.
hf auth login --token $HF_TOKEN
hf auth whoami

# Set up environment
export GRADIO_SERVER_NAME="0.0.0.0"
export GRADIO_SERVER_PORT=$GRADIO_PORT

# Start Gradio app (blocks until the app exits)
echo "Starting Gradio app on port ${GRADIO_PORT}..."
gradio "${APP_PATH}" --watch-dirs "${APP_DIR}"
# Alternative without auto-reload:
# python "${APP_PATH}"

echo "Gradio app finished at: \$(date)"
EOF

echo "Generated SLURM job script: $SCRIPT_NAME"

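# Copy the generated job script into the app directory on the cluster.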
scp "$SCRIPT_NAME" "$CLUSTER_USER@$CLUSTER_HOST:$script_location"
if [ $? -ne 0 ]; then
    echo "Error: Failed to transfer job script to cluster"
    exit 1
fi

echo "Submitting job to cluster..."

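# Submit the job. --parsable makes sbatch print just the job ID, and tr strips
# the carriage return that the forced pseudo-terminal (ssh -t) adds to the output.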
JOB_ID=$(ssh -t "$CLUSTER_USER@$CLUSTER_HOST" \
    "bash -l -c 'cd \"$APP_DIR\" && sbatch --parsable \"$script_location\"'" \
    | tr -d '\r\n')

if [ -z "$JOB_ID" ]; then
    echo "Error: Failed to submit job to cluster"
    exit 1
fi

echo "Job submitted with ID: $JOB_ID"

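# Poll squeue until the job is RUNNING; bail out if it vanishes from the queue
# or reaches a terminal state.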
echo "Monitoring job status from local machine..."
while true; do
    JOB_STATUS=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%T\" 2>/dev/null'")
    echo "Job status: $JOB_STATUS"

    if [ -z "$JOB_STATUS" ]; then
        echo "Error: Job $JOB_ID not found. It may have failed to start."
        echo "Checking job output..."
        ssh "$CLUSTER_USER@$CLUSTER_HOST" "ls -la ${OUTPUT_DIR}/${JOB_ID}.out 2>/dev/null && echo 'Output file:' && cat ${OUTPUT_DIR}/${JOB_ID}.out 2>/dev/null"
        exit 1
    elif [ "$JOB_STATUS" = "RUNNING" ]; then
        echo "Job is now running!"
        break
    elif [ "$JOB_STATUS" = "PENDING" ]; then
        echo "Job is pending... (waiting for resources)"
        sleep 5
    elif [[ "$JOB_STATUS" =~ ^(FAILED|CANCELLED|TIMEOUT|COMPLETED)$ ]]; then
        echo "Job ended with status: $JOB_STATUS"
        echo "Checking job output files..."
        ssh "$CLUSTER_USER@$CLUSTER_HOST" "cat ${OUTPUT_DIR}/${JOB_ID}.out 2>/dev/null"
        exit 1
    else
        sleep 5
    fi
done

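# Look up which compute node the job landed on (needed for port forwarding).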
NODE=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%N\"'")
echo "Job (${JOB_ID}) is running on node: ${NODE}"

echo "Waiting for Gradio app to initialize..."
sleep 10

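# Check for a running gradio process on the compute node, hopping through the
# login node with a nested ssh.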
echo "Checking if Gradio app started successfully..."
GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'ssh $NODE \"ps aux | grep gradio | grep -v grep\"' 2>/dev/null")

if [ -n "$GRADIO_CHECK" ]; then
    echo "✓ Gradio app appears to be running"
else
    echo "⚠ Warning: Gradio app may not have started properly"
    echo "Check the job output:"
    ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'tail \"${OUTPUT_DIR}/${JOB_ID}.out\"'"
fi

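# Offer to cancel the job; called after the port-forwarding session ends.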
cancel_job() {
    read -p "Would you like to cancel the job? (y/n): " -n 1 -r
    echo ""
    if [[ $REPLY =~ ^[Yy]$ ]]; then
        ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'scancel ${JOB_ID}'"
    else
        echo "Job $JOB_ID is still running on $CLUSTER_HOST:$NODE"
    fi
}

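# Optionally open an SSH tunnel: localhost:LOCAL_PORT -> login node -> NODE:GRADIO_PORT.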
read -p "Would you like to set up port forwarding now? (y/n): " -n 1 -r
echo ""
if [[ $REPLY =~ ^[Yy]$ ]]; then
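    # If the default port is busy locally, pick a random free TCP port by
    # diffing the candidate port range against the ports currently listening.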
    if lsof -iTCP:"$GRADIO_PORT" -sTCP:LISTEN >/dev/null 2>&1; then
        echo "Port $GRADIO_PORT is already in use locally; selecting a free one..."
        LOCAL_PORT=$(comm -23 \
            <(seq 1024 65535 | sort) \
            <(lsof -nP -iTCP -sTCP:LISTEN | awk 'NR>1 {print $9}' | awk -F: '{print $NF}' | sort -u) \
            | awk 'BEGIN{srand()} {ports[NR]=$0} END{print ports[int(rand()*NR)+1]}')
    else
        LOCAL_PORT="$GRADIO_PORT"
    fi

    echo "Using local port: $LOCAL_PORT"

    echo "Setting up port forwarding... Open http://localhost:${LOCAL_PORT} in your browser to access the app."
    ssh -L "${LOCAL_PORT}:${NODE}:${GRADIO_PORT}" "$CLUSTER_USER@$CLUSTER_HOST" \
        -t "echo 'Port forwarding active: localhost:${LOCAL_PORT} -> ${NODE}:${GRADIO_PORT}'; bash"

    echo ""
    echo "Port forwarding ended."
    cancel_job
else
    echo "Skipping port forwarding."

    cat <<EOF

=========================================
Gradio app should be running on:
  Cluster: $CLUSTER_HOST
  Node:    $NODE
  Port:    $GRADIO_PORT

To access from your local machine:
  ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST
Then open: http://localhost:$GRADIO_PORT

Alternative direct SSH with forwarding:
  ssh -L $GRADIO_PORT:localhost:$GRADIO_PORT $CLUSTER_USER@$NODE.$CLUSTER_HOST

Check job status:
  ssh $CLUSTER_USER@$CLUSTER_HOST "squeue -j $JOB_ID"

Cancel job:
  ssh $CLUSTER_USER@$CLUSTER_HOST "scancel $JOB_ID"
=========================================

EOF
    echo "Later you can run: ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST"
fi