#!/bin/bash
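#
# Launch a Gradio app as a SLURM job on a remote cluster, wait for it to start,
# and optionally forward its port back to this machine.
# Run this locally; it assumes SSH access to $CLUSTER_HOST and that HF_TOKEN is
# exported in the local environment (it is baked into the generated job script below).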

# Cluster connection configuration
CLUSTER_HOST="killarney"
CLUSTER_USER="gsa"  

# Job configuration
ACCOUNT="aip-craffel"
SCRIPT_NAME="gradio_job.slurm"
APP_DIR="/project/$ACCOUNT/$CLUSTER_USER/quick-tokenizer-accuracy"
APP_PATH="app.py"
JOB_NAME="gradio-app"
GPU_TYPE="l40s"
NUM_GPUS=1
NODES=1
NTASKS_PER_NODE=1
CPUS_PER_TASK=4
# Request more memory to run with more models
MEM="16G"
TIME="02:00:00"
GRADIO_PORT=7861
script_location="$APP_DIR/$SCRIPT_NAME"

ENV_PATH="/home/$CLUSTER_USER/tokenizers/.venv/bin/activate"
OUTPUT_DIR="/project/$ACCOUNT/$CLUSTER_USER/.slurm"
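# %j in the #SBATCH --output pattern below is replaced by the numeric job ID,
# so each job's log ends up at $OUTPUT_DIR/<jobid>.out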

# Function to cleanup temporary files
cleanup() {
    echo "Cleaning up..."
    if [ -f "$SCRIPT_NAME" ]; then
        rm "$SCRIPT_NAME"
    fi
}

# Run cleanup whenever the script exits (including on Ctrl-C) without overriding the exit code
trap cleanup EXIT

# Generate SLURM job script locally
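# The heredoc delimiter is unquoted, so unescaped variables are expanded here on
# the local machine; escaped ones (\$SLURM_JOB_ID, \$(hostname), ...) are left for
# the compute node to evaluate at runtime.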
cat > "$SCRIPT_NAME" << EOF
#!/bin/bash
#SBATCH --job-name=$JOB_NAME
#SBATCH --gres=gpu:$GPU_TYPE:$NUM_GPUS
#SBATCH --nodes=$NODES
#SBATCH --ntasks-per-node=$NTASKS_PER_NODE
#SBATCH --cpus-per-task=$CPUS_PER_TASK
#SBATCH --mem=$MEM
#SBATCH --time=$TIME
#SBATCH --account=$ACCOUNT
#SBATCH --output=$OUTPUT_DIR/%j.out

# Print job info
echo "Job started on node: \$(hostname)"
echo "Job ID: \$SLURM_JOB_ID"
echo "Allocated nodes: \$SLURM_JOB_NODELIST"
echo "Working directory: \$(pwd)"
echo "Starting time: \$(date)"

source /home/$CLUSTER_USER/.bashrc

# Load necessary modules
module load slurm/killarney/24.05.7 StdEnv/2023 gcc/13.3 openmpi/5.0.3 cuda/12.6 python/3.10.13

# Activate virtual environment
source "${ENV_PATH}"
# The HF token below is expanded from the local environment when this script is
# generated; avoid echoing it so it does not end up in the job log.
hf auth login --token $HF_TOKEN
hf auth whoami 

# Set up environment
export GRADIO_SERVER_NAME="0.0.0.0"
export GRADIO_SERVER_PORT=$GRADIO_PORT

# Start Gradio app
echo "Starting Gradio app on port ${GRADIO_PORT}..."
gradio "${APP_PATH}" --watch-dirs "${APP_DIR}"
# python "${APP_PATH}" --watch-dirs "${APP_DIR}"

# Keep the job alive
echo "Gradio app finished at: \$(date)"
EOF

echo "Generated SLURM job script: $SCRIPT_NAME"

# Transfer the job script to the cluster and submit it
scp "$SCRIPT_NAME" "$CLUSTER_USER@$CLUSTER_HOST:$script_location"
if [ $? -ne 0 ]; then
    echo "Error: Failed to transfer job script to cluster"
    exit 1
fi

echo "Submitting job to cluster..."

JOB_ID=$(ssh -t "$CLUSTER_USER@$CLUSTER_HOST" \
    "bash -l -c 'cd \"$APP_DIR\" && sbatch --parsable \"$script_location\"'" \
    | tr -d '\r\n')

if [ $? -ne 0 ] || [ -z "$JOB_ID" ]; then
    echo "Error: Failed to submit job to cluster"
    exit 1
fi

echo "Job submitted with ID: $JOB_ID"


# Monitor job status from local machine
echo "Monitoring job status from local machine..."
while true; do
    JOB_STATUS=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%T\" 2>/dev/null'")
    echo "Job status: $JOB_STATUS"
    
    if [ -z "$JOB_STATUS" ]; then
        echo "Error: Job $JOB_ID not found. It may have failed to start."
        echo "Checking job output..."
        ssh "$CLUSTER_USER@$CLUSTER_HOST" "ls -la ${JOB_ID}.* 2>/dev/null && echo 'Output files:' && cat ${JOB_ID}.out ${JOB_ID}.err 2>/dev/null"
        exit 1
    elif [ "$JOB_STATUS" = "RUNNING" ]; then
        echo "Job is now running!"
        break
    elif [ "$JOB_STATUS" = "PENDING" ]; then
        echo "Job is pending... (waiting for resources)"
        sleep 5
    else
        echo "Job status: $JOB_STATUS"
        if [[ "$JOB_STATUS" =~ ^(FAILED|CANCELLED|TIMEOUT|COMPLETED)$ ]]; then
            echo "Job ended with status: $JOB_STATUS"
            echo "Checking job output files..."
            ssh "$CLUSTER_USER@$CLUSTER_HOST" "cat ${JOB_ID}.out ${JOB_ID}.err 2>/dev/null"
            exit 1
        fi
        sleep 5
    fi
done

# Get the allocated node
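# (squeue -o "%N" prints the node list assigned to the job; for a single-node job this is just its hostname)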
NODE=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'squeue -j $JOB_ID -h -o \"%N\"'")
echo "Job (${JOB_ID}) is running on node: ${NODE}"

# Wait a moment for the Gradio app to start
echo "Waiting for Gradio app to initialize..."
sleep 10

# Check if Gradio is actually running
echo "Checking if Gradio app started successfully..."
GRADIO_CHECK=$(ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'ssh $NODE \"ps aux | grep gradio | grep -v grep\"' 2>/dev/null")

# Handle process check
if [ -n "$GRADIO_CHECK" ]; then
    echo "✓ Gradio app appears to be running"
else
    echo "⚠ Warning: Gradio app may not have started properly"
    echo "Check the job output:"
    ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'tail \"${OUTPUT_DIR}/${JOB_ID}.out\"'"
fi


cancel_job() {
    read -p "Would you like to cancel the job? (y/n): " -n 1 -r
    if [[ $REPLY =~ ^[Yy]$ ]]; then
        ssh "$CLUSTER_USER@$CLUSTER_HOST" "bash -l -c 'scancel  ${JOB_ID} '"
    else
        echo "Job $JOB_ID is still running on $CLUSTER_HOST:$NODE"
    fi
}

# Optional port forwarding
read -p "Would you like to set up port forwarding now? (y/n): " -n 1 -r
echo ""
if [[ $REPLY =~ ^[Yy]$ ]]; then
    # If GRADIO_PORT is in use locally, pick a random free port
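    # comm -23 keeps the candidate ports (1024-65535) that do not appear in the
    # list of locally listening ports reported by lsof; awk then picks one at random.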
    if lsof -iTCP:"$GRADIO_PORT" -sTCP:LISTEN >/dev/null 2>&1; then
        echo "Port $GRADIO_PORT is already in use locally — selecting a free one..."
        LOCAL_PORT=$(comm -23 \
            <(seq 1024 65535 | sort) \
            <(lsof -nP -iTCP -sTCP:LISTEN | awk 'NR>1 {print $9}' | awk -F: '{print $NF}' | sort -u) \
            | awk 'BEGIN{srand()} {ports[NR]=$0} END{print ports[int(rand()*NR)+1]}')
    else
        LOCAL_PORT="$GRADIO_PORT"
    fi

    echo "Using local port: $LOCAL_PORT"

    echo "Setting up port forwarding... Open http://localhost:${LOCAL_PORT} in your browser to access the app."
    ssh -L "${LOCAL_PORT}:${NODE}:${GRADIO_PORT}" "$CLUSTER_USER@$CLUSTER_HOST" \
        -t "echo 'Port forwarding active: localhost:${LOCAL_PORT} -> ${NODE}:${GRADIO_PORT}'; bash"


    echo ""
    echo "Port forwarding ended."
    cancel_job
else
    echo "Skipping port forwarding."
# Connection info
cat <<EOF

=========================================
Gradio app should be running on:
  Cluster: $CLUSTER_HOST
  Node: $NODE
  Port: $GRADIO_PORT

To access from your local machine:
  ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST
  Then open: http://localhost:$GRADIO_PORT

Alternative direct SSH with forwarding:
  ssh -L $GRADIO_PORT:localhost:$GRADIO_PORT $CLUSTER_USER@$NODE.$CLUSTER_HOST

Check job status:
  ssh $CLUSTER_USER@$CLUSTER_HOST \"'squeue -j $JOB_ID '\"

Cancel job:
  ssh $CLUSTER_USER@$CLUSTER_HOST \"'scancel $JOB_ID '\"
=========================================

EOF
    echo "Later you can run: ssh -L $GRADIO_PORT:$NODE:$GRADIO_PORT $CLUSTER_USER@$CLUSTER_HOST"
fi