# Launch the SGLang HTTP server on 0.0.0.0:80.
#
# Environment variables read:
#   MODEL_ID        value passed to --model-path
#   KV_CACHE_DTYPE  value passed to --kv-cache-dtype
#   TP_SIZE         value passed to both --tensor-parallel-size and
#                   --expert-parallel-size
#   QUANT_METHOD    optional; when non-empty it is passed to --quantization,
#                   when empty/unset the --quantization flag is omitted
#
# The exit status of this block is the exit status of the python3 process.

# Announce which mode is used; ${QUANT_METHOD:-} keeps the check safe even
# when the variable is unset and the shell runs with `set -u`.
if [ -z "${QUANT_METHOD:-}" ]; then
  echo "Using native precision"
else
  echo "Using ${QUANT_METHOD} quantization schema"
fi

# Single invocation for both modes. The ${QUANT_METHOD:+...} expansion is
# intentionally left unquoted on the outside: it expands to nothing when
# QUANT_METHOD is empty/unset, and to exactly two words
# (--quantization <value>) otherwise; the inner quotes protect the value
# from word-splitting and globbing.
python3 -m sglang.launch_server \
  --model-path "$MODEL_ID" \
  --kv-cache-dtype "$KV_CACHE_DTYPE" \
  --tensor-parallel-size "$TP_SIZE" \
  --expert-parallel-size "$TP_SIZE" \
  ${QUANT_METHOD:+--quantization "$QUANT_METHOD"} \
  --enable-torch-compile \
  --enable-ep-moe \
  --tool-call-parser qwen25 \
  --host 0.0.0.0 \
  --port 80