Sync with upstream

Browse files

Files changed (9) hide show

activation/activation_kernels.cu +28 -7
activation/cuda_compat.h +3 -3
activation/dispatch_utils.h +48 -0
build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py +47 -0
tests/kernels/test_activation.py +17 -2
torch-ext/activation/__init__.py +5 -0
torch-ext/activation/layers.py +49 -0
torch-ext/torch_binding.cpp +3 -0
torch-ext/torch_binding.h +2 -0

activation/activation_kernels.cu CHANGED Viewed

@@ -9,8 +9,16 @@
 namespace vllm {
 // Activation and gating kernel template.
-template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
 __global__ void act_and_mul_kernel(
     scalar_t* __restrict__ out,          // [..., d]
     const scalar_t* __restrict__ input,  // [..., 2, d]
@@ -19,7 +27,7 @@ __global__ void act_and_mul_kernel(
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
     const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
     const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
-    out[token_idx * d + idx] = ACT_FN(x) * y;
   }
 }
@@ -55,16 +63,21 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
 }  // namespace vllm
 // Launch activation and gating kernel.
-#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL)                            \
   int d = input.size(-1) / 2;                                            \
   int64_t num_tokens = input.numel() / input.size(-1);                   \
   dim3 grid(num_tokens);                                                 \
   dim3 block(std::min(d, 1024));                                         \
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));      \
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();          \
   VLLM_DISPATCH_FLOATING_TYPES(                                          \
       input.scalar_type(), "act_and_mul_kernel", [&] {                   \
-        vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>>             \
             <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),       \
                                          input.data_ptr<scalar_t>(), d); \
       });
@@ -72,19 +85,27 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
 void silu_and_mul(torch::Tensor& out,    // [..., d]
                   torch::Tensor& input)  // [..., 2 * d]
 {
-  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
 }
 void gelu_and_mul(torch::Tensor& out,    // [..., d]
                   torch::Tensor& input)  // [..., 2 * d]
 {
-  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
 }
 void gelu_tanh_and_mul(torch::Tensor& out,    // [..., d]
                        torch::Tensor& input)  // [..., 2 * d]
 {
-  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
 }
 namespace vllm {

 namespace vllm {
+// Gated product of one (x, y) pair. When `act_first` is true the activation
+// ACT_FN is applied to x and the result is multiplied by y; otherwise ACT_FN
+// is applied to y and the result is multiplied by x.
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
+          bool act_first>
+__device__ __forceinline__ scalar_t compute(const scalar_t& x,
+                                            const scalar_t& y) {
+  // `act_first` is a template argument, so this condition is a compile-time
+  // constant for every instantiation.
+  if (act_first) {
+    return ACT_FN(x) * y;
+  }
+  return x * ACT_FN(y);
+}
 // Activation and gating kernel template.
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
+          bool act_first>
 __global__ void act_and_mul_kernel(
     scalar_t* __restrict__ out,          // [..., d]
     const scalar_t* __restrict__ input,  // [..., 2, d]
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
     const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
     const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
+    out[token_idx * d + idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
   }
 }
 }  // namespace vllm
 // Launch activation and gating kernel.
+// KERNEL is the name of an activation function template; it is instantiated
+// as KERNEL<scalar_t> for the dispatched floating-point type.
+// ACT_FIRST (bool) selects which operand the activation is applied to: true
+// computes KERNEL(x) * y, false computes x * KERNEL(y), where x and y are the
+// values the kernel loads from the first and second half of the last
+// dimension of `input`.
+// The macro expands to plain statements, so it must be used inside a void
+// function that has torch::Tensor `out` and `input` in scope. It declares the
+// locals `d`, `num_tokens`, `grid`, `block`, `device_guard` and `stream`, and
+// returns from the enclosing function without launching when `num_tokens` is
+// zero. The grid has `num_tokens` blocks of min(d, 1024) threads each.
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, ACT_FIRST)                 \
   int d = input.size(-1) / 2;                                            \
   int64_t num_tokens = input.numel() / input.size(-1);                   \
   dim3 grid(num_tokens);                                                 \
   dim3 block(std::min(d, 1024));                                         \
+  if (num_tokens == 0) {                                                 \
+    return;                                                              \
+  }                                                                      \
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));      \
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();          \
   VLLM_DISPATCH_FLOATING_TYPES(                                          \
       input.scalar_type(), "act_and_mul_kernel", [&] {                   \
+        vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>, ACT_FIRST>  \
             <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),       \
                                          input.data_ptr<scalar_t>(), d); \
       });
+// Host entry point: launches the gating kernel with vllm::silu_kernel applied
+// to the first half of the last dimension of `input`, multiplied element-wise
+// by the second half (ACT_FIRST = true). The result is written into `out`.
 void silu_and_mul(torch::Tensor& out,    // [..., d]
                   torch::Tensor& input)  // [..., 2 * d]
 {
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true);
+}
+// Host entry point: launches the gating kernel with vllm::silu_kernel applied
+// to the second half of the last dimension of `input`, multiplied
+// element-wise by the first half (ACT_FIRST = false). The result is written
+// into `out`.
+void mul_and_silu(torch::Tensor& out,    // [..., d]
+                  torch::Tensor& input)  // [..., 2 * d]
+{
+  // The difference between mul_and_silu and silu_and_mul is that mul_and_silu
+  // applies the silu to the latter half of the input instead of the former
+  // half.
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, false);
 }
+// Host entry point: launches the gating kernel with vllm::gelu_kernel applied
+// to the first half of the last dimension of `input`, multiplied element-wise
+// by the second half (ACT_FIRST = true). The result is written into `out`.
 void gelu_and_mul(torch::Tensor& out,    // [..., d]
                   torch::Tensor& input)  // [..., 2 * d]
 {
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, true);
 }
+// Host entry point: launches the gating kernel with vllm::gelu_tanh_kernel
+// applied to the first half of the last dimension of `input`, multiplied
+// element-wise by the second half (ACT_FIRST = true). The result is written
+// into `out`.
 void gelu_tanh_and_mul(torch::Tensor& out,    // [..., d]
                        torch::Tensor& input)  // [..., 2 * d]
 {
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel, true);
 }
 namespace vllm {

activation/cuda_compat.h CHANGED Viewed

@@ -4,10 +4,10 @@
   #include <hip/hip_runtime.h>
 #endif
-#ifndef USE_ROCM
-  #define WARP_SIZE 32
 #else
-  #define WARP_SIZE warpSize
 #endif
 #ifndef USE_ROCM

   #include <hip/hip_runtime.h>
 #endif
+// WARP_SIZE is a compile-time constant: 64 when building for ROCm with
+// __GFX9__ defined, 32 for every other target.
+#if defined(USE_ROCM) && defined(__GFX9__)
+  #define WARP_SIZE 64
 #else
+  #define WARP_SIZE 32
 #endif
 #ifndef USE_ROCM

activation/dispatch_utils.h CHANGED Viewed

@@ -6,6 +6,11 @@
 #include <torch/all.h>
 #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)  \
@@ -14,6 +19,35 @@
 #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
 #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...)   \
   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)     \
@@ -31,5 +65,19 @@
   AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)   \
   AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
 #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))

 #include <torch/all.h>
+// Need a special dispatch case macro since we will nest the FP8 dispatch
+// inside another dispatch. Instead of the usual 'scalar_t', this names the
+// dispatched type 'fp8_t', so code passed in __VA_ARGS__ must refer to the
+// FP8 type as 'fp8_t'.
+#define AT_DISPATCH_FP8_CASE(enum_type, ...) \
+  AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, fp8_t, __VA_ARGS__)
 #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)  \
 #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
+// ROCm devices might use either fn or fnuz, so set up dispatch table for both.
+// A host-based check at runtime will create a preferred FP8 type for ROCm
+// such that the correct kernel is dispatched.
+// VLLM_DISPATCH_CASE_FP8_TYPES goes through AT_DISPATCH_FP8_CASE (type named
+// 'fp8_t'); VLLM_DISPATCH_CASE_QUANT_TYPES uses the regular AT_DISPATCH_CASE
+// and covers the same FP8 types plus at::ScalarType::Char.
+#ifdef USE_ROCM
+  #define VLLM_DISPATCH_CASE_FP8_TYPES(...)                          \
+    AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
+    AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fnuz, __VA_ARGS__)
+  #define VLLM_DISPATCH_CASE_QUANT_TYPES(...)                      \
+    AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)   \
+    AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fnuz, __VA_ARGS__) \
+    AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)
+#else
+  #define VLLM_DISPATCH_CASE_FP8_TYPES(...) \
+    AT_DISPATCH_FP8_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
+  #define VLLM_DISPATCH_CASE_QUANT_TYPES(...)                    \
+    AT_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
+    AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)
+#endif
+// Dispatches on TYPE over the case list VLLM_DISPATCH_CASE_FP8_TYPES.
+// When using this dispatch macro, the type is 'fp8_t' not 'scalar_t'.
+// See AT_DISPATCH_FP8_CASE above.
+#define VLLM_DISPATCH_FP8_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FP8_TYPES(__VA_ARGS__))
+// Dispatches on TYPE over the case list VLLM_DISPATCH_CASE_QUANT_TYPES
+// (the FP8 type(s) plus Char), forwarding NAME to AT_DISPATCH_SWITCH.
+#define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__))
 #define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...)   \
   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)     \
   AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)   \
   AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
+// Case list covering Byte, Char, Short, Int and Long together with the
+// UInt16, UInt32 and UInt64 scalar types.
+#define VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(...) \
+  AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__)      \
+  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)        \
+  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::UInt16, __VA_ARGS__)     \
+  AT_DISPATCH_CASE(at::ScalarType::UInt32, __VA_ARGS__)     \
+  AT_DISPATCH_CASE(at::ScalarType::UInt64, __VA_ARGS__)
 #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
+// Dispatches on TYPE over the case list
+// VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES, forwarding NAME to
+// AT_DISPATCH_SWITCH.
+#define VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                              \
+      TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(__VA_ARGS__))

build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py CHANGED Viewed

@@ -5,6 +5,15 @@ from ._ops import ops
 class SiluAndMul(nn.Module):
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor):
@@ -14,8 +23,35 @@ class SiluAndMul(nn.Module):
         ops.silu_and_mul(out, x)
         return out
 class GeluAndMul(nn.Module):
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor):
@@ -38,6 +74,17 @@ class GeluTanhAndMul(nn.Module):
 class FatreluAndMul(nn.Module):
     can_torch_compile: bool = True
     def __init__(self, threshold: float = 0.0):

 class SiluAndMul(nn.Module):
+    """An activation function for SwiGLU.
+    The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
+    Shapes:
+        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
+        return: (num_tokens, d) or (batch_size, seq_len, d)
+    """
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor):
         ops.silu_and_mul(out, x)
         return out
+class MulAndSilu(CustomOp):
+    """An activation function for SwiGLU.
+    The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.
+    Shapes:
+        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
+        return: (num_tokens, d) or (batch_size, seq_len, d)
+    """
+    can_torch_compile: bool = True
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        output_shape = (x.shape[:-1] + (d, ))
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        self.mul_and_silu(out, x)
+        return out
 class GeluAndMul(nn.Module):
+    """An activation function for GeGLU.
+    The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
+    Shapes:
+        x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
+        return: (batch_size, seq_len, d) or (num_tokens, d)
+    """
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor):
 class FatreluAndMul(nn.Module):
+    """An activation function for FATReLU.
+    The function computes x -> FATReLU(x[:d]) * x[d:] where
+    d = x.shape[-1] // 2.
+    This is used in openbmb/MiniCPM-S-1B-sft.
+    Shapes:
+        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
+        return: (num_tokens, d) or (batch_size, seq_len, d)
+    """
     can_torch_compile: bool = True
     def __init__(self, threshold: float = 0.0):

tests/kernels/test_activation.py CHANGED Viewed

@@ -1,3 +1,6 @@
 import math
 import random
 from typing import Type
@@ -43,12 +46,19 @@ def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
     return F.silu(x[..., :d]) * x[..., d:]
 def gelu_and_mul(x: torch.Tensor, approximate: str) -> torch.Tensor:
     d = x.shape[-1] // 2
     return F.gelu(x[..., :d], approximate=approximate) * x[..., d:]
-@pytest.mark.parametrize("activation_name", ["silu", "gelu", "gelu_tanh", "fatrelu"])
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
@@ -67,11 +77,16 @@ def test_act_and_mul(
     torch.manual_seed(seed)
     torch.set_default_device(device)
     x = torch.randn(num_tokens, 2 * d, dtype=dtype)
-    if activation_name == "silu":
         torch_fn = silu_and_mul
         fn = activation.silu_and_mul
         op = activation.ops.silu_and_mul
         layer = activation.layers.SiluAndMul()
     elif activation_name == "gelu":
         torch_fn = lambda x: gelu_and_mul(x, "none")
         fn = activation.gelu_and_mul

+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 import random
 from typing import Type
     return F.silu(x[..., :d]) * x[..., d:]
+def mul_and_silu(x: torch.Tensor) -> torch.Tensor:
+    d = x.shape[-1] // 2
+    return x[..., :d] * F.silu(x[..., d:])
 def gelu_and_mul(x: torch.Tensor, approximate: str) -> torch.Tensor:
     d = x.shape[-1] // 2
     return F.gelu(x[..., :d], approximate=approximate) * x[..., d:]
+@pytest.mark.parametrize(
+    "activation_name", ["silu_and_mul", "mul_and_silu", "gelu", "gelu_tanh", "fatrelu"]
+)
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
     torch.manual_seed(seed)
     torch.set_default_device(device)
     x = torch.randn(num_tokens, 2 * d, dtype=dtype)
+    if activation_name == "silu_and_mul":
         torch_fn = silu_and_mul
         fn = activation.silu_and_mul
         op = activation.ops.silu_and_mul
         layer = activation.layers.SiluAndMul()
+    elif activation_name == "mul_and_silu":
+        torch_fn = mul_and_silu
+        fn = activation.mul_and_silu
+        op = activation.ops.mul_and_silu
+        layer = activation.layers.MulAndSilu()
     elif activation_name == "gelu":
         torch_fn = lambda x: gelu_and_mul(x, "none")
         fn = activation.gelu_and_mul

torch-ext/activation/__init__.py CHANGED Viewed

@@ -10,6 +10,11 @@ def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
     return out
 def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_and_mul(out, x)
     return out

     return out
+def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None:
+    ops.mul_and_silu(out, x)
+    return out
 def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
     ops.gelu_and_mul(out, x)
     return out

torch-ext/activation/layers.py CHANGED Viewed

@@ -5,6 +5,15 @@ from ._ops import ops
 class SiluAndMul(nn.Module):
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor):
@@ -15,7 +24,36 @@ class SiluAndMul(nn.Module):
         return out
 class GeluAndMul(nn.Module):
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor):
@@ -38,6 +76,17 @@ class GeluTanhAndMul(nn.Module):
 class FatreluAndMul(nn.Module):
     can_torch_compile: bool = True
     def __init__(self, threshold: float = 0.0):

 class SiluAndMul(nn.Module):
+    """An activation function for SwiGLU.
+    The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
+    Shapes:
+        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
+        return: (num_tokens, d) or (batch_size, seq_len, d)
+    """
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor):
         return out
+class MulAndSilu(nn.Module):
+    """An activation function for SwiGLU.
+    The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2.
+    Shapes:
+        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
+        return: (num_tokens, d) or (batch_size, seq_len, d)
+    """
+    can_torch_compile: bool = True
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        ops.mul_and_silu(out, x)
+        return out
 class GeluAndMul(nn.Module):
+    """An activation function for GeGLU.
+    The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
+    Shapes:
+        x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
+        return: (batch_size, seq_len, d) or (num_tokens, d)
+    """
     can_torch_compile: bool = True
     def forward(self, x: torch.Tensor):
 class FatreluAndMul(nn.Module):
+    """An activation function for FATReLU.
+    The function computes x -> FATReLU(x[:d]) * x[d:] where
+    d = x.shape[-1] // 2.
+    This is used in openbmb/MiniCPM-S-1B-sft.
+    Shapes:
+        x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
+        return: (num_tokens, d) or (batch_size, seq_len, d)
+    """
     can_torch_compile: bool = True
     def __init__(self, threshold: float = 0.0):

torch-ext/torch_binding.cpp CHANGED Viewed

@@ -9,6 +9,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
   ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul);
   // Activation function used in GeGLU with `none` approximation.
   ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
   ops.impl("gelu_and_mul", torch::kCUDA, &gelu_and_mul);

   ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
   ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul);
+  ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()");
+  ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu);
   // Activation function used in GeGLU with `none` approximation.
   ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
   ops.impl("gelu_and_mul", torch::kCUDA, &gelu_and_mul);

torch-ext/torch_binding.h CHANGED Viewed

@@ -4,6 +4,8 @@
 void silu_and_mul(torch::Tensor &out, torch::Tensor &input);
 void gelu_and_mul(torch::Tensor &out, torch::Tensor &input);
 void gelu_tanh_and_mul(torch::Tensor &out, torch::Tensor &input);

 void silu_and_mul(torch::Tensor &out, torch::Tensor &input);
+void mul_and_silu(torch::Tensor& out, torch::Tensor& input);
 void gelu_and_mul(torch::Tensor &out, torch::Tensor &input);
 void gelu_tanh_and_mul(torch::Tensor &out, torch::Tensor &input);