diff --git a/build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py b/build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py index ddb37490dad9d8ffcbeb13ed06b33f03fef8ed78..1c4f207354093c6ef83eb5d7f3a5a3b22b95d357 100644 --- a/build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py +++ b/build/torch26-cxx11-cu118-x86_64-linux/activation/__init__.py @@ -10,6 +10,11 @@ def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: return out +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: ops.gelu_and_mul(out, x) return out diff --git a/build/torch26-cxx11-cu118-x86_64-linux/activation/_activation_be5bedb.abi3.so b/build/torch26-cxx11-cu118-x86_64-linux/activation/_activation_be5bedb.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..c1e52a91b4fa56b4ff39c854b33497b094135599 --- /dev/null +++ b/build/torch26-cxx11-cu118-x86_64-linux/activation/_activation_be5bedb.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b6ba32ecc6fc898df3b0cebee85e9afc6881749fe58142280f051ca3332d913 +size 2546864 diff --git a/build/torch26-cxx11-cu118-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so b/build/torch26-cxx11-cu118-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so deleted file mode 100755 index 0603eccc9144bee8f9704c4236947e42c905096d..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu118-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b925dc27b6a9afd5b6d11e454275222c531a92f7ca27958ac81a78c580665e4d -size 2448088 diff --git a/build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py b/build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py index 6cfb9cfa80b63852c1a9a8641b25616ce4caffd8..0110324ade19f59f705c61d5c21912c958e92e96 100644 --- a/build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py +++ b/build/torch26-cxx11-cu118-x86_64-linux/activation/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _activation_e99cc09_dirty -ops = torch.ops._activation_e99cc09_dirty +from . import _activation_be5bedb +ops = torch.ops._activation_be5bedb def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_activation_e99cc09_dirty::{op_name}" \ No newline at end of file + return f"_activation_be5bedb::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py b/build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py index dea45935f51421e8ee87b05430c2e95840cb4ef8..45b31181ffb80509a85d729a7f7ee86fc2cf014a 100644 --- a/build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py +++ b/build/torch26-cxx11-cu118-x86_64-linux/activation/layers.py @@ -5,6 +5,15 @@ from ._ops import ops class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -15,7 +24,36 @@ class SiluAndMul(nn.Module): return out +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -38,6 +76,17 @@ class GeluTanhAndMul(nn.Module): class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def __init__(self, threshold: float = 0.0): diff --git a/build/torch26-cxx11-cu124-x86_64-linux/activation/__init__.py b/build/torch26-cxx11-cu124-x86_64-linux/activation/__init__.py index ddb37490dad9d8ffcbeb13ed06b33f03fef8ed78..1c4f207354093c6ef83eb5d7f3a5a3b22b95d357 100644 --- a/build/torch26-cxx11-cu124-x86_64-linux/activation/__init__.py +++ b/build/torch26-cxx11-cu124-x86_64-linux/activation/__init__.py @@ -10,6 +10,11 @@ def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: return out +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: ops.gelu_and_mul(out, x) return out diff --git a/build/torch26-cxx11-cu124-x86_64-linux/activation/_activation_be5bedb.abi3.so b/build/torch26-cxx11-cu124-x86_64-linux/activation/_activation_be5bedb.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..f45a6ffcf3f11e3b24919496e213a61acb258d2a --- /dev/null +++ b/build/torch26-cxx11-cu124-x86_64-linux/activation/_activation_be5bedb.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:331dcb3900d5e47a11d3577cdbac54f15a0b6e14910239293323c1d9e4eb9f49 +size 2616928 diff --git a/build/torch26-cxx11-cu124-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so b/build/torch26-cxx11-cu124-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so deleted file mode 100755 index 494cce9f6166100fdb10f021911228b1cbfa2bdd..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu124-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cfdbe510752b57a8dc4671f744bb0a2da5b1646e0b9a19fec02f1505ba044c8c -size 2509960 diff --git a/build/torch26-cxx11-cu124-x86_64-linux/activation/_ops.py b/build/torch26-cxx11-cu124-x86_64-linux/activation/_ops.py index 6cfb9cfa80b63852c1a9a8641b25616ce4caffd8..0110324ade19f59f705c61d5c21912c958e92e96 100644 --- a/build/torch26-cxx11-cu124-x86_64-linux/activation/_ops.py +++ b/build/torch26-cxx11-cu124-x86_64-linux/activation/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _activation_e99cc09_dirty -ops = torch.ops._activation_e99cc09_dirty +from . import _activation_be5bedb +ops = torch.ops._activation_be5bedb def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. 
""" - return f"_activation_e99cc09_dirty::{op_name}" \ No newline at end of file + return f"_activation_be5bedb::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu124-x86_64-linux/activation/layers.py b/build/torch26-cxx11-cu124-x86_64-linux/activation/layers.py index dea45935f51421e8ee87b05430c2e95840cb4ef8..45b31181ffb80509a85d729a7f7ee86fc2cf014a 100644 --- a/build/torch26-cxx11-cu124-x86_64-linux/activation/layers.py +++ b/build/torch26-cxx11-cu124-x86_64-linux/activation/layers.py @@ -5,6 +5,15 @@ from ._ops import ops class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -15,7 +24,36 @@ class SiluAndMul(nn.Module): return out +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -38,6 +76,17 @@ class GeluTanhAndMul(nn.Module): class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def __init__(self, threshold: float = 0.0): diff --git a/build/torch26-cxx11-cu126-x86_64-linux/activation/__init__.py b/build/torch26-cxx11-cu126-x86_64-linux/activation/__init__.py index ddb37490dad9d8ffcbeb13ed06b33f03fef8ed78..1c4f207354093c6ef83eb5d7f3a5a3b22b95d357 100644 --- a/build/torch26-cxx11-cu126-x86_64-linux/activation/__init__.py +++ b/build/torch26-cxx11-cu126-x86_64-linux/activation/__init__.py @@ -10,6 +10,11 @@ def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: return out +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: ops.gelu_and_mul(out, x) return out diff --git a/build/torch26-cxx11-cu126-x86_64-linux/activation/_activation_be5bedb.abi3.so b/build/torch26-cxx11-cu126-x86_64-linux/activation/_activation_be5bedb.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..12f5777398872e7a3d93ab936e42ade8eeec3213 --- /dev/null +++ b/build/torch26-cxx11-cu126-x86_64-linux/activation/_activation_be5bedb.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ce11492b9675a44afb3b896ed80e425f2a47e29481c4aad9c4a6ac59520f011 +size 2621472 diff --git a/build/torch26-cxx11-cu126-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so b/build/torch26-cxx11-cu126-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so deleted file mode 100755 index d18a35d3e459fa1ecfc1ca166e55cb6ac118a6bb..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu126-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:70e544ad6448a5576d26147f48403f3e9e593f4a2e24167dc8acb81ce3b7932e -size 2518600 diff --git a/build/torch26-cxx11-cu126-x86_64-linux/activation/_ops.py b/build/torch26-cxx11-cu126-x86_64-linux/activation/_ops.py index 6cfb9cfa80b63852c1a9a8641b25616ce4caffd8..0110324ade19f59f705c61d5c21912c958e92e96 100644 --- a/build/torch26-cxx11-cu126-x86_64-linux/activation/_ops.py +++ b/build/torch26-cxx11-cu126-x86_64-linux/activation/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _activation_e99cc09_dirty -ops = torch.ops._activation_e99cc09_dirty +from . import _activation_be5bedb +ops = torch.ops._activation_be5bedb def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_activation_e99cc09_dirty::{op_name}" \ No newline at end of file + return f"_activation_be5bedb::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu126-x86_64-linux/activation/layers.py b/build/torch26-cxx11-cu126-x86_64-linux/activation/layers.py index dea45935f51421e8ee87b05430c2e95840cb4ef8..45b31181ffb80509a85d729a7f7ee86fc2cf014a 100644 --- a/build/torch26-cxx11-cu126-x86_64-linux/activation/layers.py +++ b/build/torch26-cxx11-cu126-x86_64-linux/activation/layers.py @@ -5,6 +5,15 @@ from ._ops import ops class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -15,7 +24,36 @@ class SiluAndMul(nn.Module): return out +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -38,6 +76,17 @@ class GeluTanhAndMul(nn.Module): class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def __init__(self, threshold: float = 0.0): diff --git a/build/torch26-cxx98-cu118-x86_64-linux/activation/__init__.py b/build/torch26-cxx98-cu118-x86_64-linux/activation/__init__.py index ddb37490dad9d8ffcbeb13ed06b33f03fef8ed78..1c4f207354093c6ef83eb5d7f3a5a3b22b95d357 100644 --- a/build/torch26-cxx98-cu118-x86_64-linux/activation/__init__.py +++ b/build/torch26-cxx98-cu118-x86_64-linux/activation/__init__.py @@ -10,6 +10,11 @@ def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: return out +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: ops.gelu_and_mul(out, x) return out diff --git a/build/torch26-cxx98-cu118-x86_64-linux/activation/_activation_be5bedb.abi3.so b/build/torch26-cxx98-cu118-x86_64-linux/activation/_activation_be5bedb.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..056de26936949cc36baf3caa9c4212d730da81f7 --- /dev/null +++ b/build/torch26-cxx98-cu118-x86_64-linux/activation/_activation_be5bedb.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:434bd1ae43b7cbdb10d86b82da9a237ec05ef9d9fb4fc15cdc9096d3d5ed3fa7 +size 2539352 diff --git a/build/torch26-cxx98-cu118-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so b/build/torch26-cxx98-cu118-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so deleted file mode 100755 index 13989de7ff0a055c8e40e1e1f4d0a9ed9197c1fa..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu118-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:60fd224c33657558f03be5be57cc8d35ade23225b1abd71557b170c8a7010cd1 -size 2440576 diff --git a/build/torch26-cxx98-cu118-x86_64-linux/activation/_ops.py b/build/torch26-cxx98-cu118-x86_64-linux/activation/_ops.py index 
6cfb9cfa80b63852c1a9a8641b25616ce4caffd8..0110324ade19f59f705c61d5c21912c958e92e96 100644 --- a/build/torch26-cxx98-cu118-x86_64-linux/activation/_ops.py +++ b/build/torch26-cxx98-cu118-x86_64-linux/activation/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _activation_e99cc09_dirty -ops = torch.ops._activation_e99cc09_dirty +from . import _activation_be5bedb +ops = torch.ops._activation_be5bedb def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_activation_e99cc09_dirty::{op_name}" \ No newline at end of file + return f"_activation_be5bedb::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu118-x86_64-linux/activation/layers.py b/build/torch26-cxx98-cu118-x86_64-linux/activation/layers.py index dea45935f51421e8ee87b05430c2e95840cb4ef8..45b31181ffb80509a85d729a7f7ee86fc2cf014a 100644 --- a/build/torch26-cxx98-cu118-x86_64-linux/activation/layers.py +++ b/build/torch26-cxx98-cu118-x86_64-linux/activation/layers.py @@ -5,6 +5,15 @@ from ._ops import ops class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -15,7 +24,36 @@ class SiluAndMul(nn.Module): return out +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -38,6 +76,17 @@ class GeluTanhAndMul(nn.Module): class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def __init__(self, threshold: float = 0.0): diff --git a/build/torch26-cxx98-cu124-x86_64-linux/activation/__init__.py b/build/torch26-cxx98-cu124-x86_64-linux/activation/__init__.py index ddb37490dad9d8ffcbeb13ed06b33f03fef8ed78..1c4f207354093c6ef83eb5d7f3a5a3b22b95d357 100644 --- a/build/torch26-cxx98-cu124-x86_64-linux/activation/__init__.py +++ b/build/torch26-cxx98-cu124-x86_64-linux/activation/__init__.py @@ -10,6 +10,11 @@ def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: return out +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: ops.gelu_and_mul(out, x) return out diff --git a/build/torch26-cxx98-cu124-x86_64-linux/activation/_activation_be5bedb.abi3.so b/build/torch26-cxx98-cu124-x86_64-linux/activation/_activation_be5bedb.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..c31190f8f2be87dbb5d5a9c497c68cea2258fded --- /dev/null +++ b/build/torch26-cxx98-cu124-x86_64-linux/activation/_activation_be5bedb.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53ddfb42466bfe01feb98348f5c2d6beefd589aeb3dec4c5c36609e11a6bde4c +size 2605136 diff --git a/build/torch26-cxx98-cu124-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so b/build/torch26-cxx98-cu124-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so deleted file mode 100755 index 76e8710b2a6d75f17d1c40c2ba116c096791c815..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu124-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e364773259dc1b91f3c0d3b076da83c5a9c6ee18ffdace30315c602dffd1dabe -size 2502264 diff --git a/build/torch26-cxx98-cu124-x86_64-linux/activation/_ops.py b/build/torch26-cxx98-cu124-x86_64-linux/activation/_ops.py index 6cfb9cfa80b63852c1a9a8641b25616ce4caffd8..0110324ade19f59f705c61d5c21912c958e92e96 100644 --- a/build/torch26-cxx98-cu124-x86_64-linux/activation/_ops.py +++ b/build/torch26-cxx98-cu124-x86_64-linux/activation/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _activation_e99cc09_dirty -ops = torch.ops._activation_e99cc09_dirty +from . import _activation_be5bedb +ops = torch.ops._activation_be5bedb def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_activation_e99cc09_dirty::{op_name}" \ No newline at end of file + return f"_activation_be5bedb::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py b/build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py index 403cc5bbda3495c3c06a976e26dc5839ecc495b5..45b31181ffb80509a85d729a7f7ee86fc2cf014a 100644 --- a/build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py +++ b/build/torch26-cxx98-cu124-x86_64-linux/activation/layers.py @@ -23,7 +23,8 @@ class SiluAndMul(nn.Module): ops.silu_and_mul(out, x) return out -class MulAndSilu(CustomOp): + +class MulAndSilu(nn.Module): """An activation function for SwiGLU. The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. 
@@ -37,11 +38,12 @@ class MulAndSilu(CustomOp): def forward(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 - output_shape = (x.shape[:-1] + (d, )) + output_shape = x.shape[:-1] + (d,) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - self.mul_and_silu(out, x) + ops.mul_and_silu(out, x) return out + class GeluAndMul(nn.Module): """An activation function for GeGLU. diff --git a/build/torch26-cxx98-cu126-x86_64-linux/activation/__init__.py b/build/torch26-cxx98-cu126-x86_64-linux/activation/__init__.py index ddb37490dad9d8ffcbeb13ed06b33f03fef8ed78..1c4f207354093c6ef83eb5d7f3a5a3b22b95d357 100644 --- a/build/torch26-cxx98-cu126-x86_64-linux/activation/__init__.py +++ b/build/torch26-cxx98-cu126-x86_64-linux/activation/__init__.py @@ -10,6 +10,11 @@ def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: return out +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: ops.gelu_and_mul(out, x) return out diff --git a/build/torch26-cxx98-cu126-x86_64-linux/activation/_activation_be5bedb.abi3.so b/build/torch26-cxx98-cu126-x86_64-linux/activation/_activation_be5bedb.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..516f085e9ac787a2454fb78975dbaec25d2a6576 --- /dev/null +++ b/build/torch26-cxx98-cu126-x86_64-linux/activation/_activation_be5bedb.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac7174352dea307231f308c84ca32ee001cdbcefd976de860e76501c52aae591 +size 2613776 diff --git a/build/torch26-cxx98-cu126-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so b/build/torch26-cxx98-cu126-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so deleted file mode 100755 index 92433b08d2ef878a9d6fc7dfd5281051412ea0b3..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7ac88cc0d3c65ab283d20608f3a097be29ee572e7856f10f8d7919536efd95b4 -size 2506808 diff --git a/build/torch26-cxx98-cu126-x86_64-linux/activation/_ops.py b/build/torch26-cxx98-cu126-x86_64-linux/activation/_ops.py index 6cfb9cfa80b63852c1a9a8641b25616ce4caffd8..0110324ade19f59f705c61d5c21912c958e92e96 100644 --- a/build/torch26-cxx98-cu126-x86_64-linux/activation/_ops.py +++ b/build/torch26-cxx98-cu126-x86_64-linux/activation/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _activation_e99cc09_dirty -ops = torch.ops._activation_e99cc09_dirty +from . import _activation_be5bedb +ops = torch.ops._activation_be5bedb def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_activation_e99cc09_dirty::{op_name}" \ No newline at end of file + return f"_activation_be5bedb::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu126-x86_64-linux/activation/layers.py b/build/torch26-cxx98-cu126-x86_64-linux/activation/layers.py index dea45935f51421e8ee87b05430c2e95840cb4ef8..45b31181ffb80509a85d729a7f7ee86fc2cf014a 100644 --- a/build/torch26-cxx98-cu126-x86_64-linux/activation/layers.py +++ b/build/torch26-cxx98-cu126-x86_64-linux/activation/layers.py @@ -5,6 +5,15 @@ from ._ops import ops class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -15,7 +24,36 @@ class SiluAndMul(nn.Module): return out +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -38,6 +76,17 @@ class GeluTanhAndMul(nn.Module): class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def __init__(self, threshold: float = 0.0): diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/activation/__init__.py index ddb37490dad9d8ffcbeb13ed06b33f03fef8ed78..1c4f207354093c6ef83eb5d7f3a5a3b22b95d357 100644 --- a/build/torch27-cxx11-cu118-x86_64-linux/activation/__init__.py +++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/__init__.py @@ -10,6 +10,11 @@ def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: return out +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: ops.gelu_and_mul(out, x) return out diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5155b241dff8af4302230c3ae23518cb41efa185 Binary files /dev/null and b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53b5508fec27cd0ece00b9b018694ba8da40c5ba Binary files /dev/null and b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7752cad4c2a06746b1a68c3637c7baef00bb5ddc Binary files /dev/null and b/build/torch27-cxx11-cu118-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git 
a/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..7d5463c37b3f4a3dec8b15df1a13168019fb26e3 --- /dev/null +++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aee7c6869a9e318ad81cb84460c58ca0dac2dc85f4ed739b12fe57641f766332 +size 2546984 diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so deleted file mode 100755 index 16e9df58edb8b6fe6885a0ed783306390db853b1..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu118-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e4f9e647eea40d3d3801d5ee57d4917e4c2e8dbfd87cdfebdc40b1b0a1c571fe -size 2448184 diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py b/build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py index 6cfb9cfa80b63852c1a9a8641b25616ce4caffd8..745e06b31cb5b9718d3b85236f4cc257459070d7 100644 --- a/build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py +++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _activation_e99cc09_dirty -ops = torch.ops._activation_e99cc09_dirty +from . import _activation_be5bedb_dirty +ops = torch.ops._activation_be5bedb_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_activation_e99cc09_dirty::{op_name}" \ No newline at end of file + return f"_activation_be5bedb_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py b/build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py index dea45935f51421e8ee87b05430c2e95840cb4ef8..45b31181ffb80509a85d729a7f7ee86fc2cf014a 100644 --- a/build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py +++ b/build/torch27-cxx11-cu118-x86_64-linux/activation/layers.py @@ -5,6 +5,15 @@ from ._ops import ops class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -15,7 +24,36 @@ class SiluAndMul(nn.Module): return out +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -38,6 +76,17 @@ class GeluTanhAndMul(nn.Module): class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def __init__(self, threshold: float = 0.0): diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/activation/__init__.py index ddb37490dad9d8ffcbeb13ed06b33f03fef8ed78..1c4f207354093c6ef83eb5d7f3a5a3b22b95d357 100644 --- a/build/torch27-cxx11-cu126-x86_64-linux/activation/__init__.py +++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/__init__.py @@ -10,6 +10,11 @@ def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: return out +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: ops.gelu_and_mul(out, x) return out diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b1fcc2dcde514cab92d358380824ca24616cd0b Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..665e89cb27b58c9caff761de28b7f6574cc2140e Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4602c567b14a674c4a56d0e1cf8ef073fbc50beb Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..94c38d99b9593469317fe894be35b069017b493e --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f075a6e0d47a2d382d16291b1c5d7d1d98111e2bbc5891b14b627e3c1778b699 +size 2621536 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so deleted file mode 100755 index 085ef09ae9488945275424ee7a507f0289143ed8..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu126-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:a2b72ff2a0f2253e4dfe028842b5f15cabf2647d7812bf4662a2de510ca0c489 -size 2518632 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py index 6cfb9cfa80b63852c1a9a8641b25616ce4caffd8..745e06b31cb5b9718d3b85236f4cc257459070d7 100644 --- a/build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py +++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _activation_e99cc09_dirty -ops = torch.ops._activation_e99cc09_dirty +from . import _activation_be5bedb_dirty +ops = torch.ops._activation_be5bedb_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_activation_e99cc09_dirty::{op_name}" \ No newline at end of file + return f"_activation_be5bedb_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu126-x86_64-linux/activation/layers.py b/build/torch27-cxx11-cu126-x86_64-linux/activation/layers.py index dea45935f51421e8ee87b05430c2e95840cb4ef8..45b31181ffb80509a85d729a7f7ee86fc2cf014a 100644 --- a/build/torch27-cxx11-cu126-x86_64-linux/activation/layers.py +++ b/build/torch27-cxx11-cu126-x86_64-linux/activation/layers.py @@ -5,6 +5,15 @@ from ._ops import ops class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -15,7 +24,36 @@ class SiluAndMul(nn.Module): return out +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -38,6 +76,17 @@ class GeluTanhAndMul(nn.Module): class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def __init__(self, threshold: float = 0.0): diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/activation/__init__.py index ddb37490dad9d8ffcbeb13ed06b33f03fef8ed78..1c4f207354093c6ef83eb5d7f3a5a3b22b95d357 100644 --- a/build/torch27-cxx11-cu128-x86_64-linux/activation/__init__.py +++ b/build/torch27-cxx11-cu128-x86_64-linux/activation/__init__.py @@ -10,6 +10,11 @@ def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: return out +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: ops.gelu_and_mul(out, x) return out diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe2206ed48c6e6b877620ac3db87af6ee49ddf07 Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a940427d39d1a12a0806315d03b02bdfed65a3d Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..725246ac4c8d6c4374d8250ea67f759a871b1c38 Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..e5c17e44367c005d1c9f8d6b391be8d49079b2fc --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc2406aa2fa09dd7bc1fd5e87cdcdf55edfc7e0853fad5f977e2500e08fa8899 +size 3565432 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so deleted file mode 100755 index ea1a9f1b610a4e3ca23afc5e13c26c3e0ef7758b..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu128-x86_64-linux/activation/_activation_e99cc09_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f4590c852899e4c11ddb74cfad61e26b07490a91f3c09e0fb0874a3fcc1f533e -size 3331456 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py index 6cfb9cfa80b63852c1a9a8641b25616ce4caffd8..745e06b31cb5b9718d3b85236f4cc257459070d7 100644 --- a/build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py +++ 
b/build/torch27-cxx11-cu128-x86_64-linux/activation/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _activation_e99cc09_dirty -ops = torch.ops._activation_e99cc09_dirty +from . import _activation_be5bedb_dirty +ops = torch.ops._activation_be5bedb_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_activation_e99cc09_dirty::{op_name}" \ No newline at end of file + return f"_activation_be5bedb_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-x86_64-linux/activation/layers.py b/build/torch27-cxx11-cu128-x86_64-linux/activation/layers.py index dea45935f51421e8ee87b05430c2e95840cb4ef8..45b31181ffb80509a85d729a7f7ee86fc2cf014a 100644 --- a/build/torch27-cxx11-cu128-x86_64-linux/activation/layers.py +++ b/build/torch27-cxx11-cu128-x86_64-linux/activation/layers.py @@ -5,6 +5,15 @@ from ._ops import ops class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -15,7 +24,36 @@ class SiluAndMul(nn.Module): return out +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + can_torch_compile: bool = True def forward(self, x: torch.Tensor): @@ -38,6 +76,17 @@ class GeluTanhAndMul(nn.Module): class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + can_torch_compile: bool = True def __init__(self, threshold: float = 0.0): diff --git a/build/torch28-cxx11-cu126-x86_64-linux/activation/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1c4f207354093c6ef83eb5d7f3a5a3b22b95d357 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/activation/__init__.py @@ -0,0 +1,57 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "layers", +] diff --git a/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5263d294bc5bc421b98d31436c896bbc244d0771 Binary files /dev/null and b/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb753a567265e3db8b71afceb9a4442139a6aea7 Binary files /dev/null and b/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6dd25df0a6c63b7315d2c0d9f4b3894ff1626fc8 Binary files /dev/null and b/build/torch28-cxx11-cu126-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so b/build/torch28-cxx11-cu126-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..40900ff2070ff72eb665fdd5fd78f12d3a287cd9 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c88e87951b92ea55313ef79a34d284cb2a23713d3bdafee735caa4fc955b9dcb +size 2610616 diff --git a/build/torch28-cxx11-cu126-x86_64-linux/activation/_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..745e06b31cb5b9718d3b85236f4cc257459070d7 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_be5bedb_dirty +ops = torch.ops._activation_be5bedb_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. 
+ """ + return f"_activation_be5bedb_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu126-x86_64-linux/activation/layers.py b/build/torch28-cxx11-cu126-x86_64-linux/activation/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..45b31181ffb80509a85d729a7f7ee86fc2cf014a --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/activation/layers.py @@ -0,0 +1,128 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch28-cxx11-cu128-x86_64-linux/activation/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1c4f207354093c6ef83eb5d7f3a5a3b22b95d357 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/activation/__init__.py @@ -0,0 +1,57 @@ +import torch + +from ._ops import ops + +from . import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "layers", +] diff --git a/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aedb284c8147a243ebfc99ec94000b62ae672077 Binary files /dev/null and b/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ae3e6d861e600db32e9024ae7db059642f35a3f Binary files /dev/null and b/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..51baab3cf4e592a2b8bed4cea0e9228a559b399d Binary files /dev/null and b/build/torch28-cxx11-cu128-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so b/build/torch28-cxx11-cu128-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..8b1ece63bdec0e63013816dae6bce9a87068f88e --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf784c7ab178c476fc6268efe820b1948c7c5b8f049c046c851b03067da5dd59 +size 3558616 diff --git a/build/torch28-cxx11-cu128-x86_64-linux/activation/_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..745e06b31cb5b9718d3b85236f4cc257459070d7 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_be5bedb_dirty +ops = torch.ops._activation_be5bedb_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_activation_be5bedb_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-x86_64-linux/activation/layers.py b/build/torch28-cxx11-cu128-x86_64-linux/activation/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..45b31181ffb80509a85d729a7f7ee86fc2cf014a --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/activation/layers.py @@ -0,0 +1,128 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out diff --git a/build/torch28-cxx11-cu129-x86_64-linux/activation/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/activation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1c4f207354093c6ef83eb5d7f3a5a3b22b95d357 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/activation/__init__.py @@ -0,0 +1,57 @@ +import torch + +from ._ops import ops + +from . 
import layers + + +def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.silu_and_mul(out, x) + return out + + +def mul_and_silu(out: torch.Tensor, x: torch.Tensor) -> None: + ops.mul_and_silu(out, x) + return out + + +def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_and_mul(out, x) + return out + + +def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_tanh_and_mul(out, x) + return out + + +def fatrelu_and_mul(out: torch.Tensor, x: torch.Tensor, threshold: float = 0.0) -> None: + ops.fatrelu_and_mul(out, x, threshold) + return out + + +def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_fast(out, x) + return out + + +def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_new(out, x) + return out + + +def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: + ops.gelu_quick(out, x) + return out + + +__all__ = [ + "silu_and_mul", + "gelu_and_mul", + "gelu_tanh_and_mul", + "fatrelu_and_mul", + "gelu_fast", + "gelu_new", + "gelu_quick", + "layers", +] diff --git a/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..01d30fced2b5392d0f6f4e6454cbe7d782a14daa Binary files /dev/null and b/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75b0e5f83e10b053d8584f2607d9a9f3009d45dc Binary files /dev/null and b/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/_ops.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6ed035d206ae523160771021be45010f234687e Binary files /dev/null and b/build/torch28-cxx11-cu129-x86_64-linux/activation/__pycache__/layers.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so b/build/torch28-cxx11-cu129-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..33fb245664d9daef5b07440b390db2c19ef404f1 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/activation/_activation_be5bedb_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e7cca3169eea8cbd67c61706d102548e49aadc936f8c2943efef3e7c4c0ee0d +size 3592400 diff --git a/build/torch28-cxx11-cu129-x86_64-linux/activation/_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/activation/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..745e06b31cb5b9718d3b85236f4cc257459070d7 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/activation/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _activation_be5bedb_dirty +ops = torch.ops._activation_be5bedb_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. 
+ """ + return f"_activation_be5bedb_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-x86_64-linux/activation/layers.py b/build/torch28-cxx11-cu129-x86_64-linux/activation/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..45b31181ffb80509a85d729a7f7ee86fc2cf014a --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/activation/layers.py @@ -0,0 +1,128 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class SiluAndMul(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + + +class MulAndSilu(nn.Module): + """An activation function for SwiGLU. + + The function computes x -> x[:d] * silu(x[d:]) where d = x.shape[-1] // 2. + + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.mul_and_silu(out, x) + return out + + +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. + + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + +class GeluTanhAndMul(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_tanh_and_mul(out, x) + return out + + +class FatreluAndMul(nn.Module): + """An activation function for FATReLU. + + The function computes x -> FATReLU(x[:d]) * x[d:] where + d = x.shape[-1] // 2. + This is used in openbmb/MiniCPM-S-1B-sft. 
+ + Shapes: + x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d) + return: (num_tokens, d) or (batch_size, seq_len, d) + """ + + can_torch_compile: bool = True + + def __init__(self, threshold: float = 0.0): + super().__init__() + self.threshold = threshold + + def forward(self, x: torch.Tensor): + d = x.shape[-1] // 2 + output_shape = x.shape[:-1] + (d,) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.fatrelu_and_mul(out, x, self.threshold) + return out + + +class FastGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_fast(out, x) + return out + + +class NewGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_new(out, x) + return out + + +class QuickGELU(nn.Module): + can_torch_compile: bool = True + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = torch.empty_like(x) + ops.gelu_quick(out, x) + return out
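
Since this patch only swaps the prebuilt `.so` binaries and replicates the same `layers.py` across build variants, the sketch below shows how the newly exposed MulAndSilu path could be exercised against an eager PyTorch reference. This is a minimal smoke test, not part of the diff: the `activation` import name, the availability of a CUDA device, the tensor shapes, and the fp16 tolerances are all assumptions.

# Hypothetical smoke test for the GLU-style layers in layers.py.
# Assumes the built package is importable as `activation` and a CUDA
# device is present; neither is guaranteed by this diff.
import torch
import torch.nn.functional as F

from activation import layers


def reference_silu_and_mul(x: torch.Tensor) -> torch.Tensor:
    # Eager reference: silu(x[..., :d]) * x[..., d:], with d = last dim // 2.
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]


def reference_mul_and_silu(x: torch.Tensor) -> torch.Tensor:
    # Eager reference: x[..., :d] * silu(x[..., d:]).
    d = x.shape[-1] // 2
    return x[..., :d] * F.silu(x[..., d:])


def main() -> None:
    # Shapes and dtype are illustrative only.
    x = torch.randn(8, 2 * 128, dtype=torch.float16, device="cuda")

    torch.testing.assert_close(
        layers.SiluAndMul()(x), reference_silu_and_mul(x), rtol=1e-2, atol=1e-3
    )
    torch.testing.assert_close(
        layers.MulAndSilu()(x), reference_mul_and_silu(x), rtol=1e-2, atol=1e-3
    )
    print("silu_and_mul / mul_and_silu match the eager reference")


if __name__ == "__main__":
    main()

As the docstrings above note, MulAndSilu differs from SiluAndMul only in which half of the projected tensor passes through SiLU before the elementwise product, so the same shape conventions and tolerance check apply to both layers.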