kernel
build/torch-universal/rotary/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .triton_rotary import apply_rotary
+
+ __all__ = ["apply_rotary"]
build/torch-universal/rotary/_ops.py ADDED
@@ -0,0 +1,8 @@
+ import torch
+ ops = torch.ops._rotary_202507301320
+
+ def add_op_namespace_prefix(op_name: str) -> str:
+     """
+     Prefix the op name with the kernel's private namespace.
+     """
+     return f"_rotary_202507301320::{op_name}"
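A brief, hypothetical sketch (not part of the diff) of how `add_op_namespace_prefix` could be used when registering an op under the same private namespace. It assumes `build/torch-universal` is on `sys.path` and PyTorch >= 2.4 for `torch.library.custom_op`; the `noop` op below is purely illustrative and not defined anywhere in this repo.

    import torch
    from rotary._ops import add_op_namespace_prefix

    # add_op_namespace_prefix("noop") -> "_rotary_202507301320::noop"
    @torch.library.custom_op(add_op_namespace_prefix("noop"), mutates_args=())
    def noop(x: torch.Tensor) -> torch.Tensor:
        # Illustrative op body; a real kernel would do actual work here.
        return x.clone()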
build/torch-universal/rotary/triton_rotary.py ADDED
@@ -0,0 +1,144 @@
+ import torch
+ import triton
+ import triton.language as tl
+
+ @triton.jit
+ def _rotary_kernel(
+     X1_ptr, X2_ptr, COS_ptr, SIN_ptr, OUT1_ptr, OUT2_ptr,
+     stride_x1_b, stride_x1_s, stride_x1_h, stride_x1_d,
+     stride_x2_b, stride_x2_s, stride_x2_h, stride_x2_d,
+     stride_cos_s, stride_cos_d,
+     stride_sin_s, stride_sin_d,
+     stride_o1_b, stride_o1_s, stride_o1_h, stride_o1_d,
+     stride_o2_b, stride_o2_s, stride_o2_h, stride_o2_d,
+     seq_len, num_heads, headdim,
+     IS_CONJ: tl.constexpr,
+     BLOCK_SIZE_D: tl.constexpr,
+     BLOCK_M: tl.constexpr,
+     BLOCK_H: tl.constexpr,
+ ):
+     """
+     Triton kernel for applying rotary position embedding.
+     """
+     # Get program IDs
+     pid_b = tl.program_id(0)
+     pid_s_block = tl.program_id(1)
+     pid_h_block = tl.program_id(2)
+
+     # Create block offsets
+     offs_d = tl.arange(0, BLOCK_SIZE_D)
+     offs_s = pid_s_block * BLOCK_M + tl.arange(0, BLOCK_M)
+     offs_h = pid_h_block * BLOCK_H + tl.arange(0, BLOCK_H)
+
+     # Pointers for x1, x2, out1, out2
+     x1_ptrs = X1_ptr + pid_b * stride_x1_b + \
+         (offs_s[:, None, None] * stride_x1_s + \
+          offs_h[None, :, None] * stride_x1_h + \
+          offs_d[None, None, :] * stride_x1_d)
+
+     x2_ptrs = X2_ptr + pid_b * stride_x2_b + \
+         (offs_s[:, None, None] * stride_x2_s + \
+          offs_h[None, :, None] * stride_x2_h + \
+          offs_d[None, None, :] * stride_x2_d)
+
+     o1_ptrs = OUT1_ptr + pid_b * stride_o1_b + \
+         (offs_s[:, None, None] * stride_o1_s + \
+          offs_h[None, :, None] * stride_o1_h + \
+          offs_d[None, None, :] * stride_o1_d)
+
+     o2_ptrs = OUT2_ptr + pid_b * stride_o2_b + \
+         (offs_s[:, None, None] * stride_o2_s + \
+          offs_h[None, :, None] * stride_o2_h + \
+          offs_d[None, None, :] * stride_o2_d)
+
+     # Pointers for cos, sin
+     cos_ptrs = COS_ptr + \
+         (offs_s[:, None, None] * stride_cos_s + \
+          offs_d[None, None, :] * stride_cos_d)
+     sin_ptrs = SIN_ptr + \
+         (offs_s[:, None, None] * stride_sin_s + \
+          offs_d[None, None, :] * stride_sin_d)
+
+     # Masks for the last blocks when dimensions are not multiples of the block sizes
+     mask_s = offs_s < seq_len
+     mask_h = offs_h < num_heads
+     mask_d = offs_d < headdim
+
+     # Combined mask for all tensors: [BLOCK_M, BLOCK_H, BLOCK_SIZE_D]
+     mask = mask_s[:, None, None] & mask_h[None, :, None] & mask_d[None, None, :]
+     mask_cs = mask_s[:, None, None] & mask_d[None, None, :]
+
+     # Load data (compute in float32 for accuracy)
+     x1 = tl.load(x1_ptrs, mask=mask, other=0.0).to(tl.float32)
+     x2 = tl.load(x2_ptrs, mask=mask, other=0.0).to(tl.float32)
+     cos = tl.load(cos_ptrs, mask=mask_cs, other=0.0).to(tl.float32)
+     sin = tl.load(sin_ptrs, mask=mask_cs, other=0.0).to(tl.float32)
+
+     # Perform the rotary transformation
+     if IS_CONJ:
+         out1 = x1 * cos + x2 * sin
+         out2 = -x1 * sin + x2 * cos
+     else:
+         out1 = x1 * cos - x2 * sin
+         out2 = x1 * sin + x2 * cos
+
+     # Store results (values are cast to the output dtype on store)
+     tl.store(o1_ptrs, out1, mask=mask)
+     tl.store(o2_ptrs, out2, mask=mask)
+
+
+ def apply_rotary(
+     x1: torch.Tensor,
+     x2: torch.Tensor,
+     cos: torch.Tensor,
+     sin: torch.Tensor,
+     out1: torch.Tensor,
+     out2: torch.Tensor,
+     conj: bool = False,
+ ):
+     """
+     Applies rotary position embedding to the input tensors.
+
+     Args:
+         x1, x2: Input tensors of shape [batch_size, seq_len, num_heads, headdim] or [num_tokens, num_heads, headdim].
+         cos, sin: Cosine and sine tensors of shape [seq_len, 1, rotary_dim] (or [num_tokens, 1, rotary_dim] for 3D inputs).
+         out1, out2: Output tensors. May alias x1, x2 for an in-place update.
+         conj: If True, applies the conjugate (inverse) rotation.
+     """
+     # Shape checks
+     assert x1.shape == x2.shape and out1.shape == out2.shape and x1.shape == out1.shape
+     assert cos.shape == sin.shape
+     assert cos.dim() == 3
+     assert x1.device == x2.device == cos.device == sin.device == out1.device == out2.device
+
+     # Reshape to 4D if necessary
+     if x1.dim() == 3:  # (num_tokens, num_heads, headdim)
+         x1, x2 = x1.unsqueeze(0), x2.unsqueeze(0)
+         out1, out2 = out1.unsqueeze(0), out2.unsqueeze(0)
+     elif x1.dim() != 4:
+         raise ValueError("Input tensors must be 3D or 4D")
+
+     batch_size, seq_len, num_heads, headdim = x1.shape
+
+     # Triton grid
+     BLOCK_M = 8 if headdim <= 128 else 4
+     BLOCK_H = 2
+     grid = (batch_size, triton.cdiv(seq_len, BLOCK_M), triton.cdiv(num_heads, BLOCK_H))
+
+     # Use the smallest power of 2 that is >= headdim as BLOCK_SIZE_D
+     BLOCK_SIZE_D = triton.next_power_of_2(headdim)
+
+     _rotary_kernel[grid](
+         x1, x2, cos, sin, out1, out2,
+         x1.stride(0), x1.stride(1), x1.stride(2), x1.stride(3),
+         x2.stride(0), x2.stride(1), x2.stride(2), x2.stride(3),
+         cos.stride(0), cos.stride(2),
+         sin.stride(0), sin.stride(2),
+         out1.stride(0), out1.stride(1), out1.stride(2), out1.stride(3),
+         out2.stride(0), out2.stride(1), out2.stride(2), out2.stride(3),
+         seq_len, num_heads, headdim,
+         IS_CONJ=conj,
+         BLOCK_SIZE_D=BLOCK_SIZE_D,
+         BLOCK_M=BLOCK_M,
+         BLOCK_H=BLOCK_H,
+     )
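For reference, a minimal usage sketch (not part of the diff) of calling `apply_rotary` directly. It assumes the built package is importable as `rotary` and that a CUDA device is available; shapes follow the docstring above, and passing `x1`/`x2` as the outputs performs the rotation in place, mirroring how the tests invoke the kernel.

    import torch
    from rotary import apply_rotary

    batch, seqlen, nheads, rotary_dim = 2, 128, 8, 32
    x1 = torch.randn(batch, seqlen, nheads, rotary_dim, device="cuda")
    x2 = torch.randn(batch, seqlen, nheads, rotary_dim, device="cuda")
    theta = torch.randn(seqlen, 1, rotary_dim, device="cuda")
    cos, sin = torch.cos(theta), torch.sin(theta)

    # In-place rotation: outputs alias the inputs.
    apply_rotary(x1, x2, cos, sin, x1, x2, conj=False)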
tests/__init__.py ADDED
(empty file)
tests/test_rotary.py ADDED
@@ -0,0 +1,126 @@
+ import pytest
+ import torch
+
+ from tests.utils import infer_device, supports_bfloat16
+ from kernels import get_local_kernel
+ from pathlib import Path
+
+ # from transformers.trainer_utils import set_seed
+ # set_seed(42)
+
+ # Path to the local kernel repo (relative to this file)
+ repo_path = Path(__file__).parent.parent
+
+ def apply_rotary_torch(x1: torch.Tensor, x2: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, conj: bool = False):
+     assert x1.shape == x2.shape, "x1 and x2 must have the same shape"
+
+     if not conj:
+         out1 = x1 * cos - x2 * sin
+         out2 = x1 * sin + x2 * cos
+     else:
+         out1 = x1 * cos + x2 * sin
+         out2 = -x1 * sin + x2 * cos
+     return out1, out2
+
+
+ def apply_rotary_torch_wrapper(q, k, cos, sin, conj: bool = False):
+     """Reference wrapper around apply_rotary_torch."""
+     rotary_dim = cos.shape[-1]
+
+     # Apply the rotary encoding to Q
+     q1 = q[..., :rotary_dim]
+     q2 = q[..., rotary_dim : 2 * rotary_dim]
+     q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj)
+     q_out = torch.cat([q_out_1, q_out_2, q[..., 2 * rotary_dim:]], dim=-1)
+
+     # Apply the rotary encoding to K
+     k1 = k[..., :rotary_dim]
+     k2 = k[..., rotary_dim : 2 * rotary_dim]
+     k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj)
+     k_out = torch.cat([k_out_1, k_out_2, k[..., 2 * rotary_dim:]], dim=-1)
+
+     return q_out, k_out
+
+
+ def apply_rotary_kernel_wrapper(q, k, cos, sin, conj: bool = False):
+     """Wrapper around the Triton kernel; rotates q and k in place."""
+     rotary = get_local_kernel(repo_path=repo_path, package_name="rotary")
+     rotary_dim = cos.shape[-1]
+
+     # Apply the rotary encoding to Q
+     q1 = q[..., :rotary_dim]
+     q2 = q[..., rotary_dim : 2 * rotary_dim]
+     rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj)
+
+     # Apply the rotary encoding to K
+     k1 = k[..., :rotary_dim]
+     k2 = k[..., rotary_dim : 2 * rotary_dim]
+     rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj)
+
+
+ @pytest.mark.parametrize("batch_size", [1, 2])
+ @pytest.mark.parametrize("nheads", [8, 16])
+ @pytest.mark.parametrize("seqlen", [128, 256])
+ @pytest.mark.parametrize("headdim, rotary_dim", [(64, 32), (128, 64), (64, 30)])
+ @pytest.mark.parametrize("qk_dim", [3, 4])
+ @pytest.mark.parametrize(
+     "dtype, atol, rtol",
+     [
+         (torch.float32, 1e-5, 1e-5),
+         pytest.param(
+             torch.bfloat16,
+             1e-1,
+             1e-5,
+             marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
+         ),
+     ],
+ )
+ @pytest.mark.parametrize("conj", [False, True])
+ @pytest.mark.flaky(max_runs=2, min_passes=1)
+ def test_rotary_equivalence(batch_size, nheads, seqlen, headdim, rotary_dim, qk_dim, dtype, atol, rtol, conj):
+     device = infer_device()
+     if device is None:
+         pytest.skip("No suitable device found for testing")
+
+     if qk_dim == 4:
+         q_shape = (batch_size, seqlen, nheads, headdim)
+         cos_sin_shape = (seqlen, 1, rotary_dim)
+     elif qk_dim == 3:
+         q_shape = (batch_size * seqlen, nheads, headdim)
+         cos_sin_shape = (batch_size * seqlen, 1, rotary_dim)
+
+     q_orig = torch.randn(q_shape, device=device, dtype=dtype)
+     k_orig = torch.randn(q_shape, device=device, dtype=dtype)
+     cos = torch.randn(cos_sin_shape, device=device, dtype=dtype)
+     sin = torch.randn(cos_sin_shape, device=device, dtype=dtype)
+
+     q_kernel, k_kernel = q_orig.clone(), k_orig.clone()
+     q_torch, k_torch = q_orig.clone(), k_orig.clone()
+
+     q_torch_out, k_torch_out = apply_rotary_torch_wrapper(q_torch, k_torch, cos, sin, conj)
+     apply_rotary_kernel_wrapper(q_kernel, k_kernel, cos, sin, conj)
+
+     # Verify that the rotated Q and K match the reference implementation
+     try:
+         assert torch.allclose(q_torch_out, q_kernel, atol=atol, rtol=rtol), "Rotary transformation results for Q do not match"
+     except AssertionError:
+         diff_q = torch.abs(q_torch_out - q_kernel)
+         max_diff_q = torch.max(diff_q)
+         print(f"Max difference for Q: {max_diff_q}")
+         raise
+     try:
+         assert torch.allclose(k_torch_out, k_kernel, atol=atol, rtol=rtol), "Rotary transformation results for K do not match"
+     except AssertionError:
+         diff_k = torch.abs(k_torch_out - k_kernel)
+         max_diff_k = torch.max(diff_k)
+         print(f"Max difference for K: {max_diff_k}")
+         raise
+
+     # Verify that the non-rotated part of Q and K is unchanged
+     if (2 * rotary_dim) < headdim:
+         assert torch.equal(
+             q_kernel[..., 2 * rotary_dim:], q_orig[..., 2 * rotary_dim:]
+         ), "Non-rotated part of Q should be unchanged"
+         assert torch.equal(
+             k_kernel[..., 2 * rotary_dim:], k_orig[..., 2 * rotary_dim:]
+         ), "Non-rotated part of K should be unchanged"
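One way to read the reference implementation above (a sketch, not part of the diff): the pairwise rotation is complex multiplication of x1 + i*x2 by cos + i*sin, and the conjugate mode multiplies by cos - i*sin instead. A quick check of that identity for the non-conjugate case:

    import torch

    # Illustration only: the reference rotation equals complex multiplication.
    x1, x2 = torch.randn(8), torch.randn(8)
    theta = torch.randn(8)
    cos, sin = torch.cos(theta), torch.sin(theta)

    z = torch.complex(x1, x2) * torch.complex(cos, sin)
    out1, out2 = x1 * cos - x2 * sin, x1 * sin + x2 * cos
    assert torch.allclose(z.real, out1) and torch.allclose(z.imag, out2)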
tests/utils.py ADDED
@@ -0,0 +1,23 @@
+ import torch
+
+
+ def infer_device():
+     """
+     Return the name of the first available accelerator device.
+     """
+     if torch.cuda.is_available():  # Works for both Nvidia and AMD GPUs
+         return "cuda"
+     elif torch.xpu.is_available():
+         return "xpu"
+     else:
+         return None
+
+
+ def supports_bfloat16():
+     device = infer_device()
+     if device == "cuda":
+         return torch.cuda.get_device_capability() >= (8, 0)  # Ampere and newer
+     elif device == "xpu":
+         return True
+     else:
+         return False