# Build manifest for the paged_attention kernel extension.

[general]
name = "paged_attention"
universal = false

# Torch extension bindings shared by all backends.
[torch]
src = [
  "torch-ext/torch_binding.cpp",
  "torch-ext/torch_binding.h",
]

# CUDA utility sources.
[kernel.cuda_utils]
backend = "cuda"
src = [
  "cuda-utils/cuda_utils.h",
  "cuda-utils/cuda_utils_kernels.cu",
]
depends = []

# The same utility sources built for ROCm; rocm-archs lists the
# supported GPU architectures.
[kernel.cuda_utils_rocm]
backend = "rocm"
rocm-archs = [
  "gfx906",
  "gfx908",
  "gfx90a",
  "gfx940",
  "gfx941",
  "gfx942",
  "gfx1030",
  "gfx1100",
  "gfx1101",
]
src = [
  "cuda-utils/cuda_utils.h",
  "cuda-utils/cuda_utils_kernels.cu",
]
depends = ["torch"]

# Paged attention kernels (v1 and v2), cache kernels, and FP8
# quantization utilities for the CUDA backend.
[kernel.paged_attention]
backend = "cuda"
src = [
  "cuda-utils/cuda_utils.h",
  "paged-attention/attention/attention_dtypes.h",
  "paged-attention/attention/attention_generic.cuh",
  "paged-attention/attention/attention_kernels.cuh",
  "paged-attention/attention/attention_utils.cuh",
  "paged-attention/attention/dtype_bfloat16.cuh",
  "paged-attention/attention/dtype_float16.cuh",
  "paged-attention/attention/dtype_float32.cuh",
  "paged-attention/attention/dtype_fp8.cuh",
  "paged-attention/attention/paged_attention_v1.cu",
  "paged-attention/attention/paged_attention_v2.cu",
  "paged-attention/cache_kernels.cu",
  "paged-attention/cuda_compat.h",
  "paged-attention/dispatch_utils.h",
  "paged-attention/quantization/fp8/amd/quant_utils.cuh",
  "paged-attention/quantization/fp8/nvidia/quant_utils.cuh",
]
include = ["cuda-utils", "paged-attention"]
depends = ["torch"]

# The same kernel sources built for the ROCm backend.
[kernel.paged_attention_rocm]
backend = "rocm"
rocm-archs = [
  "gfx906",
  "gfx908",
  "gfx90a",
  "gfx940",
  "gfx941",
  "gfx942",
  "gfx1030",
  "gfx1100",
  "gfx1101",
]
src = [
  "cuda-utils/cuda_utils.h",
  "paged-attention/attention/attention_dtypes.h",
  "paged-attention/attention/attention_generic.cuh",
  "paged-attention/attention/attention_kernels.cuh",
  "paged-attention/attention/attention_utils.cuh",
  "paged-attention/attention/dtype_bfloat16.cuh",
  "paged-attention/attention/dtype_float16.cuh",
  "paged-attention/attention/dtype_float32.cuh",
  "paged-attention/attention/dtype_fp8.cuh",
  "paged-attention/attention/paged_attention_v1.cu",
  "paged-attention/attention/paged_attention_v2.cu",
  "paged-attention/cache_kernels.cu",
  "paged-attention/cuda_compat.h",
  "paged-attention/dispatch_utils.h",
  "paged-attention/quantization/fp8/amd/quant_utils.cuh",
  "paged-attention/quantization/fp8/nvidia/quant_utils.cuh",
]
include = ["cuda-utils", "paged-attention"]
depends = ["torch"]

# Metal backend: shader sources (.metal) plus Objective-C++ host code (.mm).
[kernel.paged_attention_metal]
backend = "metal"
src = [
  "paged-attention-metal/attention/paged_attention.metal",
  "paged-attention-metal/cache/copy_blocks.metal",
  "paged-attention-metal/cache/reshape_and_cache.metal",
  "paged-attention-metal/convert_fp8.metal",
  "paged-attention-metal/float8.metal",
  "paged-attention-metal/utils.metal",
  "paged-attention-metal/paged_attention.mm",
  "paged-attention-metal/cache.mm",
  "paged-attention-metal/convert_fp8.mm",
  "paged-attention-metal/device.mm",
]
include = ["."]
depends = ["torch"]