kernels-community/flash-attn3

danieldk (HF Staff) committed 2 days ago

Commit

bc10fdc

1 Parent(s): 48fe103

Prepare for Torch 2.8

Files changed (3) hide show

build.toml CHANGED Viewed

@@ -20,7 +20,6 @@ cuda-flags = [
   "--ftemplate-backtrace-limit=0",              # To debug template code
   "--use_fast_math",
   "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
-  "-DCUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1",
   "-DCUTLASS_ENABLE_GDC_FOR_SM90",
   "--expt-relaxed-constexpr",
   "--expt-extended-lambda",
@@ -53,7 +52,6 @@ cuda-flags = [
   "--ftemplate-backtrace-limit=0",              # To debug template code
   "--use_fast_math",
   "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
-  "-DCUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1",
   "-DCUTLASS_ENABLE_GDC_FOR_SM90",
   "--expt-relaxed-constexpr",
   "--expt-extended-lambda",
@@ -202,7 +200,6 @@ cuda-flags = [
   "--ftemplate-backtrace-limit=0",              # To debug template code
   "--use_fast_math",
   "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
-  "-DCUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1",
   "-DCUTLASS_ENABLE_GDC_FOR_SM90",
   "--expt-relaxed-constexpr",
   "--expt-extended-lambda",
@@ -551,7 +548,6 @@ depends = ["torch", "cutlass_3_9"]
 #   "--ftemplate-backtrace-limit=0",              # To debug template code
 #   "--use_fast_math",
 #   "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
-#   "-DCUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1",
 #   "-DCUTLASS_ENABLE_GDC_FOR_SM90",
 #   "--expt-relaxed-constexpr",
 #   "--expt-extended-lambda",

   "--ftemplate-backtrace-limit=0",              # To debug template code
   "--use_fast_math",
   "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
   "-DCUTLASS_ENABLE_GDC_FOR_SM90",
   "--expt-relaxed-constexpr",
   "--expt-extended-lambda",
   "--ftemplate-backtrace-limit=0",              # To debug template code
   "--use_fast_math",
   "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
   "-DCUTLASS_ENABLE_GDC_FOR_SM90",
   "--expt-relaxed-constexpr",
   "--expt-extended-lambda",
   "--ftemplate-backtrace-limit=0",              # To debug template code
   "--use_fast_math",
   "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
   "-DCUTLASS_ENABLE_GDC_FOR_SM90",
   "--expt-relaxed-constexpr",
   "--expt-extended-lambda",
 #   "--ftemplate-backtrace-limit=0",              # To debug template code
 #   "--use_fast_math",
 #   "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
 #   "-DCUTLASS_ENABLE_GDC_FOR_SM90",
 #   "--expt-relaxed-constexpr",
 #   "--expt-extended-lambda",

flake.lock CHANGED Viewed

@@ -73,11 +73,11 @@
         "nixpkgs": "nixpkgs"
       },
       "locked": {
-        "lastModified": 1750234878,
-        "narHash": "sha256-q9DRC9zdpzUf88qqg1qbhP1qgJbE2cMtn8oUmosuyT8=",
         "owner": "huggingface",
         "repo": "hf-nix",
-        "rev": "c7132f90763d756da3e77da62e01be0a4546dc57",
         "type": "github"
       },
       "original": {
@@ -98,32 +98,33 @@
         ]
       },
       "locked": {
-        "lastModified": 1751014803,
-        "narHash": "sha256-9Xfq2k3uPfB602NwQF+zAY2GQZiKUN1G7Q6XiDCUR8Y=",
         "owner": "huggingface",
         "repo": "kernel-builder",
-        "rev": "bbc4e712ff2046e217818e97de2201e2b996756e",
         "type": "github"
       },
       "original": {
         "owner": "huggingface",
         "repo": "kernel-builder",
         "type": "github"
       }
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1747820358,
-        "narHash": "sha256-fTqsZsUX6M3yeEvgyQvXcbGmT2CaRVyVwsi8eK29Oj4=",
-        "owner": "danieldk",
         "repo": "nixpkgs",
-        "rev": "d3c1681180717528068082103bf323147de6ab0b",
         "type": "github"
       },
       "original": {
-        "owner": "danieldk",
-        "ref": "cudatoolkit-12.9-kernel-builder",
         "repo": "nixpkgs",
         "type": "github"
       }
     },

         "nixpkgs": "nixpkgs"
       },
       "locked": {
+        "lastModified": 1753354560,
+        "narHash": "sha256-vmOfRmr0Qm/IbZTWB2sBn+UFrABSTTA/cTg+m27Yt/E=",
         "owner": "huggingface",
         "repo": "hf-nix",
+        "rev": "7f2aceda2a2e72cd573bdb25e5c0667fd75f89d3",
         "type": "github"
       },
       "original": {
         ]
       },
       "locked": {
+        "lastModified": 1753354632,
+        "narHash": "sha256-31SX3Raiyx0qCuY9JSlx9ZZgxljeUxvW+JdujjxbofQ=",
         "owner": "huggingface",
         "repo": "kernel-builder",
+        "rev": "524b628fd8e58525dbd28455bffb0628092c5265",
         "type": "github"
       },
       "original": {
         "owner": "huggingface",
+        "ref": "torch-2.8",
         "repo": "kernel-builder",
         "type": "github"
       }
     },
     "nixpkgs": {
       "locked": {
+        "lastModified": 1752785354,
+        "narHash": "sha256-Y33ryUz7MPqKrZwlbQcsYCUz2jAJCacRf8jbs0tYUlA=",
+        "owner": "nixos",
         "repo": "nixpkgs",
+        "rev": "d38025438a6ee456758dc03188ca6873a415463b",
         "type": "github"
       },
       "original": {
+        "owner": "nixos",
         "repo": "nixpkgs",
+        "rev": "d38025438a6ee456758dc03188ca6873a415463b",
         "type": "github"
       }
     },

flake.nix CHANGED Viewed

@@ -2,7 +2,7 @@
   description = "Flake for Hopper Flash Attention kernel";
   inputs = {
-    kernel-builder.url = "github:huggingface/kernel-builder";
   };
   outputs =
@@ -21,21 +21,7 @@
       # by hand (which works fine thanks to backward compat).
       torchVersions = [
         {
-          torchVersion = "2.6";
-          cudaVersion = "12.4";
-          cxx11Abi = false;
-          systems = [ "x86_64-linux" ];
-          upstreamVariant = true;
-        }
-        {
-          torchVersion = "2.6";
-          cudaVersion = "12.4";
-          cxx11Abi = true;
-          systems = [ "x86_64-linux" ];
-          upstreamVariant = true;
-        }
-        {
-          torchVersion = "2.7";
           cudaVersion = "12.4";
           cxx11Abi = true;
           systems = [

   description = "Flake for Hopper Flash Attention kernel";
   inputs = {
+    kernel-builder.url = "github:huggingface/kernel-builder/torch-2.8";
   };
   outputs =
       # by hand (which works fine thanks to backward compat).
       torchVersions = [
         {
+          torchVersion = "2.8";
           cudaVersion = "12.4";
           cxx11Abi = true;
           systems = [