Support HSDP

by iamwyldecat - opened 13 days ago

base: refs/heads/main

←

from: refs/pr/4

Discussion Files changed

+384

-204

Files changed (46) hide show

build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_1f13dae_dirty.abi3.so → _optimizer_2dc97a1_dirty.abi3.so} +1 -1
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +33 -15
build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
build/{torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_2dc97a1_dirty.abi3.so} +1 -1
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +33 -15
build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
build/{torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so → torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_2dc97a1_dirty.abi3.so} +1 -1
build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py +33 -15
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_1f13dae_dirty.abi3.so → _optimizer_2dc97a1_dirty.abi3.so} +1 -1
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py +33 -15
build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
build/{torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_2dc97a1_dirty.abi3.so} +1 -1
build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py +33 -15
build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
build/{torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_2dc97a1_dirty.abi3.so} +1 -1
build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py +33 -15
build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py +3 -3
build/{torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so → torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_2dc97a1_dirty.abi3.so} +1 -1
build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py +33 -15
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_1f13dae_dirty.abi3.so → _optimizer_2dc97a1_dirty.abi3.so} +1 -1
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py +33 -15
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py +3 -3
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/{_optimizer_1f13dae_dirty.abi3.so → _optimizer_2dc97a1_dirty.abi3.so} +1 -1
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py +33 -15
torch-ext/optimizer/muon.py +33 -15

build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc and b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ

build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc and b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ

build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_1f13dae_dirty
-ops = torch.ops._optimizer_1f13dae_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_1f13dae_dirty::{op_name}"

 import torch
+from . import _optimizer_2dc97a1_dirty
+ops = torch.ops._optimizer_2dc97a1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_2dc97a1_dirty::{op_name}"

build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_1f13dae_dirty.abi3.so → _optimizer_2dc97a1_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7dc5f8a57aa60483209dfcbb0c7cc0e54f1739d643145c1e685fbe2b6675ac43
 size 1787368

 version https://git-lfs.github.com/spec/v1
+oid sha256:9112c8dde01baefa0e3130e143288cd3073ccbab47369a6dc925ce0d35400c6d
 size 1787368

build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 import torch
 import torch.distributed as dist
-from torch.distributed._tensor import DTensor, Replicate
 # This code snippet is a modified version adapted from the following GitHub repositories:
@@ -50,15 +50,16 @@ class _muon_state:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
-    mesh = g.device_mesh
     if rank == state.worker_rank:
-        gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())]
     else:
         gather_list = None
@@ -67,7 +68,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
@@ -105,14 +106,14 @@ def _compute_u(state, steps, rank, compute_stream):
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
-    mesh = p.device_mesh
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
-            scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
         else:
             scatter_list = None
@@ -121,7 +122,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             # Clear u to free memory
@@ -129,7 +130,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
         u = DTensor.from_local(
             u,
             placements=p.placements,
-            device_mesh=mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
@@ -235,6 +236,23 @@ class Muon(torch.optim.Optimizer):
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -259,20 +277,20 @@ class Muon(torch.optim.Optimizer):
         round_robin = 0
         mesh = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
-                if mesh.ndim != 1:
-                    raise NotImplementedError(
-                        "Muon requires a 1D mesh for distributed training yet."
-                    )
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
-            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
-            round_robin = (round_robin + 1) % mesh.mesh.numel()
         return param_to_state, ordered_params
@@ -372,7 +390,7 @@ class Muon(torch.optim.Optimizer):
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
-        chunk_size = params[0].device_mesh.mesh.numel()
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

 import torch
 import torch.distributed as dist
+from torch.distributed._tensor import DTensor, Replicate, Shard
 # This code snippet is a modified version adapted from the following GitHub repositories:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
+    process_group = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
     if rank == state.worker_rank:
+        num_ranks = dist.get_world_size(group=state.process_group)
+        gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
     else:
         gather_list = None
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
+            num_ranks = dist.get_world_size(group=state.process_group)
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
+            scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
         else:
             scatter_list = None
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             # Clear u to free memory
         u = DTensor.from_local(
             u,
             placements=p.placements,
+            device_mesh=p.device_mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
+    def get_shard_mesh(self, p, rank):
+        """
+        Get the shard mesh for a parameter p on the given rank.
+        """
+        assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
+        if p.placements == (Shard(dim=0),):
+            # Case for FSDP
+            return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
+        elif p.placements == (Replicate(), Shard(dim=0)):
+            # Case for HSDP
+            for i, shard_mesh in enumerate(p.device_mesh.mesh):
+                if rank in shard_mesh:
+                    return shard_mesh, p.device_mesh.get_group(mesh_dim=1)
+        else:
+            raise ValueError(f"Unsupported placements ({p.placements}).")
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
         round_robin = 0
         mesh = None
+        shard_mesh = None
+        process_group = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
+                shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
+            param_to_state[id(p)].process_group = process_group
+            round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
+        chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc and b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ

build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc and b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ

build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_1f13dae_dirty
-ops = torch.ops._optimizer_1f13dae_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_1f13dae_dirty::{op_name}"

 import torch
+from . import _optimizer_2dc97a1_dirty
+ops = torch.ops._optimizer_2dc97a1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_2dc97a1_dirty::{op_name}"

build/{torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_2dc97a1_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a082b5629efc4e9b8ce608713665d47904949b5d220dad350049bc806d58ecd7
 size 1824256

 version https://git-lfs.github.com/spec/v1
+oid sha256:0449cd352f44c3e848d1f9c847b00bf576673b4fef2a954ec8bd8d2524b8353a
 size 1824256

build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 import torch
 import torch.distributed as dist
-from torch.distributed._tensor import DTensor, Replicate
 # This code snippet is a modified version adapted from the following GitHub repositories:
@@ -50,15 +50,16 @@ class _muon_state:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
-    mesh = g.device_mesh
     if rank == state.worker_rank:
-        gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())]
     else:
         gather_list = None
@@ -67,7 +68,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
@@ -105,14 +106,14 @@ def _compute_u(state, steps, rank, compute_stream):
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
-    mesh = p.device_mesh
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
-            scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
         else:
             scatter_list = None
@@ -121,7 +122,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             # Clear u to free memory
@@ -129,7 +130,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
         u = DTensor.from_local(
             u,
             placements=p.placements,
-            device_mesh=mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
@@ -235,6 +236,23 @@ class Muon(torch.optim.Optimizer):
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -259,20 +277,20 @@ class Muon(torch.optim.Optimizer):
         round_robin = 0
         mesh = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
-                if mesh.ndim != 1:
-                    raise NotImplementedError(
-                        "Muon requires a 1D mesh for distributed training yet."
-                    )
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
-            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
-            round_robin = (round_robin + 1) % mesh.mesh.numel()
         return param_to_state, ordered_params
@@ -372,7 +390,7 @@ class Muon(torch.optim.Optimizer):
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
-        chunk_size = params[0].device_mesh.mesh.numel()
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

 import torch
 import torch.distributed as dist
+from torch.distributed._tensor import DTensor, Replicate, Shard
 # This code snippet is a modified version adapted from the following GitHub repositories:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
+    process_group = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
     if rank == state.worker_rank:
+        num_ranks = dist.get_world_size(group=state.process_group)
+        gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
     else:
         gather_list = None
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
+            num_ranks = dist.get_world_size(group=state.process_group)
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
+            scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
         else:
             scatter_list = None
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             # Clear u to free memory
         u = DTensor.from_local(
             u,
             placements=p.placements,
+            device_mesh=p.device_mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
+    def get_shard_mesh(self, p, rank):
+        """
+        Get the shard mesh for a parameter p on the given rank.
+        """
+        assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
+        if p.placements == (Shard(dim=0),):
+            # Case for FSDP
+            return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
+        elif p.placements == (Replicate(), Shard(dim=0)):
+            # Case for HSDP
+            for i, shard_mesh in enumerate(p.device_mesh.mesh):
+                if rank in shard_mesh:
+                    return shard_mesh, p.device_mesh.get_group(mesh_dim=1)
+        else:
+            raise ValueError(f"Unsupported placements ({p.placements}).")
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
         round_robin = 0
         mesh = None
+        shard_mesh = None
+        process_group = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
+                shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
+            param_to_state[id(p)].process_group = process_group
+            round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
+        chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc and b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ

build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc and b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ

build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_1f13dae_dirty
-ops = torch.ops._optimizer_1f13dae_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_1f13dae_dirty::{op_name}"

 import torch
+from . import _optimizer_2dc97a1_dirty
+ops = torch.ops._optimizer_2dc97a1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_2dc97a1_dirty::{op_name}"

build/{torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so → torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_2dc97a1_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7d2e65e315cd82d0b6fc2043ff37ee2d1223d6bd293ef552d658db5bf4de0a45
 size 1883352

 version https://git-lfs.github.com/spec/v1
+oid sha256:2e6bab72b965f42d466cd74bbda49851549f2810278e642cef8738e40de4fdc5
 size 1883352

build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 import torch
 import torch.distributed as dist
-from torch.distributed._tensor import DTensor, Replicate
 # This code snippet is a modified version adapted from the following GitHub repositories:
@@ -50,15 +50,16 @@ class _muon_state:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
-    mesh = g.device_mesh
     if rank == state.worker_rank:
-        gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())]
     else:
         gather_list = None
@@ -67,7 +68,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
@@ -105,14 +106,14 @@ def _compute_u(state, steps, rank, compute_stream):
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
-    mesh = p.device_mesh
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
-            scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
         else:
             scatter_list = None
@@ -121,7 +122,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             # Clear u to free memory
@@ -129,7 +130,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
         u = DTensor.from_local(
             u,
             placements=p.placements,
-            device_mesh=mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
@@ -235,6 +236,23 @@ class Muon(torch.optim.Optimizer):
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -259,20 +277,20 @@ class Muon(torch.optim.Optimizer):
         round_robin = 0
         mesh = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
-                if mesh.ndim != 1:
-                    raise NotImplementedError(
-                        "Muon requires a 1D mesh for distributed training yet."
-                    )
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
-            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
-            round_robin = (round_robin + 1) % mesh.mesh.numel()
         return param_to_state, ordered_params
@@ -372,7 +390,7 @@ class Muon(torch.optim.Optimizer):
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
-        chunk_size = params[0].device_mesh.mesh.numel()
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

 import torch
 import torch.distributed as dist
+from torch.distributed._tensor import DTensor, Replicate, Shard
 # This code snippet is a modified version adapted from the following GitHub repositories:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
+    process_group = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
     if rank == state.worker_rank:
+        num_ranks = dist.get_world_size(group=state.process_group)
+        gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
     else:
         gather_list = None
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
+            num_ranks = dist.get_world_size(group=state.process_group)
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
+            scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
         else:
             scatter_list = None
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             # Clear u to free memory
         u = DTensor.from_local(
             u,
             placements=p.placements,
+            device_mesh=p.device_mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
+    def get_shard_mesh(self, p, rank):
+        """
+        Get the shard mesh for a parameter p on the given rank.
+        """
+        assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
+        if p.placements == (Shard(dim=0),):
+            # Case for FSDP
+            return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
+        elif p.placements == (Replicate(), Shard(dim=0)):
+            # Case for HSDP
+            for i, shard_mesh in enumerate(p.device_mesh.mesh):
+                if rank in shard_mesh:
+                    return shard_mesh, p.device_mesh.get_group(mesh_dim=1)
+        else:
+            raise ValueError(f"Unsupported placements ({p.placements}).")
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
         round_robin = 0
         mesh = None
+        shard_mesh = None
+        process_group = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
+                shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
+            param_to_state[id(p)].process_group = process_group
+            round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
+        chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc and b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc and b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_1f13dae_dirty
-ops = torch.ops._optimizer_1f13dae_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_1f13dae_dirty::{op_name}"

 import torch
+from . import _optimizer_2dc97a1_dirty
+ops = torch.ops._optimizer_2dc97a1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_2dc97a1_dirty::{op_name}"

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_1f13dae_dirty.abi3.so → _optimizer_2dc97a1_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3d9ee2420e8528032369c476152a1960d123034a83e2c43f38a7fb2d1423aa23
 size 1749840

 version https://git-lfs.github.com/spec/v1
+oid sha256:bdcf9e3d8bf13aa01bf1ae7a94a12dd05c50702a24b57e4cfcc2e54ca5ed21c3
 size 1749840

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 import torch
 import torch.distributed as dist
-from torch.distributed._tensor import DTensor, Replicate
 # This code snippet is a modified version adapted from the following GitHub repositories:
@@ -50,15 +50,16 @@ class _muon_state:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
-    mesh = g.device_mesh
     if rank == state.worker_rank:
-        gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())]
     else:
         gather_list = None
@@ -67,7 +68,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
@@ -105,14 +106,14 @@ def _compute_u(state, steps, rank, compute_stream):
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
-    mesh = p.device_mesh
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
-            scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
         else:
             scatter_list = None
@@ -121,7 +122,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             # Clear u to free memory
@@ -129,7 +130,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
         u = DTensor.from_local(
             u,
             placements=p.placements,
-            device_mesh=mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
@@ -235,6 +236,23 @@ class Muon(torch.optim.Optimizer):
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -259,20 +277,20 @@ class Muon(torch.optim.Optimizer):
         round_robin = 0
         mesh = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
-                if mesh.ndim != 1:
-                    raise NotImplementedError(
-                        "Muon requires a 1D mesh for distributed training yet."
-                    )
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
-            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
-            round_robin = (round_robin + 1) % mesh.mesh.numel()
         return param_to_state, ordered_params
@@ -372,7 +390,7 @@ class Muon(torch.optim.Optimizer):
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
-        chunk_size = params[0].device_mesh.mesh.numel()
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

 import torch
 import torch.distributed as dist
+from torch.distributed._tensor import DTensor, Replicate, Shard
 # This code snippet is a modified version adapted from the following GitHub repositories:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
+    process_group = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
     if rank == state.worker_rank:
+        num_ranks = dist.get_world_size(group=state.process_group)
+        gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
     else:
         gather_list = None
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
+            num_ranks = dist.get_world_size(group=state.process_group)
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
+            scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
         else:
             scatter_list = None
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             # Clear u to free memory
         u = DTensor.from_local(
             u,
             placements=p.placements,
+            device_mesh=p.device_mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
+    def get_shard_mesh(self, p, rank):
+        """
+        Get the shard mesh for a parameter p on the given rank.
+        """
+        assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
+        if p.placements == (Shard(dim=0),):
+            # Case for FSDP
+            return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
+        elif p.placements == (Replicate(), Shard(dim=0)):
+            # Case for HSDP
+            for i, shard_mesh in enumerate(p.device_mesh.mesh):
+                if rank in shard_mesh:
+                    return shard_mesh, p.device_mesh.get_group(mesh_dim=1)
+        else:
+            raise ValueError(f"Unsupported placements ({p.placements}).")
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
         round_robin = 0
         mesh = None
+        shard_mesh = None
+        process_group = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
+                shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
+            param_to_state[id(p)].process_group = process_group
+            round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
+        chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc and b/build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ

build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc and b/build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ

build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_1f13dae_dirty
-ops = torch.ops._optimizer_1f13dae_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_1f13dae_dirty::{op_name}"

 import torch
+from . import _optimizer_2dc97a1_dirty
+ops = torch.ops._optimizer_2dc97a1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_2dc97a1_dirty::{op_name}"

build/{torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_2dc97a1_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:96c7e281f9634e3b252f720f4fea4f61490f2f1a1ef1280a3e259decb41c846f
 size 1824256

 version https://git-lfs.github.com/spec/v1
+oid sha256:a423eb4ab3a31c53a3326c71e34fa59fc661f8d432701e41a7de900a9c23e37c
 size 1824256

build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 import torch
 import torch.distributed as dist
-from torch.distributed._tensor import DTensor, Replicate
 # This code snippet is a modified version adapted from the following GitHub repositories:
@@ -50,15 +50,16 @@ class _muon_state:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
-    mesh = g.device_mesh
     if rank == state.worker_rank:
-        gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())]
     else:
         gather_list = None
@@ -67,7 +68,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
@@ -105,14 +106,14 @@ def _compute_u(state, steps, rank, compute_stream):
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
-    mesh = p.device_mesh
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
-            scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
         else:
             scatter_list = None
@@ -121,7 +122,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             # Clear u to free memory
@@ -129,7 +130,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
         u = DTensor.from_local(
             u,
             placements=p.placements,
-            device_mesh=mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
@@ -235,6 +236,23 @@ class Muon(torch.optim.Optimizer):
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -259,20 +277,20 @@ class Muon(torch.optim.Optimizer):
         round_robin = 0
         mesh = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
-                if mesh.ndim != 1:
-                    raise NotImplementedError(
-                        "Muon requires a 1D mesh for distributed training yet."
-                    )
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
-            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
-            round_robin = (round_robin + 1) % mesh.mesh.numel()
         return param_to_state, ordered_params
@@ -372,7 +390,7 @@ class Muon(torch.optim.Optimizer):
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
-        chunk_size = params[0].device_mesh.mesh.numel()
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

 import torch
 import torch.distributed as dist
+from torch.distributed._tensor import DTensor, Replicate, Shard
 # This code snippet is a modified version adapted from the following GitHub repositories:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
+    process_group = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
     if rank == state.worker_rank:
+        num_ranks = dist.get_world_size(group=state.process_group)
+        gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
     else:
         gather_list = None
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
+            num_ranks = dist.get_world_size(group=state.process_group)
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
+            scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
         else:
             scatter_list = None
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             # Clear u to free memory
         u = DTensor.from_local(
             u,
             placements=p.placements,
+            device_mesh=p.device_mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
+    def get_shard_mesh(self, p, rank):
+        """
+        Get the shard mesh for a parameter p on the given rank.
+        """
+        assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
+        if p.placements == (Shard(dim=0),):
+            # Case for FSDP
+            return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
+        elif p.placements == (Replicate(), Shard(dim=0)):
+            # Case for HSDP
+            for i, shard_mesh in enumerate(p.device_mesh.mesh):
+                if rank in shard_mesh:
+                    return shard_mesh, p.device_mesh.get_group(mesh_dim=1)
+        else:
+            raise ValueError(f"Unsupported placements ({p.placements}).")
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
         round_robin = 0
         mesh = None
+        shard_mesh = None
+        process_group = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
+                shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
+            param_to_state[id(p)].process_group = process_group
+            round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
+        chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc and b/build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ

build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc and b/build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ

build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_1f13dae_dirty
-ops = torch.ops._optimizer_1f13dae_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_1f13dae_dirty::{op_name}"

 import torch
+from . import _optimizer_2dc97a1_dirty
+ops = torch.ops._optimizer_2dc97a1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_2dc97a1_dirty::{op_name}"

build/{torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_2dc97a1_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:89fea7bfad71c806bc10bf2dc6aa66a6e154c09fc418498b1cab7f48a83432d4
 size 1883352

 version https://git-lfs.github.com/spec/v1
+oid sha256:86d98863cc7ef0b271808b0ef7b1082603cfb5a76986481df37431527aaaf27b
 size 1883352

build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 import torch
 import torch.distributed as dist
-from torch.distributed._tensor import DTensor, Replicate
 # This code snippet is a modified version adapted from the following GitHub repositories:
@@ -50,15 +50,16 @@ class _muon_state:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
-    mesh = g.device_mesh
     if rank == state.worker_rank:
-        gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())]
     else:
         gather_list = None
@@ -67,7 +68,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
@@ -105,14 +106,14 @@ def _compute_u(state, steps, rank, compute_stream):
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
-    mesh = p.device_mesh
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
-            scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
         else:
             scatter_list = None
@@ -121,7 +122,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             # Clear u to free memory
@@ -129,7 +130,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
         u = DTensor.from_local(
             u,
             placements=p.placements,
-            device_mesh=mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
@@ -235,6 +236,23 @@ class Muon(torch.optim.Optimizer):
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -259,20 +277,20 @@ class Muon(torch.optim.Optimizer):
         round_robin = 0
         mesh = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
-                if mesh.ndim != 1:
-                    raise NotImplementedError(
-                        "Muon requires a 1D mesh for distributed training yet."
-                    )
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
-            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
-            round_robin = (round_robin + 1) % mesh.mesh.numel()
         return param_to_state, ordered_params
@@ -372,7 +390,7 @@ class Muon(torch.optim.Optimizer):
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
-        chunk_size = params[0].device_mesh.mesh.numel()
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

 import torch
 import torch.distributed as dist
+from torch.distributed._tensor import DTensor, Replicate, Shard
 # This code snippet is a modified version adapted from the following GitHub repositories:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
+    process_group = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
     if rank == state.worker_rank:
+        num_ranks = dist.get_world_size(group=state.process_group)
+        gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
     else:
         gather_list = None
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
+            num_ranks = dist.get_world_size(group=state.process_group)
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
+            scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
         else:
             scatter_list = None
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             # Clear u to free memory
         u = DTensor.from_local(
             u,
             placements=p.placements,
+            device_mesh=p.device_mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
+    def get_shard_mesh(self, p, rank):
+        """
+        Get the shard mesh for a parameter p on the given rank.
+        """
+        assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
+        if p.placements == (Shard(dim=0),):
+            # Case for FSDP
+            return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
+        elif p.placements == (Replicate(), Shard(dim=0)):
+            # Case for HSDP
+            for i, shard_mesh in enumerate(p.device_mesh.mesh):
+                if rank in shard_mesh:
+                    return shard_mesh, p.device_mesh.get_group(mesh_dim=1)
+        else:
+            raise ValueError(f"Unsupported placements ({p.placements}).")
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
         round_robin = 0
         mesh = None
+        shard_mesh = None
+        process_group = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
+                shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
+            param_to_state[id(p)].process_group = process_group
+            round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
+        chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc and b/build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ

build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc and b/build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ

build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_1f13dae_dirty
-ops = torch.ops._optimizer_1f13dae_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_1f13dae_dirty::{op_name}"

 import torch
+from . import _optimizer_2dc97a1_dirty
+ops = torch.ops._optimizer_2dc97a1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_2dc97a1_dirty::{op_name}"

build/{torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so → torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_2dc97a1_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:046a45fae81c2b7d79ff2237a1d26277f4883ef8a8b87a3980bf06d1182711b1
 size 1883352

 version https://git-lfs.github.com/spec/v1
+oid sha256:f8daaad69e6958850f848fab60c9acb938c3a5e54e3ec34a1bec03a3d32653cb
 size 1883352

build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 import torch
 import torch.distributed as dist
-from torch.distributed._tensor import DTensor, Replicate
 # This code snippet is a modified version adapted from the following GitHub repositories:
@@ -50,15 +50,16 @@ class _muon_state:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
-    mesh = g.device_mesh
     if rank == state.worker_rank:
-        gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())]
     else:
         gather_list = None
@@ -67,7 +68,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
@@ -105,14 +106,14 @@ def _compute_u(state, steps, rank, compute_stream):
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
-    mesh = p.device_mesh
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
-            scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
         else:
             scatter_list = None
@@ -121,7 +122,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             # Clear u to free memory
@@ -129,7 +130,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
         u = DTensor.from_local(
             u,
             placements=p.placements,
-            device_mesh=mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
@@ -235,6 +236,23 @@ class Muon(torch.optim.Optimizer):
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -259,20 +277,20 @@ class Muon(torch.optim.Optimizer):
         round_robin = 0
         mesh = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
-                if mesh.ndim != 1:
-                    raise NotImplementedError(
-                        "Muon requires a 1D mesh for distributed training yet."
-                    )
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
-            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
-            round_robin = (round_robin + 1) % mesh.mesh.numel()
         return param_to_state, ordered_params
@@ -372,7 +390,7 @@ class Muon(torch.optim.Optimizer):
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
-        chunk_size = params[0].device_mesh.mesh.numel()
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

 import torch
 import torch.distributed as dist
+from torch.distributed._tensor import DTensor, Replicate, Shard
 # This code snippet is a modified version adapted from the following GitHub repositories:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
+    process_group = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
     if rank == state.worker_rank:
+        num_ranks = dist.get_world_size(group=state.process_group)
+        gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
     else:
         gather_list = None
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
+            num_ranks = dist.get_world_size(group=state.process_group)
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
+            scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
         else:
             scatter_list = None
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             # Clear u to free memory
         u = DTensor.from_local(
             u,
             placements=p.placements,
+            device_mesh=p.device_mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
+    def get_shard_mesh(self, p, rank):
+        """
+        Get the shard mesh and its process group for a parameter p on the given rank.
+
+        Args:
+            p: Parameter to inspect. Must be a DTensor whose placements are
+                either ``(Shard(dim=0),)`` or ``(Replicate(), Shard(dim=0))``.
+            rank: Rank to look up in the device mesh.
+
+        Returns:
+            A ``(shard_mesh, process_group)`` tuple. ``shard_mesh`` is the whole
+            ``p.device_mesh.mesh`` for ``(Shard(dim=0),)`` placements, or the
+            row of ``p.device_mesh.mesh`` that contains ``rank`` for
+            ``(Replicate(), Shard(dim=0))`` placements. ``process_group`` is the
+            group of the corresponding mesh dimension.
+
+        Raises:
+            ValueError: If the placements are unsupported, or if ``rank`` is not
+                found in any row of the mesh in the HSDP case.
+        """
+        assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
+        if p.placements == (Shard(dim=0),):
+            # Case for FSDP
+            return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
+        elif p.placements == (Replicate(), Shard(dim=0)):
+            # Case for HSDP
+            # NOTE(review): get_group() presumably returns the group of the
+            # calling process, so `rank` is expected to be the caller's own
+            # rank -- confirm.
+            for shard_mesh in p.device_mesh.mesh:
+                if rank in shard_mesh:
+                    return shard_mesh, p.device_mesh.get_group(mesh_dim=1)
+            # Fail loudly instead of implicitly returning None, which the
+            # caller would otherwise hit as an opaque tuple-unpacking error.
+            raise ValueError(
+                f"Rank {rank} not found in device mesh ({p.device_mesh.mesh})."
+            )
+        else:
+            raise ValueError(f"Unsupported placements ({p.placements}).")
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
         round_robin = 0
         mesh = None
+        shard_mesh = None
+        process_group = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
+                shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
+            param_to_state[id(p)].process_group = process_group
+            round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
+        chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc and b/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ

build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc and b/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ

build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_1f13dae_dirty
-ops = torch.ops._optimizer_1f13dae_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_1f13dae_dirty::{op_name}"

 import torch
+from . import _optimizer_2dc97a1_dirty
+ops = torch.ops._optimizer_2dc97a1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_2dc97a1_dirty::{op_name}"

build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_1f13dae_dirty.abi3.so → _optimizer_2dc97a1_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0805952950efdbe79c378ca84ae62b77d2d11cd2ba680c8ffccfd79301489ac5
 size 1750000

 version https://git-lfs.github.com/spec/v1
+oid sha256:76910ba81e2c95c83207118725c4379db636346c4ccf05010e2ee00c41dff1ce
 size 1750000

build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 import torch
 import torch.distributed as dist
-from torch.distributed._tensor import DTensor, Replicate
 # This code snippet is a modified version adapted from the following GitHub repositories:
@@ -50,15 +50,16 @@ class _muon_state:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
-    mesh = g.device_mesh
     if rank == state.worker_rank:
-        gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())]
     else:
         gather_list = None
@@ -67,7 +68,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
@@ -105,14 +106,14 @@ def _compute_u(state, steps, rank, compute_stream):
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
-    mesh = p.device_mesh
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
-            scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
         else:
             scatter_list = None
@@ -121,7 +122,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             # Clear u to free memory
@@ -129,7 +130,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
         u = DTensor.from_local(
             u,
             placements=p.placements,
-            device_mesh=mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
@@ -235,6 +236,23 @@ class Muon(torch.optim.Optimizer):
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -259,20 +277,20 @@ class Muon(torch.optim.Optimizer):
         round_robin = 0
         mesh = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
-                if mesh.ndim != 1:
-                    raise NotImplementedError(
-                        "Muon requires a 1D mesh for distributed training yet."
-                    )
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
-            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
-            round_robin = (round_robin + 1) % mesh.mesh.numel()
         return param_to_state, ordered_params
@@ -372,7 +390,7 @@ class Muon(torch.optim.Optimizer):
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
-        chunk_size = params[0].device_mesh.mesh.numel()
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

 import torch
 import torch.distributed as dist
+from torch.distributed._tensor import DTensor, Replicate, Shard
 # This code snippet is a modified version adapted from the following GitHub repositories:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
+    process_group = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
     if rank == state.worker_rank:
+        num_ranks = dist.get_world_size(group=state.process_group)
+        gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
     else:
         gather_list = None
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
+            num_ranks = dist.get_world_size(group=state.process_group)
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
+            scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
         else:
             scatter_list = None
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             # Clear u to free memory
         u = DTensor.from_local(
             u,
             placements=p.placements,
+            device_mesh=p.device_mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
+    def get_shard_mesh(self, p, rank):
+        """
+        Get the shard mesh and its process group for a parameter p on the given rank.
+
+        Args:
+            p: Parameter to inspect. Must be a DTensor whose placements are
+                either ``(Shard(dim=0),)`` or ``(Replicate(), Shard(dim=0))``.
+            rank: Rank to look up in the device mesh.
+
+        Returns:
+            A ``(shard_mesh, process_group)`` tuple. ``shard_mesh`` is the whole
+            ``p.device_mesh.mesh`` for ``(Shard(dim=0),)`` placements, or the
+            row of ``p.device_mesh.mesh`` that contains ``rank`` for
+            ``(Replicate(), Shard(dim=0))`` placements. ``process_group`` is the
+            group of the corresponding mesh dimension.
+
+        Raises:
+            ValueError: If the placements are unsupported, or if ``rank`` is not
+                found in any row of the mesh in the HSDP case.
+        """
+        assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
+        if p.placements == (Shard(dim=0),):
+            # Case for FSDP
+            return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
+        elif p.placements == (Replicate(), Shard(dim=0)):
+            # Case for HSDP
+            # NOTE(review): get_group() presumably returns the group of the
+            # calling process, so `rank` is expected to be the caller's own
+            # rank -- confirm.
+            for shard_mesh in p.device_mesh.mesh:
+                if rank in shard_mesh:
+                    return shard_mesh, p.device_mesh.get_group(mesh_dim=1)
+            # Fail loudly instead of implicitly returning None, which the
+            # caller would otherwise hit as an opaque tuple-unpacking error.
+            raise ValueError(
+                f"Rank {rank} not found in device mesh ({p.device_mesh.mesh})."
+            )
+        else:
+            raise ValueError(f"Unsupported placements ({p.placements}).")
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
         round_robin = 0
         mesh = None
+        shard_mesh = None
+        process_group = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
+                shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
+            param_to_state[id(p)].process_group = process_group
+            round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
+        chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc and b/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ

build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc CHANGED Viewed

Binary files a/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc and b/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ

build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_1f13dae_dirty
-ops = torch.ops._optimizer_1f13dae_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_1f13dae_dirty::{op_name}"

 import torch
+from . import _optimizer_2dc97a1_dirty
+ops = torch.ops._optimizer_2dc97a1_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_2dc97a1_dirty::{op_name}"

build/torch28-cxx11-rocm64-x86_64-linux/optimizer/{_optimizer_1f13dae_dirty.abi3.so → _optimizer_2dc97a1_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:af91f4eec9fc14d66f3db4e120d4913a0e62102c76b9b8cd9c25d8af427be290
 size 1750088

 version https://git-lfs.github.com/spec/v1
+oid sha256:dd0a35a6f846a075a8f4561cfc66ef17c6358dd4a0062e63057b02625d9d6af7
 size 1750088

build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 import torch
 import torch.distributed as dist
-from torch.distributed._tensor import DTensor, Replicate
 # This code snippet is a modified version adapted from the following GitHub repositories:
@@ -50,15 +50,16 @@ class _muon_state:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
-    mesh = g.device_mesh
     if rank == state.worker_rank:
-        gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())]
     else:
         gather_list = None
@@ -67,7 +68,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
@@ -105,14 +106,14 @@ def _compute_u(state, steps, rank, compute_stream):
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
-    mesh = p.device_mesh
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
-            scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
         else:
             scatter_list = None
@@ -121,7 +122,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             # Clear u to free memory
@@ -129,7 +130,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
         u = DTensor.from_local(
             u,
             placements=p.placements,
-            device_mesh=mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
@@ -235,6 +236,23 @@ class Muon(torch.optim.Optimizer):
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -259,20 +277,20 @@ class Muon(torch.optim.Optimizer):
         round_robin = 0
         mesh = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
-                if mesh.ndim != 1:
-                    raise NotImplementedError(
-                        "Muon requires a 1D mesh for distributed training yet."
-                    )
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
-            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
-            round_robin = (round_robin + 1) % mesh.mesh.numel()
         return param_to_state, ordered_params
@@ -372,7 +390,7 @@ class Muon(torch.optim.Optimizer):
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
-        chunk_size = params[0].device_mesh.mesh.numel()
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

 import torch
 import torch.distributed as dist
+from torch.distributed._tensor import DTensor, Replicate, Shard
 # This code snippet is a modified version adapted from the following GitHub repositories:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
+    process_group = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
     if rank == state.worker_rank:
+        num_ranks = dist.get_world_size(group=state.process_group)
+        gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
     else:
         gather_list = None
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
+            num_ranks = dist.get_world_size(group=state.process_group)
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
+            scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
         else:
             scatter_list = None
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             # Clear u to free memory
         u = DTensor.from_local(
             u,
             placements=p.placements,
+            device_mesh=p.device_mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
+    def get_shard_mesh(self, p, rank):
+        """
+        Get the shard mesh and its process group for a parameter p on the given rank.
+
+        Args:
+            p: Parameter to inspect. Must be a DTensor whose placements are
+                either ``(Shard(dim=0),)`` or ``(Replicate(), Shard(dim=0))``.
+            rank: Rank to look up in the device mesh.
+
+        Returns:
+            A ``(shard_mesh, process_group)`` tuple. ``shard_mesh`` is the whole
+            ``p.device_mesh.mesh`` for ``(Shard(dim=0),)`` placements, or the
+            row of ``p.device_mesh.mesh`` that contains ``rank`` for
+            ``(Replicate(), Shard(dim=0))`` placements. ``process_group`` is the
+            group of the corresponding mesh dimension.
+
+        Raises:
+            ValueError: If the placements are unsupported, or if ``rank`` is not
+                found in any row of the mesh in the HSDP case.
+        """
+        assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
+        if p.placements == (Shard(dim=0),):
+            # Case for FSDP
+            return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
+        elif p.placements == (Replicate(), Shard(dim=0)):
+            # Case for HSDP
+            # NOTE(review): get_group() presumably returns the group of the
+            # calling process, so `rank` is expected to be the caller's own
+            # rank -- confirm.
+            for shard_mesh in p.device_mesh.mesh:
+                if rank in shard_mesh:
+                    return shard_mesh, p.device_mesh.get_group(mesh_dim=1)
+            # Fail loudly instead of implicitly returning None, which the
+            # caller would otherwise hit as an opaque tuple-unpacking error.
+            raise ValueError(
+                f"Rank {rank} not found in device mesh ({p.device_mesh.mesh})."
+            )
+        else:
+            raise ValueError(f"Unsupported placements ({p.placements}).")
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
         round_robin = 0
         mesh = None
+        shard_mesh = None
+        process_group = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
+                shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
+            param_to_state[id(p)].process_group = process_group
+            round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
+        chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

torch-ext/optimizer/muon.py CHANGED Viewed

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 import torch
 import torch.distributed as dist
-from torch.distributed._tensor import DTensor, Replicate
 # This code snippet is a modified version adapted from the following GitHub repositories:
@@ -50,15 +50,16 @@ class _muon_state:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
-    mesh = g.device_mesh
     if rank == state.worker_rank:
-        gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())]
     else:
         gather_list = None
@@ -67,7 +68,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
@@ -105,14 +106,14 @@ def _compute_u(state, steps, rank, compute_stream):
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
-    mesh = p.device_mesh
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
-            scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
         else:
             scatter_list = None
@@ -121,7 +122,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
-            group=mesh.get_group(),
         )
         if rank == state.worker_rank:
             # Clear u to free memory
@@ -129,7 +130,7 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
         u = DTensor.from_local(
             u,
             placements=p.placements,
-            device_mesh=mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
@@ -235,6 +236,23 @@ class Muon(torch.optim.Optimizer):
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -259,20 +277,20 @@ class Muon(torch.optim.Optimizer):
         round_robin = 0
         mesh = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
-                if mesh.ndim != 1:
-                    raise NotImplementedError(
-                        "Muon requires a 1D mesh for distributed training yet."
-                    )
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
-            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
-            round_robin = (round_robin + 1) % mesh.mesh.numel()
         return param_to_state, ordered_params
@@ -372,7 +390,7 @@ class Muon(torch.optim.Optimizer):
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
-        chunk_size = params[0].device_mesh.mesh.numel()
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())

 import torch
 import torch.distributed as dist
+from torch.distributed._tensor import DTensor, Replicate, Shard
 # This code snippet is a modified version adapted from the following GitHub repositories:
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
+    process_group = None
 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
     g = p.grad
     if rank == state.worker_rank:
+        num_ranks = dist.get_world_size(group=state.process_group)
+        gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
     else:
         gather_list = None
             g.to_local(),
             dst=state.worker_rank,
             gather_list=gather_list,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
 @torch.no_grad()
 def _scatter(p, state, lr, weight_decay, rank, comm_stream):
     u = state.computed_u
     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
+            num_ranks = dist.get_world_size(group=state.process_group)
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
+            scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
         else:
             scatter_list = None
             u,
             scatter_list=scatter_list,
             src=state.worker_rank,
+            group=state.process_group,
         )
         if rank == state.worker_rank:
             # Clear u to free memory
         u = DTensor.from_local(
             u,
             placements=p.placements,
+            device_mesh=p.device_mesh,
         )
         p.data.mul_(1 - lr * weight_decay)
         p.data.add_(u, alpha=-lr)
         adjusted_lr = lr * adjusted_ratio
         return adjusted_lr
+    def get_shard_mesh(self, p, rank):
+        """
+        Get the shard mesh and its process group for a parameter p on the given rank.
+
+        Args:
+            p: Parameter to inspect. Must be a DTensor whose placements are
+                either ``(Shard(dim=0),)`` or ``(Replicate(), Shard(dim=0))``.
+            rank: Rank to look up in the device mesh.
+
+        Returns:
+            A ``(shard_mesh, process_group)`` tuple. ``shard_mesh`` is the whole
+            ``p.device_mesh.mesh`` for ``(Shard(dim=0),)`` placements, or the
+            row of ``p.device_mesh.mesh`` that contains ``rank`` for
+            ``(Replicate(), Shard(dim=0))`` placements. ``process_group`` is the
+            group of the corresponding mesh dimension.
+
+        Raises:
+            ValueError: If the placements are unsupported, or if ``rank`` is not
+                found in any row of the mesh in the HSDP case.
+        """
+        assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
+        if p.placements == (Shard(dim=0),):
+            # Case for FSDP
+            return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
+        elif p.placements == (Replicate(), Shard(dim=0)):
+            # Case for HSDP
+            # NOTE(review): get_group() presumably returns the group of the
+            # calling process, so `rank` is expected to be the caller's own
+            # rank -- confirm.
+            for shard_mesh in p.device_mesh.mesh:
+                if rank in shard_mesh:
+                    return shard_mesh, p.device_mesh.get_group(mesh_dim=1)
+            # Fail loudly instead of implicitly returning None, which the
+            # caller would otherwise hit as an opaque tuple-unpacking error.
+            raise ValueError(
+                f"Rank {rank} not found in device mesh ({p.device_mesh.mesh})."
+            )
+        else:
+            raise ValueError(f"Unsupported placements ({p.placements}).")
     def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
         round_robin = 0
         mesh = None
+        shard_mesh = None
+        process_group = None
         for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
+                shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             elif mesh != p.device_mesh:
                 raise ValueError("All parameters must be on the same mesh.")
             param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
+            param_to_state[id(p)].process_group = process_group
+            round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
                 )
+        chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())