support torch 2.8
#3 by iamwyldecat - opened
This view is limited to 50 files because it contains too many changes. See the raw diff here.
- build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py +0 -9
- build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +0 -3
- build/torch26-cxx98-cu124-x86_64-linux/optimizer/__init__.py +0 -5
- build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py +0 -9
- build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +0 -3
- build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py +0 -494
- build/torch26-cxx98-cu126-x86_64-linux/optimizer/__init__.py +0 -5
- build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py +0 -9
- build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +0 -3
- build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py +0 -494
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/__init__.py +0 -0
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_02ac540_dirty.abi3.so → _optimizer_1f13dae_dirty.abi3.so} +1 -1
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +0 -0
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/__init__.py +0 -0
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_02ac540_dirty.abi3.so → _optimizer_1f13dae_dirty.abi3.so} +1 -1
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +0 -0
- build/torch27-cxx11-cu128-x86_64-linux/optimizer/__init__.py +0 -0
- build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
- build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
- build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch27-cxx11-cu128-x86_64-linux/optimizer/{_optimizer_02ac540_dirty.abi3.so → _optimizer_1f13dae_dirty.abi3.so} +1 -1
- build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py +0 -0
- build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__init__.py +0 -0
- build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-312.pyc +0 -0
- build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
- build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-312.pyc +0 -0
- build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
- build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +0 -3
- build/{torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so → torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so} +2 -2
- build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py +0 -0
- build/{torch26-cxx11-cu118-x86_64-linux → torch28-cxx11-cu126-x86_64-linux}/optimizer/__init__.py +0 -0
- build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
- build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
- build/{torch26-cxx11-cu118-x86_64-linux → torch28-cxx11-cu126-x86_64-linux}/optimizer/_ops.py +3 -3
- build/{torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so} +2 -2
- build/{torch26-cxx11-cu118-x86_64-linux → torch28-cxx11-cu126-x86_64-linux}/optimizer/muon.py +0 -0
- build/{torch26-cxx11-cu124-x86_64-linux → torch28-cxx11-cu128-x86_64-linux}/optimizer/__init__.py +0 -0
- build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
- build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
- build/{torch26-cxx11-cu126-x86_64-linux → torch28-cxx11-cu128-x86_64-linux}/optimizer/_ops.py +3 -3
- build/{torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so} +2 -2
- build/{torch26-cxx11-cu124-x86_64-linux → torch28-cxx11-cu128-x86_64-linux}/optimizer/muon.py +0 -0
- build/{torch26-cxx11-cu126-x86_64-linux → torch28-cxx11-cu129-x86_64-linux}/optimizer/__init__.py +0 -0
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py
DELETED
@@ -1,9 +0,0 @@
-import torch
-from . import _optimizer_02ac540_dirty
-ops = torch.ops._optimizer_02ac540_dirty
-
-def add_op_namespace_prefix(op_name: str):
-    """
-    Prefix op by namespace.
-    """
-    return f"_optimizer_02ac540_dirty::{op_name}"
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9e09882858886be06e8ac48d184b320c57624d9c85165ce8b56640b022838e44
-size 1787192
build/torch26-cxx98-cu124-x86_64-linux/optimizer/__init__.py
DELETED
@@ -1,5 +0,0 @@
-from .muon import Muon
-
-__all__ = [
-    "Muon",
-]
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py
DELETED
@@ -1,9 +0,0 @@
-import torch
-from . import _optimizer_02ac540_dirty
-ops = torch.ops._optimizer_02ac540_dirty
-
-def add_op_namespace_prefix(op_name: str):
-    """
-    Prefix op by namespace.
-    """
-    return f"_optimizer_02ac540_dirty::{op_name}"
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6f63b2cd2c67b44f5e54837a0a4f26d94d3e6e8bfa4964bd99fc7e38494e2d52
-size 1824184
build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py
DELETED
@@ -1,494 +0,0 @@
-import math
-from dataclasses import dataclass
-
-import torch
-import torch.distributed as dist
-from torch.distributed._tensor import DTensor, Replicate
-
-
-# This code snippet is a modified version adapted from the following GitHub repositories:
-# https://github.com/KellerJordan/Muon/blob/master/muon.py
-@torch.no_grad()
-def _zeropower_via_newtonschulz5(G, steps):
-    """
-    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
-    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
-    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
-    zero even beyond the point where the iteration no longer converges all the way to one everywhere
-    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
-    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
-    performance at all relative to UV^T, where USV^T = G is the SVD.
-    """
-    assert len(G.shape) == 2
-    a, b, c = (3.4445, -4.7750, 2.0315)
-    X = G  # no manual typecast
-    if G.size(0) > G.size(1):
-        X = X.T
-    # Ensure spectral norm is at most 1
-    X = X / (X.norm() + 1e-7)
-    X = X.bfloat16()
-    # Perform the NS iterations
-    for _ in range(steps):
-        A = X @ X.T
-        # B = (
-        #     b * A + c * A @ A
-        # )
-        B = torch.addmm(A, A, A, alpha=c, beta=b)
-        # X = a * X + B @ X
-        X = torch.addmm(X, B, X, alpha=1.0, beta=a)
-
-    if G.size(0) > G.size(1):
-        X = X.T
-    return X.to(G.dtype)
-
-
-@dataclass
-class _muon_state:
-    # TODO: use Optional
-    worker_rank: int | None = None
-    gathered_grad: torch.Tensor | None = None
-    computed_u: torch.Tensor | None = None
-    gather_event: torch.cuda.Event | None = None
-    compute_event: torch.cuda.Event | None = None
-
-
-@torch.no_grad()
-def _gather(p, state, rank, comm_stream, none_grad):
-    g = p.grad
-    mesh = g.device_mesh
-
-    if rank == state.worker_rank:
-        gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())]
-    else:
-        gather_list = None
-
-    with torch.cuda.stream(comm_stream):
-        torch.distributed.gather(
-            g.to_local(),
-            dst=state.worker_rank,
-            gather_list=gather_list,
-            group=mesh.get_group(),
-        )
-        if rank == state.worker_rank:
-            if state.gathered_grad is not None:
-                raise RuntimeError(
-                    "Gather event already exists, which should not happen."
-                )
-            state.gathered_grad = torch.cat(gather_list, dim=0)
-            state.gather_event = torch.cuda.Event()
-            state.gather_event.record()
-        else:
-            state.gathered_grad = None
-            state.gather_event = None
-        if none_grad:
-            p.grad = None
-
-
-@torch.no_grad()
-def _compute_u(state, steps, rank, compute_stream):
-    with torch.cuda.stream(compute_stream):
-        if rank == state.worker_rank:
-            if state.gather_event is None:
-                raise RuntimeError("Gather event must be set before compute.")
-            compute_stream.wait_event(state.gather_event)
-            u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
-            state.computed_u = u
-            state.compute_event = torch.cuda.Event()
-            state.compute_event.record()
-            # Clear the gathered gradient to free memory
-            state.gathered_grad = None
-        else:
-            state.computed_u = None
-            state.compute_event = None
-
-
-@torch.no_grad()
-def _scatter(p, state, lr, weight_decay, rank, comm_stream):
-    u = state.computed_u
-    mesh = p.device_mesh
-
-    with torch.cuda.stream(comm_stream):
-        if rank == state.worker_rank:
-            if state.compute_event is None:
-                raise RuntimeError("Compute event must be set before scatter.")
-            comm_stream.wait_event(state.compute_event)
-            scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
-        else:
-            scatter_list = None
-
-        u = torch.empty_like(p.to_local())
-        torch.distributed.scatter(
-            u,
-            scatter_list=scatter_list,
-            src=state.worker_rank,
-            group=mesh.get_group(),
-        )
-        if rank == state.worker_rank:
-            # Clear u to free memory
-            state.computed_u = None
-        u = DTensor.from_local(
-            u,
-            placements=p.placements,
-            device_mesh=mesh,
-        )
-        p.data.mul_(1 - lr * weight_decay)
-        p.data.add_(u, alpha=-lr)
-
-
-def default_is_muon(x, name):
-    return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name
-
-
-class Muon(torch.optim.Optimizer):
-    """
-    Muon - MomentUm Orthogonalized by Newton-schulz
-
-    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
-    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
-    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
-    the advantage that it can be stably run in bfloat16 on the GPU.
-
-    Some warnings:
-    - We believe this optimizer is unlikely to work well for training with small batch size.
-    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
-
-    Arguments:
-        muon_params: The parameters to be optimized by Muon.
-        lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
-        momentum: The momentum used by the internal SGD. (0.95 is a good default)
-        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
-        ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
-        adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are
-            {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
-        adamw_lr: The learning rate for the internal AdamW.
-        adamw_betas: The betas for the internal AdamW.
-        adamw_eps: The epsilon for the internal AdamW.
-        adamw_weight_decay: The weight decay for the internal AdamW.
-    """
-
-    def __init__(
-        self,
-        model,
-        is_muon_func=default_is_muon,
-        lr=1e-3,
-        momentum=0.95,
-        nesterov=True,
-        ns_steps=5,
-        weight_decay=0.1,
-        adamw_betas=(0.9, 0.95),
-        adamw_eps=1e-8,
-        none_grad=True,
-        debug=False,
-    ):
-        defaults = dict(
-            lr=lr,
-            weight_decay=weight_decay,
-            momentum=momentum,
-            nesterov=nesterov,
-            ns_steps=ns_steps,
-            adamw_betas=adamw_betas,
-            adamw_eps=adamw_eps,
-            none_grad=none_grad,
-        )
-
-        super().__init__(model.parameters(), defaults)
-        self.is_muon_func = is_muon_func
-        self.model = model
-
-        if dist.is_initialized():
-            self.rank = dist.get_rank()
-        else:
-            self.rank = None
-
-        self.comm_stream = torch.cuda.Stream()
-        self.compute_stream = torch.cuda.Stream()
-        self.debug = debug
-
-    def __setstate__(self, state):
-        # Sort parameters into those for which we will use Muon, and those for which we will not
-        super().__setstate__(state)
-        self._init_state()
-
-    def _init_state(self):
-        for name, p in self.model.named_parameters():
-            if self.is_muon_func(p, name):
-                # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
-                assert p.ndim == 2, p.ndim
-                self.state[p]["use_muon"] = True
-            else:
-                # Do not use Muon for parameters in adamw_params
-                self.state[p]["use_muon"] = False
-
-    def _calc_flops(self, G, steps):
-        assert len(G.shape) == 2
-        M, N = G.shape
-        if M > N:
-            M, N = N, M
-
-        return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
-
-    def adjust_lr_for_muon(self, lr, param_shape):
-        A, B = param_shape[:2]
-        # We adjust the learning rate and weight decay based on the size of the parameter matrix
-        # as describted in the paper
-        adjusted_ratio = 0.2 * math.sqrt(max(A, B))
-        adjusted_lr = lr * adjusted_ratio
-        return adjusted_lr
-
-    def init_state_and_assign_params(self, params, group):
-        param_to_state = {}
-        param_to_flops = {}
-
-        total_flops = 0
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            assert g.ndim == 2, "Muon only supports 2D parameters."
-
-            flops = self._calc_flops(g, group["ns_steps"])
-            param_to_flops[id(p)] = flops
-            total_flops += flops
-
-        if self.debug:
-            print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
-
-        ordered_params = sorted(
-            params, key=lambda p: param_to_flops[id(p)], reverse=True
-        )
-
-        round_robin = 0
-        mesh = None
-        for p in ordered_params:
-            if mesh is None:
-                mesh = p.device_mesh
-                if mesh.ndim != 1:
-                    raise NotImplementedError(
-                        "Muon requires a 1D mesh for distributed training yet."
-                    )
-            elif mesh != p.device_mesh:
-                raise ValueError("All parameters must be on the same mesh.")
-
-            param_to_state[id(p)] = _muon_state()
-            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
-
-            round_robin = (round_robin + 1) % mesh.mesh.numel()
-
-        return param_to_state, ordered_params
-
-    def base(self, params, group, lr, weight_decay, momentum):
-        # generate weight updates in distributed fashion
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            if g.ndim > 2:
-                g = g.view(g.size(0), -1)
-            assert g is not None
-
-            # calc update
-            state = self.state[p]
-            if "momentum_buffer" not in state:
-                state["momentum_buffer"] = torch.zeros_like(g)
-            buf = state["momentum_buffer"]
-            buf.mul_(momentum).add_(g)
-            if group["nesterov"]:
-                g = g.add(buf, alpha=momentum)
-            else:
-                g = buf
-
-            u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
-
-            # scale update
-            adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
-
-            # apply weight decay
-            p.data.mul_(1 - lr * weight_decay)
-
-            # apply update
-            p.data.add_(u, alpha=-adjusted_lr)
-
-    def _update_g(self, p, g, group, momentum):
-        # calc update
-        state = self.state[p]
-        if "momentum_buffer" not in state:
-            state["momentum_buffer"] = torch.zeros_like(g)
-        buf = state["momentum_buffer"]
-        buf.mul_(momentum).add_(g)
-        if group["nesterov"]:
-            g = g.add(buf, alpha=momentum)
-        else:
-            g = buf
-        return g
-
-    def _update_p(self, p, u, lr, weight_decay):
-        # scale update
-        adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
-        # apply weight decay
-        p.data.mul_(1 - lr * weight_decay)
-        # apply update
-        p.data.add_(u, alpha=-adjusted_lr)
-
-    def parallel(self, params, group, lr, weight_decay, momentum):
-        """
-        Perform a parallel optimization step using Muon.
-        """
-
-        for p in params:
-            g = p.grad
-            if g is None:
-                continue
-            if g.ndim > 2:
-                g = g.view(g.size(0), -1)
-
-            # Update g in the local rank
-            g = self._update_g(
-                p,
-                g,
-                group,
-                momentum=momentum,
-            )
-            p.grad = g
-
-        param_to_state, ordered_params = self.init_state_and_assign_params(
-            params, group
-        )
-
-        def enqueue_gathers(start_idx, chunk_size):
-            for p in ordered_params[start_idx : start_idx + chunk_size]:
-                state = param_to_state[id(p)]
-                _gather(p, state, self.rank, self.comm_stream, group["none_grad"])
-
-        def enqueue_computes(start_idx, chunk_size):
-            for p in ordered_params[start_idx : start_idx + chunk_size]:
-                state = param_to_state[id(p)]
-                _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
-
-        def enqueue_scatters(start_idx, chunk_size):
-            for p in ordered_params[start_idx : start_idx + chunk_size]:
-                state = param_to_state[id(p)]
-                adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
-                _scatter(
-                    p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
-                )
-
-        chunk_size = params[0].device_mesh.mesh.numel()
-
-        # Wait grad update
-        self.comm_stream.wait_stream(torch.cuda.current_stream())
-
-        enqueue_gathers(0, chunk_size)
-        for i in range(0, len(params) + chunk_size - 1, chunk_size):
-            enqueue_computes(i, chunk_size)
-            enqueue_gathers(i + chunk_size, chunk_size)
-            enqueue_scatters(i, chunk_size)
-
-        torch.cuda.current_stream().wait_stream(self.comm_stream)
-
-    def step(self, closure=None):
-        """Perform a single optimization step.
-
-        Args:
-            closure (Callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            with torch.enable_grad():
-                loss = closure()
-
-        for group in self.param_groups:
-            ############################
-            #           Muon           #
-            ############################
-
-            if "use_muon" not in self.state[group["params"][0]]:
-                self._init_state()
-
-            params = [p for p in group["params"] if self.state[p]["use_muon"]]
-            lr = group["lr"]
-            weight_decay = group["weight_decay"]
-            momentum = group["momentum"]
-
-            param_dtensors = []
-            param_tensors = []
-
-            for p in params:
-                if p is None or p.grad is None:
-                    continue
-                if isinstance(p.data, DTensor):
-                    if all(
-                        isinstance(placement, Replicate) for placement in p.placements
-                    ):
-                        param_tensors.append(p)
-                    else:
-                        param_dtensors.append(p)
-                elif isinstance(p.data, torch.Tensor):
-                    param_tensors.append(p)
-                else:
-                    raise TypeError(f"Unsupported parameter type: {type(p.data)}")
-
-            if self.debug:
-                print(
-                    f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors",
-                    flush=True,
-                )
-
-            if len(param_dtensors) > 0:
-                if not dist.is_initialized():
-                    raise RuntimeError(
-                        "Parallel Muon requires torch.distributed to be initialized."
-                    )
-
-                self.parallel(
-                    param_dtensors,
-                    group,
-                    lr=lr,
-                    weight_decay=weight_decay,
-                    momentum=momentum,
-                )
-
-            if len(param_tensors) > 0:
-                self.base(
-                    param_tensors,
-                    group,
-                    lr=lr,
-                    weight_decay=weight_decay,
-                    momentum=momentum,
-                )
-
-            ############################
-            #       AdamW backup       #
-            ############################
-
-            params = [p for p in group["params"] if not self.state[p]["use_muon"]]
-            lr = group["lr"]
-            beta1, beta2 = group["adamw_betas"]
-            eps = group["adamw_eps"]
-            weight_decay = group["weight_decay"]
-
-            for p in params:
-                g = p.grad
-                if g is None:
-                    continue
-                state = self.state[p]
-                if "step" not in state:
-                    state["step"] = 0
-                    state["moment1"] = torch.zeros_like(g)
-                    state["moment2"] = torch.zeros_like(g)
-                state["step"] += 1
-                step = state["step"]
-                buf1 = state["moment1"]
-                buf2 = state["moment2"]
-                buf1.lerp_(g, 1 - beta1)
-                buf2.lerp_(g.square(), 1 - beta2)
-
-                g = buf1 / (eps + buf2.sqrt())
-
-                bias_correction1 = 1 - beta1**step
-                bias_correction2 = 1 - beta2**step
-                scale = bias_correction1 / bias_correction2**0.5
-                p.data.mul_(1 - lr * weight_decay)
-                p.data.add_(g, alpha=-lr / scale)
-
-        return loss
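The heart of the deleted module is the quintic Newton-Schulz orthogonalization. As a quick sanity check outside the diff, here is a minimal standalone sketch of that iteration (float32 on CPU instead of the module's bfloat16-on-GPU, with a made-up random test matrix) showing it pushing all singular values into roughly the (0.5, 1.5) band the docstring describes:

```python
# Standalone sketch, not part of the PR: the same quintic Newton-Schulz
# iteration as _zeropower_via_newtonschulz5, run in float32 on CPU so it
# works without a GPU. The 64x128 test matrix is arbitrary.
import torch

def zeropower_via_newtonschulz5(G, steps=5):
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G
    if G.size(0) > G.size(1):
        X = X.T
    X = X / (X.norm() + 1e-7)  # spectral norm <= Frobenius norm <= 1
    for _ in range(steps):
        A = X @ X.T
        B = torch.addmm(A, A, A, alpha=c, beta=b)    # B = b*A + c*(A @ A)
        X = torch.addmm(X, B, X, alpha=1.0, beta=a)  # X = a*X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X

G = torch.randn(64, 128)
U = zeropower_via_newtonschulz5(G, steps=5)
S = torch.linalg.svdvals(U)
# All singular values end up near 1 (approximately within (0.5, 1.5)),
# i.e. U is approximately orthogonal, as the deleted docstring claims.
print(S.min().item(), S.max().item())
```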
build/torch26-cxx98-cu126-x86_64-linux/optimizer/__init__.py
DELETED
@@ -1,5 +0,0 @@
-from .muon import Muon
-
-__all__ = [
-    "Muon",
-]
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py
DELETED
@@ -1,9 +0,0 @@
-import torch
-from . import _optimizer_02ac540_dirty
-ops = torch.ops._optimizer_02ac540_dirty
-
-def add_op_namespace_prefix(op_name: str):
-    """
-    Prefix op by namespace.
-    """
-    return f"_optimizer_02ac540_dirty::{op_name}"
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:48795cb66a740b14266d757ac70a6b43fb11df6662970bb4040650d237e6cbc5
-size 1824184
build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py
DELETED
@@ -1,494 +0,0 @@
(494 deleted lines, identical to build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py above)
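For context, here is a hypothetical single-GPU usage sketch of the Muon class from the deleted muon.py. It assumes the built package is importable as `optimizer` (its deleted `__init__.py` re-exports `Muon`) and that a CUDA device is available, since the constructor allocates CUDA streams; the toy model and shapes are made up:

```python
# Hypothetical usage sketch, not part of the PR. Assumes `optimizer` is
# importable and a CUDA device exists (Muon.__init__ creates CUDA streams).
import torch
import torch.nn as nn
from optimizer import Muon

model = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 32)).cuda()
opt = Muon(model, lr=1e-3, momentum=0.95, ns_steps=5, weight_decay=0.1)

x = torch.randn(8, 32, device="cuda")
loss = model(x).square().mean()
loss.backward()
opt.step()       # 2D weights take the Muon path; 1D biases fall back to AdamW
opt.zero_grad()
```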
build/torch27-cxx11-cu118-x86_64-linux/optimizer/__init__.py
CHANGED
File without changes

build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (307 Bytes)

build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc
ADDED
Binary file (22.4 kB)
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_02ac540_dirty
-ops = torch.ops._optimizer_02ac540_dirty
+from . import _optimizer_1f13dae_dirty
+ops = torch.ops._optimizer_1f13dae_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_02ac540_dirty::{op_name}"
+    return f"_optimizer_1f13dae_dirty::{op_name}"
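The only source change in these `_ops.py` shims is re-pointing the namespace from the old `_optimizer_02ac540_dirty` build to the rebuilt `_optimizer_1f13dae_dirty` binary. As an illustrative sketch of what the helper resolves to after the bump (`"some_op"` is a placeholder, not an op name taken from this repository):

```python
# Illustrative sketch only; "some_op" is a hypothetical op name.
def add_op_namespace_prefix(op_name: str) -> str:
    # Same behavior as the updated _ops.py helper: qualify an op name
    # with the namespace of the rebuilt extension binary.
    return f"_optimizer_1f13dae_dirty::{op_name}"

print(add_op_namespace_prefix("some_op"))  # _optimizer_1f13dae_dirty::some_op
```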
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_02ac540_dirty.abi3.so → _optimizer_1f13dae_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:7dc5f8a57aa60483209dfcbb0c7cc0e54f1739d643145c1e685fbe2b6675ac43
 size 1787368
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py
CHANGED
File without changes

build/torch27-cxx11-cu126-x86_64-linux/optimizer/__init__.py
CHANGED
File without changes

build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (307 Bytes)

build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc
ADDED
Binary file (22.4 kB)
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_02ac540_dirty
-ops = torch.ops._optimizer_02ac540_dirty
+from . import _optimizer_1f13dae_dirty
+ops = torch.ops._optimizer_1f13dae_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_02ac540_dirty::{op_name}"
+    return f"_optimizer_1f13dae_dirty::{op_name}"
build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_02ac540_dirty.abi3.so → _optimizer_1f13dae_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:96c7e281f9634e3b252f720f4fea4f61490f2f1a1ef1280a3e259decb41c846f
 size 1824256
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py
CHANGED
File without changes

build/torch27-cxx11-cu128-x86_64-linux/optimizer/__init__.py
CHANGED
File without changes

build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (307 Bytes)

build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc
ADDED
Binary file (22.4 kB)
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_02ac540_dirty
-ops = torch.ops._optimizer_02ac540_dirty
+from . import _optimizer_1f13dae_dirty
+ops = torch.ops._optimizer_1f13dae_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_02ac540_dirty::{op_name}"
+    return f"_optimizer_1f13dae_dirty::{op_name}"
build/torch27-cxx11-cu128-x86_64-linux/optimizer/{_optimizer_02ac540_dirty.abi3.so → _optimizer_1f13dae_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:046a45fae81c2b7d79ff2237a1d26277f4883ef8a8b87a3980bf06d1182711b1
 size 1883352
build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py
CHANGED
File without changes

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__init__.py
CHANGED
File without changes

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-312.pyc
DELETED
Binary file (252 Bytes)

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (308 Bytes)

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-312.pyc
DELETED
Binary file (22.3 kB)

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc
ADDED
Binary file (22.4 kB)
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_02ac540_dirty
-ops = torch.ops._optimizer_02ac540_dirty
+from . import _optimizer_1f13dae_dirty
+ops = torch.ops._optimizer_1f13dae_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_02ac540_dirty::{op_name}"
+    return f"_optimizer_1f13dae_dirty::{op_name}"
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a96bfd1f461d7cd029dd39d142d2999dcc86dd7f56fb40f045e00f3fb2c400bd
-size 1749648
build/{torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so → torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3d9ee2420e8528032369c476152a1960d123034a83e2c43f38a7fb2d1423aa23
+size 1749840
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py
CHANGED
File without changes

build/{torch26-cxx11-cu118-x86_64-linux → torch28-cxx11-cu126-x86_64-linux}/optimizer/__init__.py
RENAMED
File without changes

build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (307 Bytes)

build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc
ADDED
Binary file (22.4 kB)
build/{torch26-cxx11-cu118-x86_64-linux → torch28-cxx11-cu126-x86_64-linux}/optimizer/_ops.py
RENAMED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_02ac540_dirty
-ops = torch.ops._optimizer_02ac540_dirty
+from . import _optimizer_1f13dae_dirty
+ops = torch.ops._optimizer_1f13dae_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_02ac540_dirty::{op_name}"
+    return f"_optimizer_1f13dae_dirty::{op_name}"
build/{torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:a082b5629efc4e9b8ce608713665d47904949b5d220dad350049bc806d58ecd7
+size 1824256
build/{torch26-cxx11-cu118-x86_64-linux → torch28-cxx11-cu126-x86_64-linux}/optimizer/muon.py
RENAMED
File without changes

build/{torch26-cxx11-cu124-x86_64-linux → torch28-cxx11-cu128-x86_64-linux}/optimizer/__init__.py
RENAMED
File without changes

build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (307 Bytes)

build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc
ADDED
Binary file (22.4 kB)
build/{torch26-cxx11-cu126-x86_64-linux → torch28-cxx11-cu128-x86_64-linux}/optimizer/_ops.py
RENAMED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_02ac540_dirty
-ops = torch.ops._optimizer_02ac540_dirty
+from . import _optimizer_1f13dae_dirty
+ops = torch.ops._optimizer_1f13dae_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_02ac540_dirty::{op_name}"
+    return f"_optimizer_1f13dae_dirty::{op_name}"
build/{torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:7d2e65e315cd82d0b6fc2043ff37ee2d1223d6bd293ef552d658db5bf4de0a45
+size 1883352
build/{torch26-cxx11-cu124-x86_64-linux → torch28-cxx11-cu128-x86_64-linux}/optimizer/muon.py
RENAMED
File without changes

build/{torch26-cxx11-cu126-x86_64-linux → torch28-cxx11-cu129-x86_64-linux}/optimizer/__init__.py
RENAMED
File without changes