diff --git a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py b/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py deleted file mode 100755 index f9005c0e58c46fac8d403b1388237929a9286555..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _optimizer_02ac540_dirty -ops = torch.ops._optimizer_02ac540_dirty - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_optimizer_02ac540_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so b/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so deleted file mode 100755 index 118377bf44dcf4bcfeeb598f98093c19362684a8..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:829533f24bccb220101238fcbafa1343d2ec3ba3922a91a836b8a05813b44672 -size 1787272 diff --git a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py b/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py deleted file mode 100755 index f9005c0e58c46fac8d403b1388237929a9286555..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _optimizer_02ac540_dirty -ops = torch.ops._optimizer_02ac540_dirty - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_optimizer_02ac540_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so b/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so deleted file mode 100755 index fefc5731f88b5a088630f2499c6ee8c3edeb80ba..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:880f65ca04a52278892cdcb40dac073f21552ac16b69903f2b8026894a81e35d -size 1824224 diff --git a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py b/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py deleted file mode 100755 index f9005c0e58c46fac8d403b1388237929a9286555..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _optimizer_02ac540_dirty -ops = torch.ops._optimizer_02ac540_dirty - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_optimizer_02ac540_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so b/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so deleted file mode 100755 index 38ad9c9e1aec43680498e322ae036630fc2b67db..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c478b90b83052c5931cb3d872adad7811663e28bd3447f12ac412f15b1d0ffc5 -size 1824224 diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py b/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py deleted file mode 100755 index f9005c0e58c46fac8d403b1388237929a9286555..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _optimizer_02ac540_dirty -ops = torch.ops._optimizer_02ac540_dirty - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_optimizer_02ac540_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so b/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so deleted file mode 100755 index ce8c1fdc2608e9e505ed9573e93601801a5eac76..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec46d147914be5998dfc62d4b87eb6730be7f012700d49543a318cadab3820db -size 1749744 diff --git a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py b/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py deleted file mode 100755 index f9005c0e58c46fac8d403b1388237929a9286555..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _optimizer_02ac540_dirty -ops = torch.ops._optimizer_02ac540_dirty - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_optimizer_02ac540_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so b/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so deleted file mode 100755 index e6c6aed3d7c164597f9a75bceec3e8185dd0ca52..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e09882858886be06e8ac48d184b320c57624d9c85165ce8b56640b022838e44 -size 1787192 diff --git a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/__init__.py b/build/torch26-cxx98-cu124-x86_64-linux/optimizer/__init__.py deleted file mode 100755 index 239c7a65f8293e7d0df28f05fce645af56d628c0..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .muon import Muon - -__all__ = [ - "Muon", -] diff --git a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py b/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py deleted file mode 100755 index f9005c0e58c46fac8d403b1388237929a9286555..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _optimizer_02ac540_dirty -ops = torch.ops._optimizer_02ac540_dirty - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_optimizer_02ac540_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so b/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so deleted file mode 100755 index 19e3eb12fc2e6b0da7b26245afeccc3f0fcba4d5..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f63b2cd2c67b44f5e54837a0a4f26d94d3e6e8bfa4964bd99fc7e38494e2d52 -size 1824184 diff --git a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py b/build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py deleted file mode 100755 index 99f2033830f653e4037203360ee4e94beff6f732..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py +++ /dev/null @@ -1,494 +0,0 @@ -import math -from dataclasses import dataclass - -import torch -import torch.distributed as dist -from torch.distributed._tensor import DTensor, Replicate - - -# This code snippet is a modified version adapted from the following GitHub repositories: -# https://github.com/KellerJordan/Muon/blob/master/muon.py -@torch.no_grad() -def _zeropower_via_newtonschulz5(G, steps): - """ - Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a - quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose - of minimizing steps, it turns out to be empirically effective to keep increasing the slope at - zero even beyond the point where the iteration no longer converges all the way to one everywhere - on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T - where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model - performance at all relative to UV^T, where USV^T = G is the SVD. - """ - assert len(G.shape) == 2 - a, b, c = (3.4445, -4.7750, 2.0315) - X = G # no manual typecast - if G.size(0) > G.size(1): - X = X.T - # Ensure spectral norm is at most 1 - X = X / (X.norm() + 1e-7) - X = X.bfloat16() - # Perform the NS iterations - for _ in range(steps): - A = X @ X.T - # B = ( - # b * A + c * A @ A - # ) - B = torch.addmm(A, A, A, alpha=c, beta=b) - # X = a * X + B @ X - X = torch.addmm(X, B, X, alpha=1.0, beta=a) - - if G.size(0) > G.size(1): - X = X.T - return X.to(G.dtype) - - -@dataclass -class _muon_state: - # TODO: use Optional - worker_rank: int | None = None - gathered_grad: torch.Tensor | None = None - computed_u: torch.Tensor | None = None - gather_event: torch.cuda.Event | None = None - compute_event: torch.cuda.Event | None = None - - -@torch.no_grad() -def _gather(p, state, rank, comm_stream, none_grad): - g = p.grad - mesh = g.device_mesh - - if rank == state.worker_rank: - gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())] - else: - gather_list = None - - with torch.cuda.stream(comm_stream): - torch.distributed.gather( - g.to_local(), - dst=state.worker_rank, - gather_list=gather_list, - group=mesh.get_group(), - ) - if rank == state.worker_rank: - if state.gathered_grad is not None: - raise RuntimeError( - "Gather event already exists, which should not happen." - ) - state.gathered_grad = torch.cat(gather_list, dim=0) - state.gather_event = torch.cuda.Event() - state.gather_event.record() - else: - state.gathered_grad = None - state.gather_event = None - if none_grad: - p.grad = None - - -@torch.no_grad() -def _compute_u(state, steps, rank, compute_stream): - with torch.cuda.stream(compute_stream): - if rank == state.worker_rank: - if state.gather_event is None: - raise RuntimeError("Gather event must be set before compute.") - compute_stream.wait_event(state.gather_event) - u = _zeropower_via_newtonschulz5(state.gathered_grad, steps) - state.computed_u = u - state.compute_event = torch.cuda.Event() - state.compute_event.record() - # Clear the gathered gradient to free memory - state.gathered_grad = None - else: - state.computed_u = None - state.compute_event = None - - -@torch.no_grad() -def _scatter(p, state, lr, weight_decay, rank, comm_stream): - u = state.computed_u - mesh = p.device_mesh - - with torch.cuda.stream(comm_stream): - if rank == state.worker_rank: - if state.compute_event is None: - raise RuntimeError("Compute event must be set before scatter.") - comm_stream.wait_event(state.compute_event) - scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0)) - else: - scatter_list = None - - u = torch.empty_like(p.to_local()) - torch.distributed.scatter( - u, - scatter_list=scatter_list, - src=state.worker_rank, - group=mesh.get_group(), - ) - if rank == state.worker_rank: - # Clear u to free memory - state.computed_u = None - u = DTensor.from_local( - u, - placements=p.placements, - device_mesh=mesh, - ) - p.data.mul_(1 - lr * weight_decay) - p.data.add_(u, alpha=-lr) - - -def default_is_muon(x, name): - return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name - - -class Muon(torch.optim.Optimizer): - """ - Muon - MomentUm Orthogonalized by Newton-schulz - - Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- - processing step, in which each 2D parameter's update is replaced with the nearest orthogonal - matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has - the advantage that it can be stably run in bfloat16 on the GPU. - - Some warnings: - - We believe this optimizer is unlikely to work well for training with small batch size. - - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - - Arguments: - muon_params: The parameters to be optimized by Muon. - lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default) - momentum: The momentum used by the internal SGD. (0.95 is a good default) - nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) - ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough) - adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are - {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well. - adamw_lr: The learning rate for the internal AdamW. - adamw_betas: The betas for the internal AdamW. - adamw_eps: The epsilon for the internal AdamW. - adamw_weight_decay: The weight decay for the internal AdamW. - """ - - def __init__( - self, - model, - is_muon_func=default_is_muon, - lr=1e-3, - momentum=0.95, - nesterov=True, - ns_steps=5, - weight_decay=0.1, - adamw_betas=(0.9, 0.95), - adamw_eps=1e-8, - none_grad=True, - debug=False, - ): - defaults = dict( - lr=lr, - weight_decay=weight_decay, - momentum=momentum, - nesterov=nesterov, - ns_steps=ns_steps, - adamw_betas=adamw_betas, - adamw_eps=adamw_eps, - none_grad=none_grad, - ) - - super().__init__(model.parameters(), defaults) - self.is_muon_func = is_muon_func - self.model = model - - if dist.is_initialized(): - self.rank = dist.get_rank() - else: - self.rank = None - - self.comm_stream = torch.cuda.Stream() - self.compute_stream = torch.cuda.Stream() - self.debug = debug - - def __setstate__(self, state): - # Sort parameters into those for which we will use Muon, and those for which we will not - super().__setstate__(state) - self._init_state() - - def _init_state(self): - for name, p in self.model.named_parameters(): - if self.is_muon_func(p, name): - # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer - assert p.ndim == 2, p.ndim - self.state[p]["use_muon"] = True - else: - # Do not use Muon for parameters in adamw_params - self.state[p]["use_muon"] = False - - def _calc_flops(self, G, steps): - assert len(G.shape) == 2 - M, N = G.shape - if M > N: - M, N = N, M - - return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3) - - def adjust_lr_for_muon(self, lr, param_shape): - A, B = param_shape[:2] - # We adjust the learning rate and weight decay based on the size of the parameter matrix - # as describted in the paper - adjusted_ratio = 0.2 * math.sqrt(max(A, B)) - adjusted_lr = lr * adjusted_ratio - return adjusted_lr - - def init_state_and_assign_params(self, params, group): - param_to_state = {} - param_to_flops = {} - - total_flops = 0 - for p in params: - g = p.grad - if g is None: - continue - assert g.ndim == 2, "Muon only supports 2D parameters." - - flops = self._calc_flops(g, group["ns_steps"]) - param_to_flops[id(p)] = flops - total_flops += flops - - if self.debug: - print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True) - - ordered_params = sorted( - params, key=lambda p: param_to_flops[id(p)], reverse=True - ) - - round_robin = 0 - mesh = None - for p in ordered_params: - if mesh is None: - mesh = p.device_mesh - if mesh.ndim != 1: - raise NotImplementedError( - "Muon requires a 1D mesh for distributed training yet." - ) - elif mesh != p.device_mesh: - raise ValueError("All parameters must be on the same mesh.") - - param_to_state[id(p)] = _muon_state() - param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item() - - round_robin = (round_robin + 1) % mesh.mesh.numel() - - return param_to_state, ordered_params - - def base(self, params, group, lr, weight_decay, momentum): - # generate weight updates in distributed fashion - for p in params: - g = p.grad - if g is None: - continue - if g.ndim > 2: - g = g.view(g.size(0), -1) - assert g is not None - - # calc update - state = self.state[p] - if "momentum_buffer" not in state: - state["momentum_buffer"] = torch.zeros_like(g) - buf = state["momentum_buffer"] - buf.mul_(momentum).add_(g) - if group["nesterov"]: - g = g.add(buf, alpha=momentum) - else: - g = buf - - u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"]) - - # scale update - adjusted_lr = self.adjust_lr_for_muon(lr, p.shape) - - # apply weight decay - p.data.mul_(1 - lr * weight_decay) - - # apply update - p.data.add_(u, alpha=-adjusted_lr) - - def _update_g(self, p, g, group, momentum): - # calc update - state = self.state[p] - if "momentum_buffer" not in state: - state["momentum_buffer"] = torch.zeros_like(g) - buf = state["momentum_buffer"] - buf.mul_(momentum).add_(g) - if group["nesterov"]: - g = g.add(buf, alpha=momentum) - else: - g = buf - return g - - def _update_p(self, p, u, lr, weight_decay): - # scale update - adjusted_lr = self.adjust_lr_for_muon(lr, p.shape) - # apply weight decay - p.data.mul_(1 - lr * weight_decay) - # apply update - p.data.add_(u, alpha=-adjusted_lr) - - def parallel(self, params, group, lr, weight_decay, momentum): - """ - Perform a parallel optimization step using Muon. - """ - - for p in params: - g = p.grad - if g is None: - continue - if g.ndim > 2: - g = g.view(g.size(0), -1) - - # Update g in the local rank - g = self._update_g( - p, - g, - group, - momentum=momentum, - ) - p.grad = g - - param_to_state, ordered_params = self.init_state_and_assign_params( - params, group - ) - - def enqueue_gathers(start_idx, chunk_size): - for p in ordered_params[start_idx : start_idx + chunk_size]: - state = param_to_state[id(p)] - _gather(p, state, self.rank, self.comm_stream, group["none_grad"]) - - def enqueue_computes(start_idx, chunk_size): - for p in ordered_params[start_idx : start_idx + chunk_size]: - state = param_to_state[id(p)] - _compute_u(state, group["ns_steps"], self.rank, self.compute_stream) - - def enqueue_scatters(start_idx, chunk_size): - for p in ordered_params[start_idx : start_idx + chunk_size]: - state = param_to_state[id(p)] - adjusted_lr = self.adjust_lr_for_muon(lr, p.shape) - _scatter( - p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream - ) - - chunk_size = params[0].device_mesh.mesh.numel() - - # Wait grad update - self.comm_stream.wait_stream(torch.cuda.current_stream()) - - enqueue_gathers(0, chunk_size) - for i in range(0, len(params) + chunk_size - 1, chunk_size): - enqueue_computes(i, chunk_size) - enqueue_gathers(i + chunk_size, chunk_size) - enqueue_scatters(i, chunk_size) - - torch.cuda.current_stream().wait_stream(self.comm_stream) - - def step(self, closure=None): - """Perform a single optimization step. - - Args: - closure (Callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - with torch.enable_grad(): - loss = closure() - - for group in self.param_groups: - ############################ - # Muon # - ############################ - - if "use_muon" not in self.state[group["params"][0]]: - self._init_state() - - params = [p for p in group["params"] if self.state[p]["use_muon"]] - lr = group["lr"] - weight_decay = group["weight_decay"] - momentum = group["momentum"] - - param_dtensors = [] - param_tensors = [] - - for p in params: - if p is None or p.grad is None: - continue - if isinstance(p.data, DTensor): - if all( - isinstance(placement, Replicate) for placement in p.placements - ): - param_tensors.append(p) - else: - param_dtensors.append(p) - elif isinstance(p.data, torch.Tensor): - param_tensors.append(p) - else: - raise TypeError(f"Unsupported parameter type: {type(p.data)}") - - if self.debug: - print( - f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors", - flush=True, - ) - - if len(param_dtensors) > 0: - if not dist.is_initialized(): - raise RuntimeError( - "Parallel Muon requires torch.distributed to be initialized." - ) - - self.parallel( - param_dtensors, - group, - lr=lr, - weight_decay=weight_decay, - momentum=momentum, - ) - - if len(param_tensors) > 0: - self.base( - param_tensors, - group, - lr=lr, - weight_decay=weight_decay, - momentum=momentum, - ) - - ############################ - # AdamW backup # - ############################ - - params = [p for p in group["params"] if not self.state[p]["use_muon"]] - lr = group["lr"] - beta1, beta2 = group["adamw_betas"] - eps = group["adamw_eps"] - weight_decay = group["weight_decay"] - - for p in params: - g = p.grad - if g is None: - continue - state = self.state[p] - if "step" not in state: - state["step"] = 0 - state["moment1"] = torch.zeros_like(g) - state["moment2"] = torch.zeros_like(g) - state["step"] += 1 - step = state["step"] - buf1 = state["moment1"] - buf2 = state["moment2"] - buf1.lerp_(g, 1 - beta1) - buf2.lerp_(g.square(), 1 - beta2) - - g = buf1 / (eps + buf2.sqrt()) - - bias_correction1 = 1 - beta1**step - bias_correction2 = 1 - beta2**step - scale = bias_correction1 / bias_correction2**0.5 - p.data.mul_(1 - lr * weight_decay) - p.data.add_(g, alpha=-lr / scale) - - return loss diff --git a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/__init__.py b/build/torch26-cxx98-cu126-x86_64-linux/optimizer/__init__.py deleted file mode 100755 index 239c7a65f8293e7d0df28f05fce645af56d628c0..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .muon import Muon - -__all__ = [ - "Muon", -] diff --git a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py b/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py deleted file mode 100755 index f9005c0e58c46fac8d403b1388237929a9286555..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py +++ /dev/null @@ -1,9 +0,0 @@ -import torch -from . import _optimizer_02ac540_dirty -ops = torch.ops._optimizer_02ac540_dirty - -def add_op_namespace_prefix(op_name: str): - """ - Prefix op by namespace. - """ - return f"_optimizer_02ac540_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so b/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so deleted file mode 100755 index cd702fe7a98dcee7e7fd75ff790ce0b10713653d..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:48795cb66a740b14266d757ac70a6b43fb11df6662970bb4040650d237e6cbc5 -size 1824184 diff --git a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py b/build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py deleted file mode 100755 index 99f2033830f653e4037203360ee4e94beff6f732..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py +++ /dev/null @@ -1,494 +0,0 @@ -import math -from dataclasses import dataclass - -import torch -import torch.distributed as dist -from torch.distributed._tensor import DTensor, Replicate - - -# This code snippet is a modified version adapted from the following GitHub repositories: -# https://github.com/KellerJordan/Muon/blob/master/muon.py -@torch.no_grad() -def _zeropower_via_newtonschulz5(G, steps): - """ - Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a - quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose - of minimizing steps, it turns out to be empirically effective to keep increasing the slope at - zero even beyond the point where the iteration no longer converges all the way to one everywhere - on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T - where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model - performance at all relative to UV^T, where USV^T = G is the SVD. - """ - assert len(G.shape) == 2 - a, b, c = (3.4445, -4.7750, 2.0315) - X = G # no manual typecast - if G.size(0) > G.size(1): - X = X.T - # Ensure spectral norm is at most 1 - X = X / (X.norm() + 1e-7) - X = X.bfloat16() - # Perform the NS iterations - for _ in range(steps): - A = X @ X.T - # B = ( - # b * A + c * A @ A - # ) - B = torch.addmm(A, A, A, alpha=c, beta=b) - # X = a * X + B @ X - X = torch.addmm(X, B, X, alpha=1.0, beta=a) - - if G.size(0) > G.size(1): - X = X.T - return X.to(G.dtype) - - -@dataclass -class _muon_state: - # TODO: use Optional - worker_rank: int | None = None - gathered_grad: torch.Tensor | None = None - computed_u: torch.Tensor | None = None - gather_event: torch.cuda.Event | None = None - compute_event: torch.cuda.Event | None = None - - -@torch.no_grad() -def _gather(p, state, rank, comm_stream, none_grad): - g = p.grad - mesh = g.device_mesh - - if rank == state.worker_rank: - gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())] - else: - gather_list = None - - with torch.cuda.stream(comm_stream): - torch.distributed.gather( - g.to_local(), - dst=state.worker_rank, - gather_list=gather_list, - group=mesh.get_group(), - ) - if rank == state.worker_rank: - if state.gathered_grad is not None: - raise RuntimeError( - "Gather event already exists, which should not happen." - ) - state.gathered_grad = torch.cat(gather_list, dim=0) - state.gather_event = torch.cuda.Event() - state.gather_event.record() - else: - state.gathered_grad = None - state.gather_event = None - if none_grad: - p.grad = None - - -@torch.no_grad() -def _compute_u(state, steps, rank, compute_stream): - with torch.cuda.stream(compute_stream): - if rank == state.worker_rank: - if state.gather_event is None: - raise RuntimeError("Gather event must be set before compute.") - compute_stream.wait_event(state.gather_event) - u = _zeropower_via_newtonschulz5(state.gathered_grad, steps) - state.computed_u = u - state.compute_event = torch.cuda.Event() - state.compute_event.record() - # Clear the gathered gradient to free memory - state.gathered_grad = None - else: - state.computed_u = None - state.compute_event = None - - -@torch.no_grad() -def _scatter(p, state, lr, weight_decay, rank, comm_stream): - u = state.computed_u - mesh = p.device_mesh - - with torch.cuda.stream(comm_stream): - if rank == state.worker_rank: - if state.compute_event is None: - raise RuntimeError("Compute event must be set before scatter.") - comm_stream.wait_event(state.compute_event) - scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0)) - else: - scatter_list = None - - u = torch.empty_like(p.to_local()) - torch.distributed.scatter( - u, - scatter_list=scatter_list, - src=state.worker_rank, - group=mesh.get_group(), - ) - if rank == state.worker_rank: - # Clear u to free memory - state.computed_u = None - u = DTensor.from_local( - u, - placements=p.placements, - device_mesh=mesh, - ) - p.data.mul_(1 - lr * weight_decay) - p.data.add_(u, alpha=-lr) - - -def default_is_muon(x, name): - return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name - - -class Muon(torch.optim.Optimizer): - """ - Muon - MomentUm Orthogonalized by Newton-schulz - - Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- - processing step, in which each 2D parameter's update is replaced with the nearest orthogonal - matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has - the advantage that it can be stably run in bfloat16 on the GPU. - - Some warnings: - - We believe this optimizer is unlikely to work well for training with small batch size. - - We believe it may not work well for finetuning pretrained models, but we haven't tested this. - - Arguments: - muon_params: The parameters to be optimized by Muon. - lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default) - momentum: The momentum used by the internal SGD. (0.95 is a good default) - nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) - ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough) - adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are - {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well. - adamw_lr: The learning rate for the internal AdamW. - adamw_betas: The betas for the internal AdamW. - adamw_eps: The epsilon for the internal AdamW. - adamw_weight_decay: The weight decay for the internal AdamW. - """ - - def __init__( - self, - model, - is_muon_func=default_is_muon, - lr=1e-3, - momentum=0.95, - nesterov=True, - ns_steps=5, - weight_decay=0.1, - adamw_betas=(0.9, 0.95), - adamw_eps=1e-8, - none_grad=True, - debug=False, - ): - defaults = dict( - lr=lr, - weight_decay=weight_decay, - momentum=momentum, - nesterov=nesterov, - ns_steps=ns_steps, - adamw_betas=adamw_betas, - adamw_eps=adamw_eps, - none_grad=none_grad, - ) - - super().__init__(model.parameters(), defaults) - self.is_muon_func = is_muon_func - self.model = model - - if dist.is_initialized(): - self.rank = dist.get_rank() - else: - self.rank = None - - self.comm_stream = torch.cuda.Stream() - self.compute_stream = torch.cuda.Stream() - self.debug = debug - - def __setstate__(self, state): - # Sort parameters into those for which we will use Muon, and those for which we will not - super().__setstate__(state) - self._init_state() - - def _init_state(self): - for name, p in self.model.named_parameters(): - if self.is_muon_func(p, name): - # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer - assert p.ndim == 2, p.ndim - self.state[p]["use_muon"] = True - else: - # Do not use Muon for parameters in adamw_params - self.state[p]["use_muon"] = False - - def _calc_flops(self, G, steps): - assert len(G.shape) == 2 - M, N = G.shape - if M > N: - M, N = N, M - - return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3) - - def adjust_lr_for_muon(self, lr, param_shape): - A, B = param_shape[:2] - # We adjust the learning rate and weight decay based on the size of the parameter matrix - # as describted in the paper - adjusted_ratio = 0.2 * math.sqrt(max(A, B)) - adjusted_lr = lr * adjusted_ratio - return adjusted_lr - - def init_state_and_assign_params(self, params, group): - param_to_state = {} - param_to_flops = {} - - total_flops = 0 - for p in params: - g = p.grad - if g is None: - continue - assert g.ndim == 2, "Muon only supports 2D parameters." - - flops = self._calc_flops(g, group["ns_steps"]) - param_to_flops[id(p)] = flops - total_flops += flops - - if self.debug: - print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True) - - ordered_params = sorted( - params, key=lambda p: param_to_flops[id(p)], reverse=True - ) - - round_robin = 0 - mesh = None - for p in ordered_params: - if mesh is None: - mesh = p.device_mesh - if mesh.ndim != 1: - raise NotImplementedError( - "Muon requires a 1D mesh for distributed training yet." - ) - elif mesh != p.device_mesh: - raise ValueError("All parameters must be on the same mesh.") - - param_to_state[id(p)] = _muon_state() - param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item() - - round_robin = (round_robin + 1) % mesh.mesh.numel() - - return param_to_state, ordered_params - - def base(self, params, group, lr, weight_decay, momentum): - # generate weight updates in distributed fashion - for p in params: - g = p.grad - if g is None: - continue - if g.ndim > 2: - g = g.view(g.size(0), -1) - assert g is not None - - # calc update - state = self.state[p] - if "momentum_buffer" not in state: - state["momentum_buffer"] = torch.zeros_like(g) - buf = state["momentum_buffer"] - buf.mul_(momentum).add_(g) - if group["nesterov"]: - g = g.add(buf, alpha=momentum) - else: - g = buf - - u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"]) - - # scale update - adjusted_lr = self.adjust_lr_for_muon(lr, p.shape) - - # apply weight decay - p.data.mul_(1 - lr * weight_decay) - - # apply update - p.data.add_(u, alpha=-adjusted_lr) - - def _update_g(self, p, g, group, momentum): - # calc update - state = self.state[p] - if "momentum_buffer" not in state: - state["momentum_buffer"] = torch.zeros_like(g) - buf = state["momentum_buffer"] - buf.mul_(momentum).add_(g) - if group["nesterov"]: - g = g.add(buf, alpha=momentum) - else: - g = buf - return g - - def _update_p(self, p, u, lr, weight_decay): - # scale update - adjusted_lr = self.adjust_lr_for_muon(lr, p.shape) - # apply weight decay - p.data.mul_(1 - lr * weight_decay) - # apply update - p.data.add_(u, alpha=-adjusted_lr) - - def parallel(self, params, group, lr, weight_decay, momentum): - """ - Perform a parallel optimization step using Muon. - """ - - for p in params: - g = p.grad - if g is None: - continue - if g.ndim > 2: - g = g.view(g.size(0), -1) - - # Update g in the local rank - g = self._update_g( - p, - g, - group, - momentum=momentum, - ) - p.grad = g - - param_to_state, ordered_params = self.init_state_and_assign_params( - params, group - ) - - def enqueue_gathers(start_idx, chunk_size): - for p in ordered_params[start_idx : start_idx + chunk_size]: - state = param_to_state[id(p)] - _gather(p, state, self.rank, self.comm_stream, group["none_grad"]) - - def enqueue_computes(start_idx, chunk_size): - for p in ordered_params[start_idx : start_idx + chunk_size]: - state = param_to_state[id(p)] - _compute_u(state, group["ns_steps"], self.rank, self.compute_stream) - - def enqueue_scatters(start_idx, chunk_size): - for p in ordered_params[start_idx : start_idx + chunk_size]: - state = param_to_state[id(p)] - adjusted_lr = self.adjust_lr_for_muon(lr, p.shape) - _scatter( - p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream - ) - - chunk_size = params[0].device_mesh.mesh.numel() - - # Wait grad update - self.comm_stream.wait_stream(torch.cuda.current_stream()) - - enqueue_gathers(0, chunk_size) - for i in range(0, len(params) + chunk_size - 1, chunk_size): - enqueue_computes(i, chunk_size) - enqueue_gathers(i + chunk_size, chunk_size) - enqueue_scatters(i, chunk_size) - - torch.cuda.current_stream().wait_stream(self.comm_stream) - - def step(self, closure=None): - """Perform a single optimization step. - - Args: - closure (Callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - with torch.enable_grad(): - loss = closure() - - for group in self.param_groups: - ############################ - # Muon # - ############################ - - if "use_muon" not in self.state[group["params"][0]]: - self._init_state() - - params = [p for p in group["params"] if self.state[p]["use_muon"]] - lr = group["lr"] - weight_decay = group["weight_decay"] - momentum = group["momentum"] - - param_dtensors = [] - param_tensors = [] - - for p in params: - if p is None or p.grad is None: - continue - if isinstance(p.data, DTensor): - if all( - isinstance(placement, Replicate) for placement in p.placements - ): - param_tensors.append(p) - else: - param_dtensors.append(p) - elif isinstance(p.data, torch.Tensor): - param_tensors.append(p) - else: - raise TypeError(f"Unsupported parameter type: {type(p.data)}") - - if self.debug: - print( - f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors", - flush=True, - ) - - if len(param_dtensors) > 0: - if not dist.is_initialized(): - raise RuntimeError( - "Parallel Muon requires torch.distributed to be initialized." - ) - - self.parallel( - param_dtensors, - group, - lr=lr, - weight_decay=weight_decay, - momentum=momentum, - ) - - if len(param_tensors) > 0: - self.base( - param_tensors, - group, - lr=lr, - weight_decay=weight_decay, - momentum=momentum, - ) - - ############################ - # AdamW backup # - ############################ - - params = [p for p in group["params"] if not self.state[p]["use_muon"]] - lr = group["lr"] - beta1, beta2 = group["adamw_betas"] - eps = group["adamw_eps"] - weight_decay = group["weight_decay"] - - for p in params: - g = p.grad - if g is None: - continue - state = self.state[p] - if "step" not in state: - state["step"] = 0 - state["moment1"] = torch.zeros_like(g) - state["moment2"] = torch.zeros_like(g) - state["step"] += 1 - step = state["step"] - buf1 = state["moment1"] - buf2 = state["moment2"] - buf1.lerp_(g, 1 - beta1) - buf2.lerp_(g.square(), 1 - beta2) - - g = buf1 / (eps + buf2.sqrt()) - - bias_correction1 = 1 - beta1**step - bias_correction2 = 1 - beta2**step - scale = bias_correction1 / bias_correction2**0.5 - p.data.mul_(1 - lr * weight_decay) - p.data.add_(g, alpha=-lr / scale) - - return loss diff --git a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/__init__.py b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/__init__.py old mode 100755 new mode 100644 diff --git a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2855bd5a62f62977bdb1bb9cdb4e2e3c29ab869b Binary files /dev/null and b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10f037038f8acd999ffcdc2b527a3ce2c7c7ecba Binary files /dev/null and b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py old mode 100755 new mode 100644 index f9005c0e58c46fac8d403b1388237929a9286555..dab288ec61063a45be53f30f54dca0cbbabf7809 --- a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +++ b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _optimizer_02ac540_dirty -ops = torch.ops._optimizer_02ac540_dirty +from . import _optimizer_1f13dae_dirty +ops = torch.ops._optimizer_1f13dae_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_optimizer_02ac540_dirty::{op_name}" \ No newline at end of file + return f"_optimizer_1f13dae_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so deleted file mode 100755 index ba3ebbf6afbc9422148c7bd0691e40ea66eaa29f..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec1f34fd4ead50eb51db63f51afc0751d6bf0c64a46c44c713ab245f150979cc -size 1787368 diff --git a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..ef89d559d485d0158b70da4c0e45733c7262d45d --- /dev/null +++ b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dc5f8a57aa60483209dfcbb0c7cc0e54f1739d643145c1e685fbe2b6675ac43 +size 1787368 diff --git a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py old mode 100755 new mode 100644 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/__init__.py b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/__init__.py old mode 100755 new mode 100644 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..685f0542fb68654756ffe89daccad0cbebcb1a83 Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d7f7725a60d883700e903eaedb7201e196da72e Binary files /dev/null and b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py old mode 100755 new mode 100644 index f9005c0e58c46fac8d403b1388237929a9286555..dab288ec61063a45be53f30f54dca0cbbabf7809 --- a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +++ b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _optimizer_02ac540_dirty -ops = torch.ops._optimizer_02ac540_dirty +from . import _optimizer_1f13dae_dirty +ops = torch.ops._optimizer_1f13dae_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_optimizer_02ac540_dirty::{op_name}" \ No newline at end of file + return f"_optimizer_1f13dae_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so deleted file mode 100755 index c7d17bedd35f7d5ea9173366b612f9060a8023bb..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bdb8ab38f72351ae88307560aca5e1af7b2dcb63a39627dbd4c806cad3f83442 -size 1824256 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..c7eb82a107660abd6fde04c59f5d227a59fa3008 --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96c7e281f9634e3b252f720f4fea4f61490f2f1a1ef1280a3e259decb41c846f +size 1824256 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py old mode 100755 new mode 100644 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/__init__.py b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/__init__.py old mode 100755 new mode 100644 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3fdf49d66842a9f88ada1b24acd7d0435870ca29 Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1c09186a1f84a2900205a1374a40eefd5181765 Binary files /dev/null and b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ diff --git a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py old mode 100755 new mode 100644 index f9005c0e58c46fac8d403b1388237929a9286555..dab288ec61063a45be53f30f54dca0cbbabf7809 --- a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +++ b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _optimizer_02ac540_dirty -ops = torch.ops._optimizer_02ac540_dirty +from . import _optimizer_1f13dae_dirty +ops = torch.ops._optimizer_1f13dae_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_optimizer_02ac540_dirty::{op_name}" \ No newline at end of file + return f"_optimizer_1f13dae_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so deleted file mode 100755 index 47c495490ce16530c593d7d6df0260c05eaeb3d0..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0652d611e00b1bcbece47da13dffb28396ae0831dc4be43c7ae9be27ad9a10fe -size 1883352 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..bfa7b31c2528c22eac5cf22e361f8969c9ac8a25 --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:046a45fae81c2b7d79ff2237a1d26277f4883ef8a8b87a3980bf06d1182711b1 +size 1883352 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py old mode 100755 new mode 100644 diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__init__.py b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__init__.py old mode 100755 new mode 100644 diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-312.pyc b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 615aa907dab7fa111b7d1d4d537c6db8be900ebf..0000000000000000000000000000000000000000 Binary files a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..345004d8e5a62d0cd4b1c440673393a2fef15c47 Binary files /dev/null and b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-312.pyc b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-312.pyc deleted file mode 100644 index 26c9a5594694747ba5588fa310780336ec698ad0..0000000000000000000000000000000000000000 Binary files a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-312.pyc and /dev/null differ diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f10dd600f1049494fbad23db073a01e7c383b6f1 Binary files /dev/null and b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py old mode 100755 new mode 100644 index f9005c0e58c46fac8d403b1388237929a9286555..dab288ec61063a45be53f30f54dca0cbbabf7809 --- a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +++ b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _optimizer_02ac540_dirty -ops = torch.ops._optimizer_02ac540_dirty +from . import _optimizer_1f13dae_dirty +ops = torch.ops._optimizer_1f13dae_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_optimizer_02ac540_dirty::{op_name}" \ No newline at end of file + return f"_optimizer_1f13dae_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so deleted file mode 100755 index c7d8c06273cdfc4d10ef2ca3a0e13a097551608a..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a96bfd1f461d7cd029dd39d142d2999dcc86dd7f56fb40f045e00f3fb2c400bd -size 1749648 diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..2955b9ec704c5373b8b3c12cf8767de8b8604d9d --- /dev/null +++ b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d9ee2420e8528032369c476152a1960d123034a83e2c43f38a7fb2d1423aa23 +size 1749840 diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py old mode 100755 new mode 100644 diff --git a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/__init__.py b/build/torch28-cxx11-cu126-x86_64-linux/optimizer/__init__.py old mode 100755 new mode 100644 similarity index 100% rename from build/torch26-cxx11-cu118-x86_64-linux/optimizer/__init__.py rename to build/torch28-cxx11-cu126-x86_64-linux/optimizer/__init__.py diff --git a/build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77d242f9dd245935769c063d820ce8769db537d4 Binary files /dev/null and b/build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc b/build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b96c7727abedfaae9311d62f3b97521916d4fa1e Binary files /dev/null and b/build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py b/build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..dab288ec61063a45be53f30f54dca0cbbabf7809 --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _optimizer_1f13dae_dirty +ops = torch.ops._optimizer_1f13dae_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_optimizer_1f13dae_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so b/build/torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..f649563a12f8001df0b263a1fb2c582af07f74cc --- /dev/null +++ b/build/torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a082b5629efc4e9b8ce608713665d47904949b5d220dad350049bc806d58ecd7 +size 1824256 diff --git a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py b/build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py old mode 100755 new mode 100644 similarity index 100% rename from build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py rename to build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py diff --git a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/__init__.py b/build/torch28-cxx11-cu128-x86_64-linux/optimizer/__init__.py old mode 100755 new mode 100644 similarity index 100% rename from build/torch26-cxx11-cu124-x86_64-linux/optimizer/__init__.py rename to build/torch28-cxx11-cu128-x86_64-linux/optimizer/__init__.py diff --git a/build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f3dbb2dceeb98ac4daba4bcf67bd07c46e6efcc Binary files /dev/null and b/build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc b/build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1291428ec8416d5d28f6000e64d5ae07b33cd3c8 Binary files /dev/null and b/build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py b/build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..dab288ec61063a45be53f30f54dca0cbbabf7809 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _optimizer_1f13dae_dirty +ops = torch.ops._optimizer_1f13dae_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_optimizer_1f13dae_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so b/build/torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..f37d6abe851c426203b919a44364a0084792c416 --- /dev/null +++ b/build/torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d2e65e315cd82d0b6fc2043ff37ee2d1223d6bd293ef552d658db5bf4de0a45 +size 1883352 diff --git a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py b/build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py old mode 100755 new mode 100644 similarity index 100% rename from build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py rename to build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py diff --git a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/__init__.py b/build/torch28-cxx11-cu129-x86_64-linux/optimizer/__init__.py old mode 100755 new mode 100644 similarity index 100% rename from build/torch26-cxx11-cu126-x86_64-linux/optimizer/__init__.py rename to build/torch28-cxx11-cu129-x86_64-linux/optimizer/__init__.py diff --git a/build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b028f179166395d7869180975731ee6cfb7d40ae Binary files /dev/null and b/build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc b/build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e9ca9400c81327050a45605d1aa9f2ecf8d7bc1 Binary files /dev/null and b/build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ diff --git a/build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py b/build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..dab288ec61063a45be53f30f54dca0cbbabf7809 --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _optimizer_1f13dae_dirty +ops = torch.ops._optimizer_1f13dae_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_optimizer_1f13dae_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so b/build/torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..92a0b79054f448bef49680afb362164885d7b16a --- /dev/null +++ b/build/torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89fea7bfad71c806bc10bf2dc6aa66a6e154c09fc418498b1cab7f48a83432d4 +size 1883352 diff --git a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py b/build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py old mode 100755 new mode 100644 similarity index 100% rename from build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py rename to build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/__init__.py b/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__init__.py old mode 100755 new mode 100644 similarity index 100% rename from build/torch26-cxx11-rocm62-x86_64-linux/optimizer/__init__.py rename to build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__init__.py diff --git a/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe2583a518cc661d7686314582249f7b7318eb7b Binary files /dev/null and b/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc b/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8bc326973aac2393ee2551e11203197dd8700dba Binary files /dev/null and b/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ diff --git a/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py b/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..dab288ec61063a45be53f30f54dca0cbbabf7809 --- /dev/null +++ b/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _optimizer_1f13dae_dirty +ops = torch.ops._optimizer_1f13dae_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_optimizer_1f13dae_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so b/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..bcc7683c495c6e4c1e05d82260c232bad014ac98 --- /dev/null +++ b/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0805952950efdbe79c378ca84ae62b77d2d11cd2ba680c8ffccfd79301489ac5 +size 1750000 diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py b/build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py old mode 100755 new mode 100644 similarity index 100% rename from build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py rename to build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py diff --git a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/__init__.py b/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__init__.py old mode 100755 new mode 100644 similarity index 100% rename from build/torch26-cxx98-cu118-x86_64-linux/optimizer/__init__.py rename to build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__init__.py diff --git a/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc b/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c889bf8a61bb35b5c8dfb016d957d3ec40dd3e71 Binary files /dev/null and b/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc differ diff --git a/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc b/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff2a202bacba17f7d85c2ccd7d1dbefe8ed947b6 Binary files /dev/null and b/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc differ diff --git a/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py b/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..dab288ec61063a45be53f30f54dca0cbbabf7809 --- /dev/null +++ b/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _optimizer_1f13dae_dirty +ops = torch.ops._optimizer_1f13dae_dirty + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_optimizer_1f13dae_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so b/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..d7bf6740427ebe5a271c5a6ed94302b2405753c7 --- /dev/null +++ b/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af91f4eec9fc14d66f3db4e120d4913a0e62102c76b9b8cd9c25d8af427be290 +size 1750088 diff --git a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py b/build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py old mode 100755 new mode 100644 similarity index 100% rename from build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py rename to build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py diff --git a/flake.lock b/flake.lock deleted file mode 100644 index 368754a84e467fe6ba68962628649fc9ab6121cc..0000000000000000000000000000000000000000 --- a/flake.lock +++ /dev/null @@ -1,167 +0,0 @@ -{ - "nodes": { - "flake-compat": { - "locked": { - "lastModified": 1747046372, - "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=", - "owner": "edolstra", - "repo": "flake-compat", - "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885", - "type": "github" - }, - "original": { - "owner": "edolstra", - "repo": "flake-compat", - "type": "github" - } - }, - "flake-compat_2": { - "locked": { - "lastModified": 1733328505, - "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=", - "owner": "edolstra", - "repo": "flake-compat", - "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec", - "type": "github" - }, - "original": { - "owner": "edolstra", - "repo": "flake-compat", - "type": "github" - } - }, - "flake-utils": { - "inputs": { - "systems": "systems" - }, - "locked": { - "lastModified": 1731533236, - "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", - "owner": "numtide", - "repo": "flake-utils", - "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", - "type": "github" - }, - "original": { - "owner": "numtide", - "repo": "flake-utils", - "type": "github" - } - }, - "flake-utils_2": { - "inputs": { - "systems": "systems_2" - }, - "locked": { - "lastModified": 1731533236, - "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", - "owner": "numtide", - "repo": "flake-utils", - "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", - "type": "github" - }, - "original": { - "owner": "numtide", - "repo": "flake-utils", - "type": "github" - } - }, - "hf-nix": { - "inputs": { - "flake-compat": "flake-compat_2", - "flake-utils": "flake-utils_2", - "nixpkgs": "nixpkgs" - }, - "locked": { - "lastModified": 1748598786, - "owner": "huggingface", - "repo": "hf-nix", - "rev": "6ca679441494139fde1f2355691ddb5dc8170269", - "type": "github" - }, - "original": { - "owner": "huggingface", - "repo": "hf-nix", - "type": "github" - } - }, - "kernel-builder": { - "inputs": { - "flake-compat": "flake-compat", - "flake-utils": "flake-utils", - "hf-nix": "hf-nix", - "nixpkgs": [ - "kernel-builder", - "hf-nix", - "nixpkgs" - ] - }, - "locked": { - "lastModified": 1749822059, - "narHash": "sha256-zype8KSqESZUIQpsY6sbf4f9pPxM/Zwem+KuH5LeHFk=", - "owner": "huggingface", - "repo": "kernel-builder", - "rev": "96abd968baa5fa16217413050fa7372d5db3baa5", - "type": "github" - }, - "original": { - "owner": "huggingface", - "repo": "kernel-builder", - "type": "github" - } - }, - "nixpkgs": { - "locked": { - "lastModified": 1747820358, - "narHash": "sha256-fTqsZsUX6M3yeEvgyQvXcbGmT2CaRVyVwsi8eK29Oj4=", - "owner": "danieldk", - "repo": "nixpkgs", - "rev": "d3c1681180717528068082103bf323147de6ab0b", - "type": "github" - }, - "original": { - "owner": "danieldk", - "ref": "cudatoolkit-12.9-kernel-builder", - "repo": "nixpkgs", - "type": "github" - } - }, - "root": { - "inputs": { - "kernel-builder": "kernel-builder" - } - }, - "systems": { - "locked": { - "lastModified": 1681028828, - "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", - "owner": "nix-systems", - "repo": "default", - "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", - "type": "github" - }, - "original": { - "owner": "nix-systems", - "repo": "default", - "type": "github" - } - }, - "systems_2": { - "locked": { - "lastModified": 1681028828, - "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", - "owner": "nix-systems", - "repo": "default", - "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", - "type": "github" - }, - "original": { - "owner": "nix-systems", - "repo": "default", - "type": "github" - } - } - }, - "root": "root", - "version": 7 -}