This view is limited to 50 files because it contains too many changes. See the raw diff here.
Files changed (50)
  1. build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py +0 -9
  2. build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +0 -3
  3. build/torch26-cxx98-cu124-x86_64-linux/optimizer/__init__.py +0 -5
  4. build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py +0 -9
  5. build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +0 -3
  6. build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py +0 -494
  7. build/torch26-cxx98-cu126-x86_64-linux/optimizer/__init__.py +0 -5
  8. build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py +0 -9
  9. build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +0 -3
  10. build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py +0 -494
  11. build/torch27-cxx11-cu118-x86_64-linux/optimizer/__init__.py +0 -0
  12. build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
  13. build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
  14. build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  15. build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_02ac540_dirty.abi3.so → _optimizer_1f13dae_dirty.abi3.so} +1 -1
  16. build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +0 -0
  17. build/torch27-cxx11-cu126-x86_64-linux/optimizer/__init__.py +0 -0
  18. build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
  19. build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
  20. build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  21. build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_02ac540_dirty.abi3.so → _optimizer_1f13dae_dirty.abi3.so} +1 -1
  22. build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +0 -0
  23. build/torch27-cxx11-cu128-x86_64-linux/optimizer/__init__.py +0 -0
  24. build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
  25. build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
  26. build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  27. build/torch27-cxx11-cu128-x86_64-linux/optimizer/{_optimizer_02ac540_dirty.abi3.so → _optimizer_1f13dae_dirty.abi3.so} +1 -1
  28. build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py +0 -0
  29. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__init__.py +0 -0
  30. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-312.pyc +0 -0
  31. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
  32. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-312.pyc +0 -0
  33. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
  34. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
  35. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so +0 -3
  36. build/{torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so → torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so} +2 -2
  37. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py +0 -0
  38. build/{torch26-cxx11-cu118-x86_64-linux → torch28-cxx11-cu126-x86_64-linux}/optimizer/__init__.py +0 -0
  39. build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
  40. build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
  41. build/{torch26-cxx11-cu118-x86_64-linux → torch28-cxx11-cu126-x86_64-linux}/optimizer/_ops.py +3 -3
  42. build/{torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so} +2 -2
  43. build/{torch26-cxx11-cu118-x86_64-linux → torch28-cxx11-cu126-x86_64-linux}/optimizer/muon.py +0 -0
  44. build/{torch26-cxx11-cu124-x86_64-linux → torch28-cxx11-cu128-x86_64-linux}/optimizer/__init__.py +0 -0
  45. build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
  46. build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
  47. build/{torch26-cxx11-cu126-x86_64-linux → torch28-cxx11-cu128-x86_64-linux}/optimizer/_ops.py +3 -3
  48. build/{torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so} +2 -2
  49. build/{torch26-cxx11-cu124-x86_64-linux → torch28-cxx11-cu128-x86_64-linux}/optimizer/muon.py +0 -0
  50. build/{torch26-cxx11-cu126-x86_64-linux → torch28-cxx11-cu129-x86_64-linux}/optimizer/__init__.py +0 -0
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py DELETED
@@ -1,9 +0,0 @@
- import torch
- from . import _optimizer_02ac540_dirty
- ops = torch.ops._optimizer_02ac540_dirty
-
- def add_op_namespace_prefix(op_name: str):
-     """
-     Prefix op by namespace.
-     """
-     return f"_optimizer_02ac540_dirty::{op_name}"
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9e09882858886be06e8ac48d184b320c57624d9c85165ce8b56640b022838e44
- size 1787192
build/torch26-cxx98-cu124-x86_64-linux/optimizer/__init__.py DELETED
@@ -1,5 +0,0 @@
- from .muon import Muon
-
- __all__ = [
-     "Muon",
- ]
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py DELETED
@@ -1,9 +0,0 @@
- import torch
- from . import _optimizer_02ac540_dirty
- ops = torch.ops._optimizer_02ac540_dirty
-
- def add_op_namespace_prefix(op_name: str):
-     """
-     Prefix op by namespace.
-     """
-     return f"_optimizer_02ac540_dirty::{op_name}"
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:6f63b2cd2c67b44f5e54837a0a4f26d94d3e6e8bfa4964bd99fc7e38494e2d52
- size 1824184
build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py DELETED
@@ -1,494 +0,0 @@
- import math
- from dataclasses import dataclass
-
- import torch
- import torch.distributed as dist
- from torch.distributed._tensor import DTensor, Replicate
-
-
- # This code snippet is a modified version adapted from the following GitHub repositories:
- # https://github.com/KellerJordan/Muon/blob/master/muon.py
- @torch.no_grad()
- def _zeropower_via_newtonschulz5(G, steps):
-     """
-     Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
-     quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
-     of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
-     zero even beyond the point where the iteration no longer converges all the way to one everywhere
-     on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
-     where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
-     performance at all relative to UV^T, where USV^T = G is the SVD.
-     """
-     assert len(G.shape) == 2
-     a, b, c = (3.4445, -4.7750, 2.0315)
-     X = G # no manual typecast
-     if G.size(0) > G.size(1):
-         X = X.T
-     # Ensure spectral norm is at most 1
-     X = X / (X.norm() + 1e-7)
-     X = X.bfloat16()
-     # Perform the NS iterations
-     for _ in range(steps):
-         A = X @ X.T
-         # B = (
-         #     b * A + c * A @ A
-         # )
-         B = torch.addmm(A, A, A, alpha=c, beta=b)
-         # X = a * X + B @ X
-         X = torch.addmm(X, B, X, alpha=1.0, beta=a)
-
-     if G.size(0) > G.size(1):
-         X = X.T
-     return X.to(G.dtype)
-
-
- @dataclass
- class _muon_state:
-     # TODO: use Optional
-     worker_rank: int | None = None
-     gathered_grad: torch.Tensor | None = None
-     computed_u: torch.Tensor | None = None
-     gather_event: torch.cuda.Event | None = None
-     compute_event: torch.cuda.Event | None = None
-
-
- @torch.no_grad()
- def _gather(p, state, rank, comm_stream, none_grad):
-     g = p.grad
-     mesh = g.device_mesh
-
-     if rank == state.worker_rank:
-         gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())]
-     else:
-         gather_list = None
-
-     with torch.cuda.stream(comm_stream):
-         torch.distributed.gather(
-             g.to_local(),
-             dst=state.worker_rank,
-             gather_list=gather_list,
-             group=mesh.get_group(),
-         )
-         if rank == state.worker_rank:
-             if state.gathered_grad is not None:
-                 raise RuntimeError(
-                     "Gather event already exists, which should not happen."
-                 )
-             state.gathered_grad = torch.cat(gather_list, dim=0)
-             state.gather_event = torch.cuda.Event()
-             state.gather_event.record()
-         else:
-             state.gathered_grad = None
-             state.gather_event = None
-         if none_grad:
-             p.grad = None
-
-
- @torch.no_grad()
- def _compute_u(state, steps, rank, compute_stream):
-     with torch.cuda.stream(compute_stream):
-         if rank == state.worker_rank:
-             if state.gather_event is None:
-                 raise RuntimeError("Gather event must be set before compute.")
-             compute_stream.wait_event(state.gather_event)
-             u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
-             state.computed_u = u
-             state.compute_event = torch.cuda.Event()
-             state.compute_event.record()
-             # Clear the gathered gradient to free memory
-             state.gathered_grad = None
-         else:
-             state.computed_u = None
-             state.compute_event = None
-
-
- @torch.no_grad()
- def _scatter(p, state, lr, weight_decay, rank, comm_stream):
-     u = state.computed_u
-     mesh = p.device_mesh
-
-     with torch.cuda.stream(comm_stream):
-         if rank == state.worker_rank:
-             if state.compute_event is None:
-                 raise RuntimeError("Compute event must be set before scatter.")
-             comm_stream.wait_event(state.compute_event)
-             scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
-         else:
-             scatter_list = None
-
-         u = torch.empty_like(p.to_local())
-         torch.distributed.scatter(
-             u,
-             scatter_list=scatter_list,
-             src=state.worker_rank,
-             group=mesh.get_group(),
-         )
-         if rank == state.worker_rank:
-             # Clear u to free memory
-             state.computed_u = None
-         u = DTensor.from_local(
-             u,
-             placements=p.placements,
-             device_mesh=mesh,
-         )
-         p.data.mul_(1 - lr * weight_decay)
-         p.data.add_(u, alpha=-lr)
-
-
- def default_is_muon(x, name):
-     return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name
-
-
- class Muon(torch.optim.Optimizer):
-     """
-     Muon - MomentUm Orthogonalized by Newton-schulz
-
-     Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
-     processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
-     matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
-     the advantage that it can be stably run in bfloat16 on the GPU.
-
-     Some warnings:
-     - We believe this optimizer is unlikely to work well for training with small batch size.
-     - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
-
-     Arguments:
-         muon_params: The parameters to be optimized by Muon.
-         lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
-         momentum: The momentum used by the internal SGD. (0.95 is a good default)
-         nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
-         ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
-         adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are
-             {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
-         adamw_lr: The learning rate for the internal AdamW.
-         adamw_betas: The betas for the internal AdamW.
-         adamw_eps: The epsilon for the internal AdamW.
-         adamw_weight_decay: The weight decay for the internal AdamW.
-     """
-
-     def __init__(
-         self,
-         model,
-         is_muon_func=default_is_muon,
-         lr=1e-3,
-         momentum=0.95,
-         nesterov=True,
-         ns_steps=5,
-         weight_decay=0.1,
-         adamw_betas=(0.9, 0.95),
-         adamw_eps=1e-8,
-         none_grad=True,
-         debug=False,
-     ):
-         defaults = dict(
-             lr=lr,
-             weight_decay=weight_decay,
-             momentum=momentum,
-             nesterov=nesterov,
-             ns_steps=ns_steps,
-             adamw_betas=adamw_betas,
-             adamw_eps=adamw_eps,
-             none_grad=none_grad,
-         )
-
-         super().__init__(model.parameters(), defaults)
-         self.is_muon_func = is_muon_func
-         self.model = model
-
-         if dist.is_initialized():
-             self.rank = dist.get_rank()
-         else:
-             self.rank = None
-
-         self.comm_stream = torch.cuda.Stream()
-         self.compute_stream = torch.cuda.Stream()
-         self.debug = debug
-
-     def __setstate__(self, state):
-         # Sort parameters into those for which we will use Muon, and those for which we will not
-         super().__setstate__(state)
-         self._init_state()
-
-     def _init_state(self):
-         for name, p in self.model.named_parameters():
-             if self.is_muon_func(p, name):
-                 # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
-                 assert p.ndim == 2, p.ndim
-                 self.state[p]["use_muon"] = True
-             else:
-                 # Do not use Muon for parameters in adamw_params
-                 self.state[p]["use_muon"] = False
-
-     def _calc_flops(self, G, steps):
-         assert len(G.shape) == 2
-         M, N = G.shape
-         if M > N:
-             M, N = N, M
-
-         return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
-
-     def adjust_lr_for_muon(self, lr, param_shape):
-         A, B = param_shape[:2]
-         # We adjust the learning rate and weight decay based on the size of the parameter matrix
-         # as describted in the paper
-         adjusted_ratio = 0.2 * math.sqrt(max(A, B))
-         adjusted_lr = lr * adjusted_ratio
-         return adjusted_lr
-
-     def init_state_and_assign_params(self, params, group):
-         param_to_state = {}
-         param_to_flops = {}
-
-         total_flops = 0
-         for p in params:
-             g = p.grad
-             if g is None:
-                 continue
-             assert g.ndim == 2, "Muon only supports 2D parameters."
-
-             flops = self._calc_flops(g, group["ns_steps"])
-             param_to_flops[id(p)] = flops
-             total_flops += flops
-
-         if self.debug:
-             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
-
-         ordered_params = sorted(
-             params, key=lambda p: param_to_flops[id(p)], reverse=True
-         )
-
-         round_robin = 0
-         mesh = None
-         for p in ordered_params:
-             if mesh is None:
-                 mesh = p.device_mesh
-                 if mesh.ndim != 1:
-                     raise NotImplementedError(
-                         "Muon requires a 1D mesh for distributed training yet."
-                     )
-             elif mesh != p.device_mesh:
-                 raise ValueError("All parameters must be on the same mesh.")
-
-             param_to_state[id(p)] = _muon_state()
-             param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
-
-             round_robin = (round_robin + 1) % mesh.mesh.numel()
-
-         return param_to_state, ordered_params
-
-     def base(self, params, group, lr, weight_decay, momentum):
-         # generate weight updates in distributed fashion
-         for p in params:
-             g = p.grad
-             if g is None:
-                 continue
-             if g.ndim > 2:
-                 g = g.view(g.size(0), -1)
-             assert g is not None
-
-             # calc update
-             state = self.state[p]
-             if "momentum_buffer" not in state:
-                 state["momentum_buffer"] = torch.zeros_like(g)
-             buf = state["momentum_buffer"]
-             buf.mul_(momentum).add_(g)
-             if group["nesterov"]:
-                 g = g.add(buf, alpha=momentum)
-             else:
-                 g = buf
-
-             u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
-
-             # scale update
-             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
-
-             # apply weight decay
-             p.data.mul_(1 - lr * weight_decay)
-
-             # apply update
-             p.data.add_(u, alpha=-adjusted_lr)
-
-     def _update_g(self, p, g, group, momentum):
-         # calc update
-         state = self.state[p]
-         if "momentum_buffer" not in state:
-             state["momentum_buffer"] = torch.zeros_like(g)
-         buf = state["momentum_buffer"]
-         buf.mul_(momentum).add_(g)
-         if group["nesterov"]:
-             g = g.add(buf, alpha=momentum)
-         else:
-             g = buf
-         return g
-
-     def _update_p(self, p, u, lr, weight_decay):
-         # scale update
-         adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
-         # apply weight decay
-         p.data.mul_(1 - lr * weight_decay)
-         # apply update
-         p.data.add_(u, alpha=-adjusted_lr)
-
-     def parallel(self, params, group, lr, weight_decay, momentum):
-         """
-         Perform a parallel optimization step using Muon.
-         """
-
-         for p in params:
-             g = p.grad
-             if g is None:
-                 continue
-             if g.ndim > 2:
-                 g = g.view(g.size(0), -1)
-
-             # Update g in the local rank
-             g = self._update_g(
-                 p,
-                 g,
-                 group,
-                 momentum=momentum,
-             )
-             p.grad = g
-
-         param_to_state, ordered_params = self.init_state_and_assign_params(
-             params, group
-         )
-
-         def enqueue_gathers(start_idx, chunk_size):
-             for p in ordered_params[start_idx : start_idx + chunk_size]:
-                 state = param_to_state[id(p)]
-                 _gather(p, state, self.rank, self.comm_stream, group["none_grad"])
-
-         def enqueue_computes(start_idx, chunk_size):
-             for p in ordered_params[start_idx : start_idx + chunk_size]:
-                 state = param_to_state[id(p)]
-                 _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
-
-         def enqueue_scatters(start_idx, chunk_size):
-             for p in ordered_params[start_idx : start_idx + chunk_size]:
-                 state = param_to_state[id(p)]
-                 adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
-                 _scatter(
-                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
-                 )
-
-         chunk_size = params[0].device_mesh.mesh.numel()
-
-         # Wait grad update
-         self.comm_stream.wait_stream(torch.cuda.current_stream())
-
-         enqueue_gathers(0, chunk_size)
-         for i in range(0, len(params) + chunk_size - 1, chunk_size):
-             enqueue_computes(i, chunk_size)
-             enqueue_gathers(i + chunk_size, chunk_size)
-             enqueue_scatters(i, chunk_size)
-
-         torch.cuda.current_stream().wait_stream(self.comm_stream)
-
-     def step(self, closure=None):
-         """Perform a single optimization step.
-
-         Args:
-             closure (Callable, optional): A closure that reevaluates the model
-                 and returns the loss.
-         """
-         loss = None
-         if closure is not None:
-             with torch.enable_grad():
-                 loss = closure()
-
-         for group in self.param_groups:
-             ############################
-             #           Muon           #
-             ############################
-
-             if "use_muon" not in self.state[group["params"][0]]:
-                 self._init_state()
-
-             params = [p for p in group["params"] if self.state[p]["use_muon"]]
-             lr = group["lr"]
-             weight_decay = group["weight_decay"]
-             momentum = group["momentum"]
-
-             param_dtensors = []
-             param_tensors = []
-
-             for p in params:
-                 if p is None or p.grad is None:
-                     continue
-                 if isinstance(p.data, DTensor):
-                     if all(
-                         isinstance(placement, Replicate) for placement in p.placements
-                     ):
-                         param_tensors.append(p)
-                     else:
-                         param_dtensors.append(p)
-                 elif isinstance(p.data, torch.Tensor):
-                     param_tensors.append(p)
-                 else:
-                     raise TypeError(f"Unsupported parameter type: {type(p.data)}")
-
-             if self.debug:
-                 print(
-                     f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors",
-                     flush=True,
-                 )
-
-             if len(param_dtensors) > 0:
-                 if not dist.is_initialized():
-                     raise RuntimeError(
-                         "Parallel Muon requires torch.distributed to be initialized."
-                     )
-
-                 self.parallel(
-                     param_dtensors,
-                     group,
-                     lr=lr,
-                     weight_decay=weight_decay,
-                     momentum=momentum,
-                 )
-
-             if len(param_tensors) > 0:
-                 self.base(
-                     param_tensors,
-                     group,
-                     lr=lr,
-                     weight_decay=weight_decay,
-                     momentum=momentum,
-                 )
-
-             ############################
-             #       AdamW backup       #
-             ############################
-
-             params = [p for p in group["params"] if not self.state[p]["use_muon"]]
-             lr = group["lr"]
-             beta1, beta2 = group["adamw_betas"]
-             eps = group["adamw_eps"]
-             weight_decay = group["weight_decay"]
-
-             for p in params:
-                 g = p.grad
-                 if g is None:
-                     continue
-                 state = self.state[p]
-                 if "step" not in state:
-                     state["step"] = 0
-                     state["moment1"] = torch.zeros_like(g)
-                     state["moment2"] = torch.zeros_like(g)
-                 state["step"] += 1
-                 step = state["step"]
-                 buf1 = state["moment1"]
-                 buf2 = state["moment2"]
-                 buf1.lerp_(g, 1 - beta1)
-                 buf2.lerp_(g.square(), 1 - beta2)
-
-                 g = buf1 / (eps + buf2.sqrt())
-
-                 bias_correction1 = 1 - beta1**step
-                 bias_correction2 = 1 - beta2**step
-                 scale = bias_correction1 / bias_correction2**0.5
-                 p.data.mul_(1 - lr * weight_decay)
-                 p.data.add_(g, alpha=-lr / scale)
-
-         return loss
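For context on how this deleted module was meant to be driven, here is a minimal single-process usage sketch of the Muon class above. It assumes the built package is importable as `optimizer` (matching this repo's `__init__.py`, which re-exports Muon), that a CUDA device is available (the constructor creates CUDA streams), and the toy model and hyperparameters are purely illustrative:

import torch
import torch.nn as nn

from optimizer import Muon  # re-exported from optimizer/muon.py by optimizer/__init__.py

# Two 2D weight matrices -> handled by Muon; anything default_is_muon rejects
# (1D params, embed_tokens, lm_head) would fall back to the AdamW branch of step().
model = nn.Sequential(
    nn.Linear(64, 128, bias=False),
    nn.ReLU(),
    nn.Linear(128, 10, bias=False),
).cuda()

opt = Muon(model, lr=1e-3, momentum=0.95, ns_steps=5, weight_decay=0.1)

x = torch.randn(32, 64, device="cuda")
y = torch.randint(0, 10, (32,), device="cuda")

loss = nn.functional.cross_entropy(model(x), y)
loss.backward()
opt.step()        # Newton-Schulz orthogonalized update for the 2D weights
opt.zero_grad()

Note that adjust_lr_for_muon rescales the step per matrix: for a 128x64 weight the applied learning rate is 0.2 * sqrt(128) * lr, as implemented above.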
build/torch26-cxx98-cu126-x86_64-linux/optimizer/__init__.py DELETED
@@ -1,5 +0,0 @@
- from .muon import Muon
-
- __all__ = [
-     "Muon",
- ]
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py DELETED
@@ -1,9 +0,0 @@
- import torch
- from . import _optimizer_02ac540_dirty
- ops = torch.ops._optimizer_02ac540_dirty
-
- def add_op_namespace_prefix(op_name: str):
-     """
-     Prefix op by namespace.
-     """
-     return f"_optimizer_02ac540_dirty::{op_name}"
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:48795cb66a740b14266d757ac70a6b43fb11df6662970bb4040650d237e6cbc5
- size 1824184
build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py DELETED
@@ -1,494 +0,0 @@
- import math
- from dataclasses import dataclass
-
- import torch
- import torch.distributed as dist
- from torch.distributed._tensor import DTensor, Replicate
-
-
- # This code snippet is a modified version adapted from the following GitHub repositories:
- # https://github.com/KellerJordan/Muon/blob/master/muon.py
- @torch.no_grad()
- def _zeropower_via_newtonschulz5(G, steps):
-     """
-     Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
-     quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
-     of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
-     zero even beyond the point where the iteration no longer converges all the way to one everywhere
-     on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
-     where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
-     performance at all relative to UV^T, where USV^T = G is the SVD.
-     """
-     assert len(G.shape) == 2
-     a, b, c = (3.4445, -4.7750, 2.0315)
-     X = G # no manual typecast
-     if G.size(0) > G.size(1):
-         X = X.T
-     # Ensure spectral norm is at most 1
-     X = X / (X.norm() + 1e-7)
-     X = X.bfloat16()
-     # Perform the NS iterations
-     for _ in range(steps):
-         A = X @ X.T
-         # B = (
-         #     b * A + c * A @ A
-         # )
-         B = torch.addmm(A, A, A, alpha=c, beta=b)
-         # X = a * X + B @ X
-         X = torch.addmm(X, B, X, alpha=1.0, beta=a)
-
-     if G.size(0) > G.size(1):
-         X = X.T
-     return X.to(G.dtype)
-
-
- @dataclass
- class _muon_state:
-     # TODO: use Optional
-     worker_rank: int | None = None
-     gathered_grad: torch.Tensor | None = None
-     computed_u: torch.Tensor | None = None
-     gather_event: torch.cuda.Event | None = None
-     compute_event: torch.cuda.Event | None = None
-
-
- @torch.no_grad()
- def _gather(p, state, rank, comm_stream, none_grad):
-     g = p.grad
-     mesh = g.device_mesh
-
-     if rank == state.worker_rank:
-         gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())]
-     else:
-         gather_list = None
-
-     with torch.cuda.stream(comm_stream):
-         torch.distributed.gather(
-             g.to_local(),
-             dst=state.worker_rank,
-             gather_list=gather_list,
-             group=mesh.get_group(),
-         )
-         if rank == state.worker_rank:
-             if state.gathered_grad is not None:
-                 raise RuntimeError(
-                     "Gather event already exists, which should not happen."
-                 )
-             state.gathered_grad = torch.cat(gather_list, dim=0)
-             state.gather_event = torch.cuda.Event()
-             state.gather_event.record()
-         else:
-             state.gathered_grad = None
-             state.gather_event = None
-         if none_grad:
-             p.grad = None
-
-
- @torch.no_grad()
- def _compute_u(state, steps, rank, compute_stream):
-     with torch.cuda.stream(compute_stream):
-         if rank == state.worker_rank:
-             if state.gather_event is None:
-                 raise RuntimeError("Gather event must be set before compute.")
-             compute_stream.wait_event(state.gather_event)
-             u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
-             state.computed_u = u
-             state.compute_event = torch.cuda.Event()
-             state.compute_event.record()
-             # Clear the gathered gradient to free memory
-             state.gathered_grad = None
-         else:
-             state.computed_u = None
-             state.compute_event = None
-
-
- @torch.no_grad()
- def _scatter(p, state, lr, weight_decay, rank, comm_stream):
-     u = state.computed_u
-     mesh = p.device_mesh
-
-     with torch.cuda.stream(comm_stream):
-         if rank == state.worker_rank:
-             if state.compute_event is None:
-                 raise RuntimeError("Compute event must be set before scatter.")
-             comm_stream.wait_event(state.compute_event)
-             scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
-         else:
-             scatter_list = None
-
-         u = torch.empty_like(p.to_local())
-         torch.distributed.scatter(
-             u,
-             scatter_list=scatter_list,
-             src=state.worker_rank,
-             group=mesh.get_group(),
-         )
-         if rank == state.worker_rank:
-             # Clear u to free memory
-             state.computed_u = None
-         u = DTensor.from_local(
-             u,
-             placements=p.placements,
-             device_mesh=mesh,
-         )
-         p.data.mul_(1 - lr * weight_decay)
-         p.data.add_(u, alpha=-lr)
-
-
- def default_is_muon(x, name):
-     return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name
-
-
- class Muon(torch.optim.Optimizer):
-     """
-     Muon - MomentUm Orthogonalized by Newton-schulz
-
-     Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
-     processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
-     matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
-     the advantage that it can be stably run in bfloat16 on the GPU.
-
-     Some warnings:
-     - We believe this optimizer is unlikely to work well for training with small batch size.
-     - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
-
-     Arguments:
-         muon_params: The parameters to be optimized by Muon.
-         lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
-         momentum: The momentum used by the internal SGD. (0.95 is a good default)
-         nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
-         ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
-         adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are
-             {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
-         adamw_lr: The learning rate for the internal AdamW.
-         adamw_betas: The betas for the internal AdamW.
-         adamw_eps: The epsilon for the internal AdamW.
-         adamw_weight_decay: The weight decay for the internal AdamW.
-     """
-
-     def __init__(
-         self,
-         model,
-         is_muon_func=default_is_muon,
-         lr=1e-3,
-         momentum=0.95,
-         nesterov=True,
-         ns_steps=5,
-         weight_decay=0.1,
-         adamw_betas=(0.9, 0.95),
-         adamw_eps=1e-8,
-         none_grad=True,
-         debug=False,
-     ):
-         defaults = dict(
-             lr=lr,
-             weight_decay=weight_decay,
-             momentum=momentum,
-             nesterov=nesterov,
-             ns_steps=ns_steps,
-             adamw_betas=adamw_betas,
-             adamw_eps=adamw_eps,
-             none_grad=none_grad,
-         )
-
-         super().__init__(model.parameters(), defaults)
-         self.is_muon_func = is_muon_func
-         self.model = model
-
-         if dist.is_initialized():
-             self.rank = dist.get_rank()
-         else:
-             self.rank = None
-
-         self.comm_stream = torch.cuda.Stream()
-         self.compute_stream = torch.cuda.Stream()
-         self.debug = debug
-
-     def __setstate__(self, state):
-         # Sort parameters into those for which we will use Muon, and those for which we will not
-         super().__setstate__(state)
-         self._init_state()
-
-     def _init_state(self):
-         for name, p in self.model.named_parameters():
-             if self.is_muon_func(p, name):
-                 # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
-                 assert p.ndim == 2, p.ndim
-                 self.state[p]["use_muon"] = True
-             else:
-                 # Do not use Muon for parameters in adamw_params
-                 self.state[p]["use_muon"] = False
-
-     def _calc_flops(self, G, steps):
-         assert len(G.shape) == 2
-         M, N = G.shape
-         if M > N:
-             M, N = N, M
-
-         return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
-
-     def adjust_lr_for_muon(self, lr, param_shape):
-         A, B = param_shape[:2]
-         # We adjust the learning rate and weight decay based on the size of the parameter matrix
-         # as describted in the paper
-         adjusted_ratio = 0.2 * math.sqrt(max(A, B))
-         adjusted_lr = lr * adjusted_ratio
-         return adjusted_lr
-
-     def init_state_and_assign_params(self, params, group):
-         param_to_state = {}
-         param_to_flops = {}
-
-         total_flops = 0
-         for p in params:
-             g = p.grad
-             if g is None:
-                 continue
-             assert g.ndim == 2, "Muon only supports 2D parameters."
-
-             flops = self._calc_flops(g, group["ns_steps"])
-             param_to_flops[id(p)] = flops
-             total_flops += flops
-
-         if self.debug:
-             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
-
-         ordered_params = sorted(
-             params, key=lambda p: param_to_flops[id(p)], reverse=True
-         )
-
-         round_robin = 0
-         mesh = None
-         for p in ordered_params:
-             if mesh is None:
-                 mesh = p.device_mesh
-                 if mesh.ndim != 1:
-                     raise NotImplementedError(
-                         "Muon requires a 1D mesh for distributed training yet."
-                     )
-             elif mesh != p.device_mesh:
-                 raise ValueError("All parameters must be on the same mesh.")
-
-             param_to_state[id(p)] = _muon_state()
-             param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
-
-             round_robin = (round_robin + 1) % mesh.mesh.numel()
-
-         return param_to_state, ordered_params
-
-     def base(self, params, group, lr, weight_decay, momentum):
-         # generate weight updates in distributed fashion
-         for p in params:
-             g = p.grad
-             if g is None:
-                 continue
-             if g.ndim > 2:
-                 g = g.view(g.size(0), -1)
-             assert g is not None
-
-             # calc update
-             state = self.state[p]
-             if "momentum_buffer" not in state:
-                 state["momentum_buffer"] = torch.zeros_like(g)
-             buf = state["momentum_buffer"]
-             buf.mul_(momentum).add_(g)
-             if group["nesterov"]:
-                 g = g.add(buf, alpha=momentum)
-             else:
-                 g = buf
-
-             u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
-
-             # scale update
-             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
-
-             # apply weight decay
-             p.data.mul_(1 - lr * weight_decay)
-
-             # apply update
-             p.data.add_(u, alpha=-adjusted_lr)
-
-     def _update_g(self, p, g, group, momentum):
-         # calc update
-         state = self.state[p]
-         if "momentum_buffer" not in state:
-             state["momentum_buffer"] = torch.zeros_like(g)
-         buf = state["momentum_buffer"]
-         buf.mul_(momentum).add_(g)
-         if group["nesterov"]:
-             g = g.add(buf, alpha=momentum)
-         else:
-             g = buf
-         return g
-
-     def _update_p(self, p, u, lr, weight_decay):
-         # scale update
-         adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
-         # apply weight decay
-         p.data.mul_(1 - lr * weight_decay)
-         # apply update
-         p.data.add_(u, alpha=-adjusted_lr)
-
-     def parallel(self, params, group, lr, weight_decay, momentum):
-         """
-         Perform a parallel optimization step using Muon.
-         """
-
-         for p in params:
-             g = p.grad
-             if g is None:
-                 continue
-             if g.ndim > 2:
-                 g = g.view(g.size(0), -1)
-
-             # Update g in the local rank
-             g = self._update_g(
-                 p,
-                 g,
-                 group,
-                 momentum=momentum,
-             )
-             p.grad = g
-
-         param_to_state, ordered_params = self.init_state_and_assign_params(
-             params, group
-         )
-
-         def enqueue_gathers(start_idx, chunk_size):
-             for p in ordered_params[start_idx : start_idx + chunk_size]:
-                 state = param_to_state[id(p)]
-                 _gather(p, state, self.rank, self.comm_stream, group["none_grad"])
-
-         def enqueue_computes(start_idx, chunk_size):
-             for p in ordered_params[start_idx : start_idx + chunk_size]:
-                 state = param_to_state[id(p)]
-                 _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
-
-         def enqueue_scatters(start_idx, chunk_size):
-             for p in ordered_params[start_idx : start_idx + chunk_size]:
-                 state = param_to_state[id(p)]
-                 adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
-                 _scatter(
-                     p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
-                 )
-
-         chunk_size = params[0].device_mesh.mesh.numel()
-
-         # Wait grad update
-         self.comm_stream.wait_stream(torch.cuda.current_stream())
-
-         enqueue_gathers(0, chunk_size)
-         for i in range(0, len(params) + chunk_size - 1, chunk_size):
-             enqueue_computes(i, chunk_size)
-             enqueue_gathers(i + chunk_size, chunk_size)
-             enqueue_scatters(i, chunk_size)
-
-         torch.cuda.current_stream().wait_stream(self.comm_stream)
-
-     def step(self, closure=None):
-         """Perform a single optimization step.
-
-         Args:
-             closure (Callable, optional): A closure that reevaluates the model
-                 and returns the loss.
-         """
-         loss = None
-         if closure is not None:
-             with torch.enable_grad():
-                 loss = closure()
-
-         for group in self.param_groups:
-             ############################
-             #           Muon           #
-             ############################
-
-             if "use_muon" not in self.state[group["params"][0]]:
-                 self._init_state()
-
-             params = [p for p in group["params"] if self.state[p]["use_muon"]]
-             lr = group["lr"]
-             weight_decay = group["weight_decay"]
-             momentum = group["momentum"]
-
-             param_dtensors = []
-             param_tensors = []
-
-             for p in params:
-                 if p is None or p.grad is None:
-                     continue
-                 if isinstance(p.data, DTensor):
-                     if all(
-                         isinstance(placement, Replicate) for placement in p.placements
-                     ):
-                         param_tensors.append(p)
-                     else:
-                         param_dtensors.append(p)
-                 elif isinstance(p.data, torch.Tensor):
-                     param_tensors.append(p)
-                 else:
-                     raise TypeError(f"Unsupported parameter type: {type(p.data)}")
-
-             if self.debug:
-                 print(
-                     f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors",
-                     flush=True,
-                 )
-
-             if len(param_dtensors) > 0:
-                 if not dist.is_initialized():
-                     raise RuntimeError(
-                         "Parallel Muon requires torch.distributed to be initialized."
-                     )
-
-                 self.parallel(
-                     param_dtensors,
-                     group,
-                     lr=lr,
-                     weight_decay=weight_decay,
-                     momentum=momentum,
-                 )
-
-             if len(param_tensors) > 0:
-                 self.base(
-                     param_tensors,
-                     group,
-                     lr=lr,
-                     weight_decay=weight_decay,
-                     momentum=momentum,
-                 )
-
-             ############################
-             #       AdamW backup       #
-             ############################
-
-             params = [p for p in group["params"] if not self.state[p]["use_muon"]]
-             lr = group["lr"]
-             beta1, beta2 = group["adamw_betas"]
-             eps = group["adamw_eps"]
-             weight_decay = group["weight_decay"]
-
-             for p in params:
-                 g = p.grad
-                 if g is None:
-                     continue
-                 state = self.state[p]
-                 if "step" not in state:
-                     state["step"] = 0
-                     state["moment1"] = torch.zeros_like(g)
-                     state["moment2"] = torch.zeros_like(g)
-                 state["step"] += 1
-                 step = state["step"]
-                 buf1 = state["moment1"]
-                 buf2 = state["moment2"]
-                 buf1.lerp_(g, 1 - beta1)
-                 buf2.lerp_(g.square(), 1 - beta2)
-
-                 g = buf1 / (eps + buf2.sqrt())
-
-                 bias_correction1 = 1 - beta1**step
-                 bias_correction2 = 1 - beta2**step
-                 scale = bias_correction1 / bias_correction2**0.5
-                 p.data.mul_(1 - lr * weight_decay)
-                 p.data.add_(g, alpha=-lr / scale)
-
-         return loss
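The Newton-Schulz helper deleted above is the core of the Muon update, so a small sanity-check sketch of what it returns may be useful: an approximately semi-orthogonal matrix whose singular values land in a band around 1 (roughly [0.5, 1.5]) rather than exactly 1, as the docstring notes. The snippet assumes the built package is importable as `optimizer` and a CUDA device is present, since the helper casts to bfloat16:

import torch

from optimizer.muon import _zeropower_via_newtonschulz5  # private helper defined above

G = torch.randn(256, 512, device="cuda")
X = _zeropower_via_newtonschulz5(G, steps=5)

# If X were exactly semi-orthogonal, all singular values would be 1; the quintic
# iteration instead lands them roughly in [0.5, 1.5].
s = torch.linalg.svdvals(X.float())
print(f"min/max singular value: {s.min().item():.3f} / {s.max().item():.3f}")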
build/torch27-cxx11-cu118-x86_64-linux/optimizer/__init__.py CHANGED
File without changes
build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (307 Bytes).
 
build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc ADDED
Binary file (22.4 kB).
 
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
  import torch
- from . import _optimizer_02ac540_dirty
- ops = torch.ops._optimizer_02ac540_dirty
+ from . import _optimizer_1f13dae_dirty
+ ops = torch.ops._optimizer_1f13dae_dirty
 
  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_optimizer_02ac540_dirty::{op_name}"
+     return f"_optimizer_1f13dae_dirty::{op_name}"
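The only functional change in each `_ops.py` is the hashed namespace string (02ac540 -> 1f13dae), which tracks the rebuilt extension. A minimal, self-contained sketch of the pattern, with the namespace inlined so it runs without the compiled .so; the op name `some_op` is hypothetical and not taken from this repository:

import torch

_NAMESPACE = "_optimizer_1f13dae_dirty"  # the new build-hash namespace from this diff


def add_op_namespace_prefix(op_name: str) -> str:
    # Same behaviour as the shim above: fully qualify an op name for torch.ops lookup.
    return f"{_NAMESPACE}::{op_name}"


print(add_op_namespace_prefix("some_op"))  # -> "_optimizer_1f13dae_dirty::some_op"
# With the bundled .so loaded, callers would reach the op as
#     getattr(torch.ops, _NAMESPACE).some_op(...)
# so only _ops.py has to change when the build hash does.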
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_02ac540_dirty.abi3.so → _optimizer_1f13dae_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ec1f34fd4ead50eb51db63f51afc0751d6bf0c64a46c44c713ab245f150979cc
+ oid sha256:7dc5f8a57aa60483209dfcbb0c7cc0e54f1739d643145c1e685fbe2b6675ac43
  size 1787368
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED
File without changes
build/torch27-cxx11-cu126-x86_64-linux/optimizer/__init__.py CHANGED
File without changes
build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (307 Bytes).
 
build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc ADDED
Binary file (22.4 kB).
 
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
  import torch
- from . import _optimizer_02ac540_dirty
- ops = torch.ops._optimizer_02ac540_dirty
+ from . import _optimizer_1f13dae_dirty
+ ops = torch.ops._optimizer_1f13dae_dirty
 
  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_optimizer_02ac540_dirty::{op_name}"
+     return f"_optimizer_1f13dae_dirty::{op_name}"
build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_02ac540_dirty.abi3.so → _optimizer_1f13dae_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:bdb8ab38f72351ae88307560aca5e1af7b2dcb63a39627dbd4c806cad3f83442
+ oid sha256:96c7e281f9634e3b252f720f4fea4f61490f2f1a1ef1280a3e259decb41c846f
  size 1824256
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
File without changes
build/torch27-cxx11-cu128-x86_64-linux/optimizer/__init__.py CHANGED
File without changes
build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (307 Bytes).
 
build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc ADDED
Binary file (22.4 kB).
 
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
  import torch
- from . import _optimizer_02ac540_dirty
- ops = torch.ops._optimizer_02ac540_dirty
+ from . import _optimizer_1f13dae_dirty
+ ops = torch.ops._optimizer_1f13dae_dirty
 
  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_optimizer_02ac540_dirty::{op_name}"
+     return f"_optimizer_1f13dae_dirty::{op_name}"
build/torch27-cxx11-cu128-x86_64-linux/optimizer/{_optimizer_02ac540_dirty.abi3.so → _optimizer_1f13dae_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0652d611e00b1bcbece47da13dffb28396ae0831dc4be43c7ae9be27ad9a10fe
+ oid sha256:046a45fae81c2b7d79ff2237a1d26277f4883ef8a8b87a3980bf06d1182711b1
  size 1883352
build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED
File without changes
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__init__.py CHANGED
File without changes
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-312.pyc DELETED
Binary file (252 Bytes)
 
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (308 Bytes).
 
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-312.pyc DELETED
Binary file (22.3 kB)
 
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc ADDED
Binary file (22.4 kB).
 
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
  import torch
- from . import _optimizer_02ac540_dirty
- ops = torch.ops._optimizer_02ac540_dirty
+ from . import _optimizer_1f13dae_dirty
+ ops = torch.ops._optimizer_1f13dae_dirty
 
  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_optimizer_02ac540_dirty::{op_name}"
+     return f"_optimizer_1f13dae_dirty::{op_name}"
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a96bfd1f461d7cd029dd39d142d2999dcc86dd7f56fb40f045e00f3fb2c400bd
- size 1749648
build/{torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so → torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c478b90b83052c5931cb3d872adad7811663e28bd3447f12ac412f15b1d0ffc5
- size 1824224
+ oid sha256:3d9ee2420e8528032369c476152a1960d123034a83e2c43f38a7fb2d1423aa23
+ size 1749840
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED
File without changes
build/{torch26-cxx11-cu118-x86_64-linux → torch28-cxx11-cu126-x86_64-linux}/optimizer/__init__.py RENAMED
File without changes
build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (307 Bytes).
 
build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc ADDED
Binary file (22.4 kB).
 
build/{torch26-cxx11-cu118-x86_64-linux → torch28-cxx11-cu126-x86_64-linux}/optimizer/_ops.py RENAMED
@@ -1,9 +1,9 @@
  import torch
- from . import _optimizer_02ac540_dirty
- ops = torch.ops._optimizer_02ac540_dirty
+ from . import _optimizer_1f13dae_dirty
+ ops = torch.ops._optimizer_1f13dae_dirty
 
  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_optimizer_02ac540_dirty::{op_name}"
+     return f"_optimizer_1f13dae_dirty::{op_name}"
build/{torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:829533f24bccb220101238fcbafa1343d2ec3ba3922a91a836b8a05813b44672
- size 1787272
+ oid sha256:a082b5629efc4e9b8ce608713665d47904949b5d220dad350049bc806d58ecd7
+ size 1824256
build/{torch26-cxx11-cu118-x86_64-linux → torch28-cxx11-cu126-x86_64-linux}/optimizer/muon.py RENAMED
File without changes
build/{torch26-cxx11-cu124-x86_64-linux → torch28-cxx11-cu128-x86_64-linux}/optimizer/__init__.py RENAMED
File without changes
build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (307 Bytes).
 
build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc ADDED
Binary file (22.4 kB).
 
build/{torch26-cxx11-cu126-x86_64-linux → torch28-cxx11-cu128-x86_64-linux}/optimizer/_ops.py RENAMED
@@ -1,9 +1,9 @@
  import torch
- from . import _optimizer_02ac540_dirty
- ops = torch.ops._optimizer_02ac540_dirty
+ from . import _optimizer_1f13dae_dirty
+ ops = torch.ops._optimizer_1f13dae_dirty
 
  def add_op_namespace_prefix(op_name: str):
      """
      Prefix op by namespace.
      """
-     return f"_optimizer_02ac540_dirty::{op_name}"
+     return f"_optimizer_1f13dae_dirty::{op_name}"
build/{torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_02ac540_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_1f13dae_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ec46d147914be5998dfc62d4b87eb6730be7f012700d49543a318cadab3820db
- size 1749744
+ oid sha256:7d2e65e315cd82d0b6fc2043ff37ee2d1223d6bd293ef552d658db5bf4de0a45
+ size 1883352
build/{torch26-cxx11-cu124-x86_64-linux → torch28-cxx11-cu128-x86_64-linux}/optimizer/muon.py RENAMED
File without changes
build/{torch26-cxx11-cu126-x86_64-linux → torch28-cxx11-cu129-x86_64-linux}/optimizer/__init__.py RENAMED
File without changes