|
[[38;5;39m INFO[0m][28-Jun-24 14:21:31] Hydra output dir: /data/strahl/Code/ovod/outputs/ovod_20240628_142131 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:31] Run configuration: |
|
action: train |
|
device: cuda |
|
allow_tf32: true |
|
cudnn_bench: true |
|
determ: false |
|
determ_seed: 1 |
|
dry_run: false |
|
wandb: true |
|
wandb_project: ovod |
|
wandb_entity: null |
|
wandb_group: null |
|
wandb_job_type: null |
|
wandb_name: null |
|
wandb_tags: null |
|
embedder_spec: openclip:apple/DFN5B-CLIP-ViT-H-14-378 |
|
embedder_amp: true |
|
embedder_amp_bf16: false |
|
embedder_compile: false |
|
embedder_optimum: false |
|
batch_size_token: 2048 |
|
batch_size_embed: 512 |
|
batch_size_image: 256 |
|
embedding_dataset: dfn5bl_multiset3c2_cache_vt0.bin |
|
embedding_datasets: [] |
|
batch_size: 512 |
|
dataset_workers: 8 |
|
vocab_path: $SOURCE/data/object_nouns.json |
|
vocab_thres: 0 |
|
prompt_path: $SOURCE/data/prompts.json |
|
prompt_collection: ImageNet1K | CIFAR |
|
hypernym_collection: None |
|
noun_cache: true |
|
noun_recache: false |
|
noun_cache_dir: $SOURCE/cache/noun_dataset |
|
embedding_cache_dir: $SOURCE/cache/embedding_cache |
|
strict_embedder: true |
|
save_embedding_cache: '' |
|
cls_dataset: ImageNet1K |
|
cls_datasets: [] |
|
cls_dataset_root: ~/Datasets |
|
cls_split: valid |
|
load_model: '' |
|
load_models: [] |
|
load_models_dirnum: 1 |
|
model: PrefixedIterDecoder |
|
with_start_token: false |
|
with_end_token: true |
|
compact_ids: true |
|
fixed_token_length: false |
|
auto_fixed_token_length: true |
|
use_masks: true |
|
use_weights: true |
|
multi_target: true |
|
multi_first: false |
|
fixed_multi_length: false |
|
amp: false |
|
amp_bf16: true |
|
vocab_quant: false |
|
num_end_loss: 1 |
|
label_smoothing: 0.0 |
|
hidden_dim: 512 |
|
feedfwd_scale: 1/4 |
|
mlp_hidden_layer: none |
|
mlp_hidden_bias: false |
|
mlp_hidden_norm: false |
|
mlp_hidden_activation: gelu |
|
input_dropout: 0.1 |
|
num_layers: 6 |
|
num_heads: 8 |
|
layer_dropout: 0.1 |
|
layer_activation: gelu |
|
layer_norm_first: true |
|
layer_bias: false |
|
logits_bias: false |
|
init_bias_zero: true |
|
init_mlp_mode: balanced |
|
init_mlp_unit_norm: false |
|
init_tfrm_mode: balanced |
|
init_tfrm_unit_norm: false |
|
init_tfrm_unit_postnorm: true |
|
init_tfrm_proj_layers: true |
|
init_zero_norm: false |
|
init_rezero_mode: none |
|
mlp_seq_len: 4 |
|
weight_tying: true |
|
strictly_causal: false |
|
enable_nested: false |
|
cross_encoder: true |
|
num_encoder_layers: 6 |
|
gencfg: '' |
|
gencfgs: [] |
|
gencfgs_grid: false |
|
gencfg_method: |
|
- greedy |
|
- beam |
|
gencfg_topk: |
|
- 3 |
|
- 5 |
|
- 10 |
|
gencfg_prior: |
|
- none |
|
- tgt0.25 |
|
- tgt0.5 |
|
- tok0.25 |
|
- tok0.5 |
|
gencfg_guide: |
|
- plain |
|
gencfg_tau: |
|
- 0.5 |
|
- 1 |
|
- 2 |
|
gencfg_alpha: |
|
- -0.2 |
|
- 0 |
|
- 0.2 |
|
- 0.5 |
|
test_training: true |
|
test_device: true |
|
test_patch: true |
|
test_consistent: false |
|
test_print: 0 |
|
clip_prompts: true |
|
measure_gap: false |
|
multi_target_freq: |
|
- 1 |
|
- 1 |
|
captions_path: '' |
|
template_multiplier: 10 |
|
sample_multiplier: 20 |
|
captions_print: 0 |
|
class_names_variant: clip |
|
images: [] |
|
save_targets: null |
|
multi_mode: max |
|
load_train_state: true |
|
load_lr_state: true |
|
chunk_scale: 50 |
|
save_every_min: 12 |
|
save_every_max: 48 |
|
save_top1_min: 95.0 |
|
save_top1_delta: 0.5 |
|
max_epochs: 18 |
|
max_chunks: 0 |
|
accum_factor: 16 |
|
optimizer: AdamW |
|
init_lr: 0.0015 |
|
final_lr: 0.0 |
|
lr_scheduler: cosine |
|
lr_warmup: 0 |
|
beta1: 0.9 |
|
beta2: 0.95 |
|
weight_decay: 0.1 |
|
weight_decay_1d: false |
|
nesterov: true |
|
compile: false |
|
gradient_clip: 1.0 |
|
loss_ewa_halflife: 4 |
|
last_dropout_chunks: 0 |
|
last_dropout_factor: 0.0 |
|
mean_shift: false |
|
mean_shift_path: $SOURCE/data/modality_gap_$EMBEDDER.json |
|
noise_scheme: GaussElemUniformAngle |
|
noise_vec_norm: 3.25 |
|
noise_angle_min: 45 |
|
noise_angle_max: 75 |
|
noise_angle_std: 0.0 |
|
noise_mix_ratio: 0.15 |
|
fix_force_vtx: false |
|
eval_train: false |
|
eval_guided: false |
|
eval_debug: false |
|
eval_samples_max: 0 |
|
eval_images: '' |
|
eval_images_dir: $SOURCE/extras/eval_images |
|
infer_log: true |
|
infer_texts: [] |
|
infer_images: [] |
|
infer_image_dir: $SOURCE/extras/infer_images |
|
infer_all_images_dir: '' |
|
infer_ann_json: $IMAGEDIR/_class_annotations.json |
|
infer_ann_json_update: false |
|
infer_guided: false |
|
infer_guide_dataset: '' |
|
infer_guide_targets: [] |
|
infer_debug: false |
|
infer_pred_json: false |
|
load_pred_jsons: [] |
|
pred_image_dir: $SOURCE/extras/infer_images |
|
pred_ann_json: $IMAGEDIR/_class_annotations.json |
|
pfmt_type: model_topk_v1 |
|
pfmt_topk: 3 |
|
pfmt_model_spec: true |
|
pfmt_sort: '' |
|
fmt_type: all_v2 |
|
fmt_models: '' |
|
fmt_model_hosts: '' |
|
fmt_hosts: '' |
|
fmt_min_ago: '' |
|
fmt_max_ago: '' |
|
fmt_min_stamp: '' |
|
fmt_max_stamp: '' |
|
fmt_sort: '' |
|
wiki_collect_dir: $SOURCE/extras/wiki_images |
|
sample_input_dir: '' |
|
sample_output_dir: $SOURCE/extras/sampled_images |
|
sample_count: 100 |
|
sample_special: [] |
|
sample_special_mean: 0.05 |
|
sample_special_factor: [] |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:31] TF32 tensor cores are enabled |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:31] Fast non-deterministic mode with cuDNN benchmark mode enabled |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:32] Wandb run: devoted-dragon-4474 (https://wandb.ai/pallgeuer/ovod/runs/xywaoev0) |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:32] Wandb run path: /data/strahl/Code/ovod/log/wandb/run-20240628_142132-xywaoev0 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Creating embedder of specification openclip:apple/DFN5B-CLIP-ViT-H-14-378... |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Loading OpenCLIP configuration for 'apple/DFN5B-CLIP-ViT-H-14-378' |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Loaded Hugging Face tokenizer: CLIPTokenizerFast |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Loaded OpenCLIP tokenizer for 'apple/DFN5B-CLIP-ViT-H-14-378': HFTokenizer |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Text tokenizer has context length 77 and case-insensitive vocab size 49408 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Embedder is using CUDA device |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Embedder has AMP enabled with dtype torch.float16 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Embedder has manual mixed precision disabled |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Text tokenizer has dtype torch.int64, start 49406 end 49407 pad 49407, and nominal batch size 2048 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Text embedding vector has dim 1024, dtype torch.float32, and nominal batch size 512 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Image component of embedder has nominal batch size 256 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Created embedder of class type OpenCLIPEmbedder |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Loading embedding cache with targets... |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Using embedding cache: /data/strahl/Code/ovod/cache/embedding_cache/dfn5bl_multiset3c2_cache_vt0.bin |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Loaded cache header information of version 1 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Loaded 42920 target nouns from cache |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Cache size is 213106456723 bytes = 198.471GiB |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Loaded embedding cache dataset of class type EmbeddingCache.Dataset |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:33] Generating target configuration for loaded target nouns and model of specification PrefixedIterDecoder... |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:34] Max target tokens without start token with end token is 14 for 'cercopithecus aethiops pygerythrus' |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:34] Compacting target tokenizations down to a vocab size of 12499 tokens |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:34] Using target tokenizations of variable length 14 with padding masks |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Resolving data configuration for loaded embedding dataset and model of specification PrefixedIterDecoder... |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Dataset is configured to use multiple targets per embedding, and to use target weights (normalized) |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Dataset is multi-target with a dynamic M of up to 3, and the M-dim is after the batch dimension B |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Creating embedding dataset loader in TRAIN mode with batch size 512 and 8 workers... |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Dataset: 51723264 embeddings across 101022 items |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Loader: 51723264/51723264 samples used in 101022+0 = 101022 batches of size 512+0 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Loader: 8 workers prefetching 2 unpinned CUDA batches each |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Gradient accumulation factor 16 results in 6313+0 = 6313 meta-batches of size 8192+0 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Gradient accumulation is using 101008/101022 available batches and 51716096/51723264 available samples |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Have 51723264 training samples available in the dataset |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Training 512 samples per batch |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Training 101008 batches = 51716096 samples per epoch (gradient accumulation factor 16 => 6313 optimizer updates) |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Training 4192 batches = 2146304 samples per chunk |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Training nominally for 433 chunks (max epochs 18 specified) |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Applying GaussElem noise of mean norm 3.25 to embedding vectors |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Applying UniformAngle noise of angle range 45° to 75° to embedding vectors |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Applying GaussElemUniformAngle noise with mix ratio 0.15 of UniformAngle |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Loaded memory-mapped cache ready for use |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Preloaded all target noun tokenizations from cache: 42920×14 of torch.int64/torch.bool |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Preloaded all target noun IDs and weights from cache: 51723416×3 of torch.int32/torch.float32 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Training model from scratch |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Creating model of class PrefixedIterDecoder... |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Created model of class PrefixedIterDecoder |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Model parameter counts by part: |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Input MLP = 2097152 params |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Token embed/logits = 6399488 params |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Positional embed = 8704 params |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Transformer = 7084544 params |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Total = 15589888 params |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Decoder AMP is disabled |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Moving model to CUDA... |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Applying weight decay to 27/40 trainable param tensors = 15583232/15589888 trainable parameters |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Created optimizer: AdamW (Parameter Group 0, amsgrad: False, betas: (0.9, 0.95), capturable: False, differentiable: False, eps: 1e-08, foreach: None, fused: True, lr: 0.0015, maximize: False, weight_decay: 0.0 | Parameter Group 1, amsgrad: False, betas: (0.9, 0.95), capturable: False, differentiable: False, eps: 1e-08, foreach: None, fused: True, lr: 0.0015, maximize: False, weight_decay: 0.1) |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:35] Using scheduler: torch.optim.lr_scheduler.CosineAnnealingLR(chunks=433, baselr=1.50e-03, finallr=0.00e+00) |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:36] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:36] Epoch 1 = Batch 1 = Sample 1 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:21:36] Chunk 1 = Batch 1 = Sample 1 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:29:36] Total gradient norm stats for 262 steps: 0.1358 <= 0.606 + 0.9267z <= 11.55 (clipped to 1) |
|
[[38;5;39m INFO[0m][28-Jun-24 14:29:36] Trained chunk 1 in 480.0s at 4472noun/s: lr=1.50e-03, loss=5.47e+00, top1=45.04%/33.360% |
|
[[38;5;39m INFO[0m][28-Jun-24 14:29:36] Chunk 2 = Batch 4193 = Sample 2146305 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:37:36] Total gradient norm stats for 262 steps: 0.3005 <= 0.3942 + 0.06712z <= 0.6547 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:37:36] Trained chunk 2 in 479.3s at 4478noun/s: lr=1.50e-03, loss=4.28e+00, top1=52.79%/42.609% |
|
[[38;5;39m INFO[0m][28-Jun-24 14:37:36] Chunk 3 = Batch 8385 = Sample 4292609 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:45:36] Total gradient norm stats for 262 steps: 0.2586 <= 0.3075 + 0.03353z <= 0.4711 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:45:36] Trained chunk 3 in 480.1s at 4471noun/s: lr=1.50e-03, loss=3.72e+00, top1=56.16%/47.214% |
|
[[38;5;39m INFO[0m][28-Jun-24 14:45:36] Chunk 4 = Batch 12577 = Sample 6438913 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:53:35] Total gradient norm stats for 262 steps: 0.2254 <= 0.2652 + 0.02594z <= 0.3725 |
|
[[38;5;39m INFO[0m][28-Jun-24 14:53:35] Trained chunk 4 in 479.2s at 4479noun/s: lr=1.50e-03, loss=3.38e+00, top1=57.03%/50.056% |
|
[[38;5;39m INFO[0m][28-Jun-24 14:53:35] Chunk 5 = Batch 16769 = Sample 8585217 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:01:33] Total gradient norm stats for 262 steps: 0.2082 <= 0.2326 + 0.01494z <= 0.2956 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:01:33] Trained chunk 5 in 478.1s at 4489noun/s: lr=1.50e-03, loss=3.14e+00, top1=57.95%/52.100% |
|
[[38;5;39m INFO[0m][28-Jun-24 15:01:33] Chunk 6 = Batch 20961 = Sample 10731521 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:09:33] Total gradient norm stats for 262 steps: 0.1901 <= 0.213 + 0.01247z <= 0.2611 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:09:33] Trained chunk 6 in 479.4s at 4477noun/s: lr=1.50e-03, loss=2.95e+00, top1=57.96%/53.694% |
|
[[38;5;39m INFO[0m][28-Jun-24 15:09:33] Chunk 7 = Batch 25153 = Sample 12877825 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:17:32] Total gradient norm stats for 262 steps: 0.1815 <= 0.1976 + 0.009438z <= 0.2411 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:17:32] Trained chunk 7 in 479.0s at 4481noun/s: lr=1.50e-03, loss=2.80e+00, top1=59.02%/55.018% |
|
[[38;5;39m INFO[0m][28-Jun-24 15:17:32] Chunk 8 = Batch 29345 = Sample 15024129 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:25:31] Total gradient norm stats for 262 steps: 0.1724 <= 0.1856 + 0.00634z <= 0.2099 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:25:31] Trained chunk 8 in 479.1s at 4480noun/s: lr=1.50e-03, loss=2.67e+00, top1=60.89%/56.148% |
|
[[38;5;39m INFO[0m][28-Jun-24 15:25:31] Chunk 9 = Batch 33537 = Sample 17170433 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:33:28] Total gradient norm stats for 262 steps: 0.1679 <= 0.1781 + 0.005329z <= 0.2016 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:33:28] Trained chunk 9 in 477.9s at 4492noun/s: lr=1.50e-03, loss=2.56e+00, top1=60.64%/57.133% |
|
[[38;5;39m INFO[0m][28-Jun-24 15:33:28] Chunk 10 = Batch 37729 = Sample 19316737 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:41:26] Total gradient norm stats for 262 steps: 0.1618 <= 0.1719 + 0.006222z <= 0.1937 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:41:26] Trained chunk 10 in 478.1s at 4490noun/s: lr=1.50e-03, loss=2.46e+00, top1=61.21%/58.003% |
|
[[38;5;39m INFO[0m][28-Jun-24 15:41:26] Chunk 11 = Batch 41921 = Sample 21463041 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:49:25] Total gradient norm stats for 262 steps: 0.1564 <= 0.1648 + 0.004209z <= 0.1764 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:49:25] Trained chunk 11 in 478.9s at 4482noun/s: lr=1.50e-03, loss=2.37e+00, top1=61.44%/58.782% |
|
[[38;5;39m INFO[0m][28-Jun-24 15:49:25] Chunk 12 = Batch 46113 = Sample 23609345 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:57:24] Total gradient norm stats for 262 steps: 0.1526 <= 0.1619 + 0.004693z <= 0.1828 |
|
[[38;5;39m INFO[0m][28-Jun-24 15:57:24] Trained chunk 12 in 478.8s at 4482noun/s: lr=1.50e-03, loss=2.29e+00, top1=62.24%/59.486% |
|
[[38;5;39m INFO[0m][28-Jun-24 15:57:24] Chunk 13 = Batch 50305 = Sample 25755649 |
|
[[38;5;39m INFO[0m][28-Jun-24 16:05:23] Total gradient norm stats for 262 steps: 0.1492 <= 0.1585 + 0.00464z <= 0.1796 |
|
[[38;5;39m INFO[0m][28-Jun-24 16:05:23] Trained chunk 13 in 478.3s at 4487noun/s: lr=1.50e-03, loss=2.23e+00, top1=64.69%/60.121% |
|
[[38;5;39m INFO[0m][28-Jun-24 16:05:23] Chunk 14 = Batch 54497 = Sample 27901953 |
|
[[38;5;39m INFO[0m][28-Jun-24 16:13:21] Total gradient norm stats for 262 steps: 0.1448 <= 0.1545 + 0.004277z <= 0.1771 |
|
[[38;5;39m INFO[0m][28-Jun-24 16:13:21] Trained chunk 14 in 478.4s at 4486noun/s: lr=1.50e-03, loss=2.16e+00, top1=65.05%/60.703% |
|
[[38;5;39m INFO[0m][28-Jun-24 16:13:21] Chunk 15 = Batch 58689 = Sample 30048257 |
|
[[38;5;39m INFO[0m][28-Jun-24 16:21:20] Total gradient norm stats for 262 steps: 0.1432 <= 0.1527 + 0.005304z <= 0.1744 |
|
[[38;5;39m INFO[0m][28-Jun-24 16:21:20] Trained chunk 15 in 479.1s at 4480noun/s: lr=1.50e-03, loss=2.11e+00, top1=64.87%/61.235% |
|
[[38;5;39m INFO[0m][28-Jun-24 16:21:20] Chunk 16 = Batch 62881 = Sample 32194561 |
|
[[38;5;39m INFO[0m][28-Jun-24 16:29:19] Total gradient norm stats for 262 steps: 0.1426 <= 0.1503 + 0.005078z <= 0.1772 |
|
[[38;5;39m INFO[0m][28-Jun-24 16:29:19] Trained chunk 16 in 479.1s at 4480noun/s: lr=1.50e-03, loss=2.06e+00, top1=63.23%/61.724% |
|
[[38;5;39m INFO[0m][28-Jun-24 16:29:19] Chunk 17 = Batch 67073 = Sample 34340865 |
|
[[38;5;39m INFO[0m][28-Jun-24 16:37:18] Total gradient norm stats for 262 steps: 0.1402 <= 0.1493 + 0.005045z <= 0.1721 |
|
[[38;5;39m INFO[0m][28-Jun-24 16:37:18] Trained chunk 17 in 478.3s at 4487noun/s: lr=1.49e-03, loss=2.01e+00, top1=64.62%/62.177% |
|
[[38;5;39m INFO[0m][28-Jun-24 16:37:18] Chunk 18 = Batch 71265 = Sample 36487169 |
|
[[38;5;39m INFO[0m][28-Jun-24 16:45:16] Total gradient norm stats for 262 steps: 0.1385 <= 0.147 + 0.005598z <= 0.1775 |
|
[[38;5;39m INFO[0m][28-Jun-24 16:45:16] Trained chunk 18 in 478.0s at 4490noun/s: lr=1.49e-03, loss=1.97e+00, top1=65.62%/62.595% |
|
[[38;5;39m INFO[0m][28-Jun-24 16:45:16] Chunk 19 = Batch 75457 = Sample 38633473 |
|
[[38;5;39m INFO[0m][28-Jun-24 16:53:14] Total gradient norm stats for 262 steps: 0.1373 <= 0.1453 + 0.00463z <= 0.171 |
|
[[38;5;39m INFO[0m][28-Jun-24 16:53:14] Trained chunk 19 in 478.9s at 4481noun/s: lr=1.49e-03, loss=1.93e+00, top1=65.78%/62.984% |
|
[[38;5;39m INFO[0m][28-Jun-24 16:53:14] Chunk 20 = Batch 79649 = Sample 40779777 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:01:13] Total gradient norm stats for 262 steps: 0.1361 <= 0.1442 + 0.00464z <= 0.1687 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:01:13] Trained chunk 20 in 478.6s at 4485noun/s: lr=1.49e-03, loss=1.90e+00, top1=65.25%/63.342% |
|
[[38;5;39m INFO[0m][28-Jun-24 17:01:13] Chunk 21 = Batch 83841 = Sample 42926081 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:09:12] Total gradient norm stats for 262 steps: 0.1356 <= 0.1434 + 0.004749z <= 0.1698 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:09:12] Trained chunk 21 in 478.7s at 4484noun/s: lr=1.49e-03, loss=1.87e+00, top1=66.74%/63.681% |
|
[[38;5;39m INFO[0m][28-Jun-24 17:09:12] Chunk 22 = Batch 88033 = Sample 45072385 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:17:11] Total gradient norm stats for 262 steps: 0.1345 <= 0.1428 + 0.005068z <= 0.1632 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:17:11] Trained chunk 22 in 479.3s at 4478noun/s: lr=1.49e-03, loss=1.84e+00, top1=64.93%/63.995% |
|
[[38;5;39m INFO[0m][28-Jun-24 17:17:11] Chunk 23 = Batch 92225 = Sample 47218689 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:25:09] Total gradient norm stats for 262 steps: 0.1324 <= 0.1423 + 0.00569z <= 0.1747 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:25:09] Trained chunk 23 in 478.3s at 4487noun/s: lr=1.49e-03, loss=1.81e+00, top1=65.30%/64.286% |
|
[[38;5;39m INFO[0m][28-Jun-24 17:25:09] Chunk 24 = Batch 96417 = Sample 49364993 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:33:07] Total gradient norm stats for 262 steps: 0.1335 <= 0.1416 + 0.004568z <= 0.1595 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:33:07] Trained chunk 24 in 477.9s at 4491noun/s: lr=1.49e-03, loss=1.78e+00, top1=66.47%/64.558% |
|
[[38;5;39m INFO[0m][28-Jun-24 17:33:07] Chunk 25 = Batch 100609 = Sample 51511297 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:33:55] Epoch 1 finished in 11538.7s |
|
[[38;5;39m INFO[0m][28-Jun-24 17:33:55] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][28-Jun-24 17:33:55] Epoch 2 = Batch 101009 = Sample 51716097 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:41:09] Total gradient norm stats for 262 steps: 0.132 <= 0.141 + 0.004917z <= 0.1573 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:41:09] Trained chunk 25 in 481.8s at 4455noun/s: lr=1.49e-03, loss=1.76e+00, top1=65.75%/64.814% |
|
[[38;5;39m INFO[0m][28-Jun-24 17:41:09] Chunk 26 = Batch 104801 = Sample 53657601 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:49:07] Total gradient norm stats for 262 steps: 0.132 <= 0.142 + 0.006522z <= 0.1812 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:49:07] Trained chunk 26 in 477.7s at 4493noun/s: lr=1.49e-03, loss=1.74e+00, top1=65.81%/65.052% |
|
[[38;5;39m INFO[0m][28-Jun-24 17:49:07] Chunk 27 = Batch 108993 = Sample 55803905 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:57:05] Total gradient norm stats for 262 steps: 0.1326 <= 0.1409 + 0.005022z <= 0.173 |
|
[[38;5;39m INFO[0m][28-Jun-24 17:57:05] Trained chunk 27 in 478.6s at 4485noun/s: lr=1.49e-03, loss=1.72e+00, top1=66.34%/65.276% |
|
[[38;5;39m INFO[0m][28-Jun-24 17:57:05] Chunk 28 = Batch 113185 = Sample 57950209 |
|
[[38;5;39m INFO[0m][28-Jun-24 18:05:03] Total gradient norm stats for 262 steps: 0.1319 <= 0.1418 + 0.005825z <= 0.1756 |
|
[[38;5;39m INFO[0m][28-Jun-24 18:05:03] Trained chunk 28 in 478.1s at 4490noun/s: lr=1.49e-03, loss=1.70e+00, top1=66.10%/65.488% |
|
[[38;5;39m INFO[0m][28-Jun-24 18:05:03] Chunk 29 = Batch 117377 = Sample 60096513 |
|
[[38;5;39m INFO[0m][28-Jun-24 18:13:01] Total gradient norm stats for 262 steps: 0.1333 <= 0.1431 + 0.00651z <= 0.1711 |
|
[[38;5;39m INFO[0m][28-Jun-24 18:13:01] Trained chunk 29 in 477.8s at 4492noun/s: lr=1.48e-03, loss=1.68e+00, top1=66.57%/65.688% |
|
[[38;5;39m INFO[0m][28-Jun-24 18:13:01] Chunk 30 = Batch 121569 = Sample 62242817 |
|
[[38;5;39m INFO[0m][28-Jun-24 18:21:00] Total gradient norm stats for 262 steps: 0.1311 <= 0.143 + 0.005929z <= 0.1706 |
|
[[38;5;39m INFO[0m][28-Jun-24 18:21:00] Trained chunk 30 in 478.3s at 4488noun/s: lr=1.48e-03, loss=1.67e+00, top1=65.33%/65.873% |
|
[[38;5;39m INFO[0m][28-Jun-24 18:21:00] Chunk 31 = Batch 125761 = Sample 64389121 |
|
[[38;5;39m INFO[0m][28-Jun-24 18:28:57] Total gradient norm stats for 262 steps: 0.1306 <= 0.1428 + 0.005011z <= 0.167 |
|
[[38;5;39m INFO[0m][28-Jun-24 18:28:57] Trained chunk 31 in 477.8s at 4492noun/s: lr=1.48e-03, loss=1.65e+00, top1=66.72%/66.048% |
|
[[38;5;39m INFO[0m][28-Jun-24 18:28:57] Chunk 32 = Batch 129953 = Sample 66535425 |
|
[[38;5;39m INFO[0m][28-Jun-24 18:36:54] Total gradient norm stats for 262 steps: 0.133 <= 0.1446 + 0.00642z <= 0.1685 |
|
[[38;5;39m INFO[0m][28-Jun-24 18:36:54] Trained chunk 32 in 476.7s at 4502noun/s: lr=1.48e-03, loss=1.64e+00, top1=67.32%/66.211% |
|
[[38;5;39m INFO[0m][28-Jun-24 18:36:54] Chunk 33 = Batch 134145 = Sample 68681729 |
|
[[38;5;39m INFO[0m][28-Jun-24 18:44:53] Total gradient norm stats for 262 steps: 0.1335 <= 0.1442 + 0.005488z <= 0.1647 |
|
[[38;5;39m INFO[0m][28-Jun-24 18:44:53] Trained chunk 33 in 478.7s at 4483noun/s: lr=1.48e-03, loss=1.63e+00, top1=66.56%/66.366% |
|
[[38;5;39m INFO[0m][28-Jun-24 18:44:53] Chunk 34 = Batch 138337 = Sample 70828033 |
|
[[38;5;39m INFO[0m][28-Jun-24 18:52:52] Total gradient norm stats for 262 steps: 0.1334 <= 0.1465 + 0.006866z <= 0.1758 |
|
[[38;5;39m INFO[0m][28-Jun-24 18:52:52] Trained chunk 34 in 478.7s at 4483noun/s: lr=1.48e-03, loss=1.61e+00, top1=67.88%/66.514% |
|
[[38;5;39m INFO[0m][28-Jun-24 18:52:52] Chunk 35 = Batch 142529 = Sample 72974337 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:00:50] Total gradient norm stats for 262 steps: 0.1351 <= 0.1461 + 0.005972z <= 0.1675 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:00:50] Trained chunk 35 in 478.4s at 4486noun/s: lr=1.48e-03, loss=1.60e+00, top1=67.84%/66.651% |
|
[[38;5;39m INFO[0m][28-Jun-24 19:00:50] Chunk 36 = Batch 146721 = Sample 75120641 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:08:48] Total gradient norm stats for 262 steps: 0.1356 <= 0.148 + 0.006556z <= 0.1741 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:08:48] Trained chunk 36 in 478.3s at 4487noun/s: lr=1.48e-03, loss=1.59e+00, top1=66.86%/66.786% |
|
[[38;5;39m INFO[0m][28-Jun-24 19:08:48] Chunk 37 = Batch 150913 = Sample 77266945 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:16:47] Total gradient norm stats for 262 steps: 0.1387 <= 0.1511 + 0.008419z <= 0.1954 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:16:47] Trained chunk 37 in 478.9s at 4482noun/s: lr=1.47e-03, loss=1.58e+00, top1=67.00%/66.907% |
|
[[38;5;39m INFO[0m][28-Jun-24 19:16:47] Chunk 38 = Batch 155105 = Sample 79413249 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:24:45] Total gradient norm stats for 262 steps: 0.1355 <= 0.1528 + 0.00959z <= 0.197 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:24:45] Trained chunk 38 in 478.2s at 4488noun/s: lr=1.47e-03, loss=1.57e+00, top1=66.55%/67.026% |
|
[[38;5;39m INFO[0m][28-Jun-24 19:24:45] Chunk 39 = Batch 159297 = Sample 81559553 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:32:44] Total gradient norm stats for 262 steps: 0.1372 <= 0.1538 + 0.009577z <= 0.1973 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:32:44] Trained chunk 39 in 478.4s at 4486noun/s: lr=1.47e-03, loss=1.56e+00, top1=67.52%/67.135% |
|
[[38;5;39m INFO[0m][28-Jun-24 19:32:44] Chunk 40 = Batch 163489 = Sample 83705857 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:40:42] Total gradient norm stats for 262 steps: 0.1376 <= 0.1559 + 0.009563z <= 0.1932 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:40:42] Trained chunk 40 in 478.2s at 4488noun/s: lr=1.47e-03, loss=1.55e+00, top1=67.52%/67.240% |
|
[[38;5;39m INFO[0m][28-Jun-24 19:40:42] Chunk 41 = Batch 167681 = Sample 85852161 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:48:40] Total gradient norm stats for 262 steps: 0.1399 <= 0.157 + 0.01006z <= 0.2007 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:48:40] Trained chunk 41 in 477.7s at 4493noun/s: lr=1.47e-03, loss=1.55e+00, top1=67.11%/67.336% |
|
[[38;5;39m INFO[0m][28-Jun-24 19:48:40] Chunk 42 = Batch 171873 = Sample 87998465 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:56:39] Total gradient norm stats for 262 steps: 0.1394 <= 0.1578 + 0.01082z <= 0.2039 |
|
[[38;5;39m INFO[0m][28-Jun-24 19:56:39] Trained chunk 42 in 478.8s at 4483noun/s: lr=1.47e-03, loss=1.54e+00, top1=67.67%/67.426% |
|
[[38;5;39m INFO[0m][28-Jun-24 19:56:39] Chunk 43 = Batch 176065 = Sample 90144769 |
|
[[38;5;39m INFO[0m][28-Jun-24 20:04:37] Total gradient norm stats for 262 steps: 0.1421 <= 0.1614 + 0.01217z <= 0.2138 |
|
[[38;5;39m INFO[0m][28-Jun-24 20:04:37] Trained chunk 43 in 478.0s at 4490noun/s: lr=1.47e-03, loss=1.53e+00, top1=68.21%/67.512% |
|
[[38;5;39m INFO[0m][28-Jun-24 20:04:37] Chunk 44 = Batch 180257 = Sample 92291073 |
|
[[38;5;39m INFO[0m][28-Jun-24 20:12:35] Total gradient norm stats for 262 steps: 0.1442 <= 0.1609 + 0.01059z <= 0.1991 |
|
[[38;5;39m INFO[0m][28-Jun-24 20:12:35] Trained chunk 44 in 478.3s at 4488noun/s: lr=1.46e-03, loss=1.53e+00, top1=68.36%/67.594% |
|
[[38;5;39m INFO[0m][28-Jun-24 20:12:35] Chunk 45 = Batch 184449 = Sample 94437377 |
|
[[38;5;39m INFO[0m][28-Jun-24 20:20:33] Total gradient norm stats for 262 steps: 0.1417 <= 0.1638 + 0.01196z <= 0.2107 |
|
[[38;5;39m INFO[0m][28-Jun-24 20:20:33] Trained chunk 45 in 478.6s at 4485noun/s: lr=1.46e-03, loss=1.52e+00, top1=68.55%/67.671% |
|
[[38;5;39m INFO[0m][28-Jun-24 20:20:33] Chunk 46 = Batch 188641 = Sample 96583681 |
|
[[38;5;39m INFO[0m][28-Jun-24 20:28:32] Total gradient norm stats for 262 steps: 0.1454 <= 0.1651 + 0.01307z <= 0.2297 |
|
[[38;5;39m INFO[0m][28-Jun-24 20:28:32] Trained chunk 46 in 478.2s at 4488noun/s: lr=1.46e-03, loss=1.52e+00, top1=68.14%/67.745% |
|
[[38;5;39m INFO[0m][28-Jun-24 20:28:32] Chunk 47 = Batch 192833 = Sample 98729985 |
|
[[38;5;39m INFO[0m][28-Jun-24 20:36:30] Total gradient norm stats for 262 steps: 0.1465 <= 0.1674 + 0.01163z <= 0.2294 |
|
[[38;5;39m INFO[0m][28-Jun-24 20:36:30] Trained chunk 47 in 478.0s at 4490noun/s: lr=1.46e-03, loss=1.51e+00, top1=67.71%/67.810% |
|
[[38;5;39m INFO[0m][28-Jun-24 20:36:30] Chunk 48 = Batch 197025 = Sample 100876289 |
|
[[38;5;39m INFO[0m][28-Jun-24 20:44:28] Total gradient norm stats for 262 steps: 0.1486 <= 0.1724 + 0.01604z <= 0.2383 |
|
[[38;5;39m INFO[0m][28-Jun-24 20:44:28] Trained chunk 48 in 478.0s at 4490noun/s: lr=1.46e-03, loss=1.51e+00, top1=67.77%/67.872% |
|
[[38;5;39m INFO[0m][28-Jun-24 20:44:28] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240628_142131/ovod_chunk0048_20240628_204428.train |
|
[[38;5;39m INFO[0m][28-Jun-24 20:44:28] Chunk 49 = Batch 201217 = Sample 103022593 |
|
[[38;5;39m INFO[0m][28-Jun-24 20:46:01] Epoch 2 finished in 11526.4s |
|
[[38;5;39m INFO[0m][28-Jun-24 20:46:01] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][28-Jun-24 20:46:01] Epoch 3 = Batch 202017 = Sample 103432193 |
|
[[38;5;39m INFO[0m][28-Jun-24 20:52:29] Total gradient norm stats for 262 steps: 0.1474 <= 0.1738 + 0.01609z <= 0.246 |
|
[[38;5;39m INFO[0m][28-Jun-24 20:52:29] Trained chunk 49 in 480.9s at 4463noun/s: lr=1.45e-03, loss=1.50e+00, top1=67.89%/67.934% |
|
[[38;5;39m INFO[0m][28-Jun-24 20:52:29] Chunk 50 = Batch 205409 = Sample 105168897 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:00:27] Total gradient norm stats for 262 steps: 0.1477 <= 0.1754 + 0.0173z <= 0.2916 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:00:27] Trained chunk 50 in 478.5s at 4485noun/s: lr=1.45e-03, loss=1.50e+00, top1=69.45%/67.991% |
|
[[38;5;39m INFO[0m][28-Jun-24 21:00:28] Chunk 51 = Batch 209601 = Sample 107315201 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:08:26] Total gradient norm stats for 262 steps: 0.1544 <= 0.1782 + 0.01526z <= 0.2313 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:08:26] Trained chunk 51 in 478.0s at 4490noun/s: lr=1.45e-03, loss=1.49e+00, top1=68.40%/68.045% |
|
[[38;5;39m INFO[0m][28-Jun-24 21:08:26] Chunk 52 = Batch 213793 = Sample 109461505 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:16:25] Total gradient norm stats for 262 steps: 0.1524 <= 0.178 + 0.01761z <= 0.2716 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:16:25] Trained chunk 52 in 479.5s at 4476noun/s: lr=1.45e-03, loss=1.49e+00, top1=67.98%/68.094% |
|
[[38;5;39m INFO[0m][28-Jun-24 21:16:25] Chunk 53 = Batch 217985 = Sample 111607809 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:24:24] Total gradient norm stats for 262 steps: 0.1541 <= 0.1778 + 0.01436z <= 0.2251 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:24:24] Trained chunk 53 in 478.8s at 4482noun/s: lr=1.45e-03, loss=1.49e+00, top1=68.00%/68.141% |
|
[[38;5;39m INFO[0m][28-Jun-24 21:24:24] Chunk 54 = Batch 222177 = Sample 113754113 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:32:23] Total gradient norm stats for 262 steps: 0.1537 <= 0.1802 + 0.01945z <= 0.3632 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:32:23] Trained chunk 54 in 478.8s at 4483noun/s: lr=1.45e-03, loss=1.48e+00, top1=67.76%/68.183% |
|
[[38;5;39m INFO[0m][28-Jun-24 21:32:23] Chunk 55 = Batch 226369 = Sample 115900417 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:40:21] Total gradient norm stats for 262 steps: 0.1557 <= 0.1822 + 0.01585z <= 0.2885 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:40:21] Trained chunk 55 in 478.0s at 4490noun/s: lr=1.44e-03, loss=1.48e+00, top1=67.76%/68.224% |
|
[[38;5;39m INFO[0m][28-Jun-24 21:40:21] Chunk 56 = Batch 230561 = Sample 118046721 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:48:19] Total gradient norm stats for 262 steps: 0.157 <= 0.1833 + 0.01606z <= 0.2633 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:48:19] Trained chunk 56 in 478.6s at 4485noun/s: lr=1.44e-03, loss=1.48e+00, top1=68.51%/68.260% |
|
[[38;5;39m INFO[0m][28-Jun-24 21:48:19] Chunk 57 = Batch 234753 = Sample 120193025 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:56:18] Total gradient norm stats for 262 steps: 0.162 <= 0.1867 + 0.0209z <= 0.3553 |
|
[[38;5;39m INFO[0m][28-Jun-24 21:56:18] Trained chunk 57 in 479.0s at 4481noun/s: lr=1.44e-03, loss=1.48e+00, top1=68.86%/68.296% |
|
[[38;5;39m INFO[0m][28-Jun-24 21:56:18] Chunk 58 = Batch 238945 = Sample 122339329 |
|
[[38;5;39m INFO[0m][28-Jun-24 22:04:18] Total gradient norm stats for 262 steps: 0.1602 <= 0.1822 + 0.01405z <= 0.2403 |
|
[[38;5;39m INFO[0m][28-Jun-24 22:04:18] Trained chunk 58 in 479.5s at 4476noun/s: lr=1.44e-03, loss=1.47e+00, top1=68.32%/68.328% |
|
[[38;5;39m INFO[0m][28-Jun-24 22:04:18] Chunk 59 = Batch 243137 = Sample 124485633 |
|
[[38;5;39m INFO[0m][28-Jun-24 22:12:17] Total gradient norm stats for 262 steps: 0.1608 <= 0.1864 + 0.01737z <= 0.2732 |
|
[[38;5;39m INFO[0m][28-Jun-24 22:12:17] Trained chunk 59 in 479.0s at 4480noun/s: lr=1.43e-03, loss=1.47e+00, top1=69.32%/68.358% |
|
[[38;5;39m INFO[0m][28-Jun-24 22:12:17] Chunk 60 = Batch 247329 = Sample 126631937 |
|
[[38;5;39m INFO[0m][28-Jun-24 22:20:16] Total gradient norm stats for 262 steps: 0.1602 <= 0.1879 + 0.02693z <= 0.5483 |
|
[[38;5;39m INFO[0m][28-Jun-24 22:20:16] Trained chunk 60 in 479.2s at 4479noun/s: lr=1.43e-03, loss=1.47e+00, top1=68.65%/68.384% |
|
[[38;5;39m INFO[0m][28-Jun-24 22:20:16] Chunk 61 = Batch 251521 = Sample 128778241 |
|
[[38;5;39m INFO[0m][28-Jun-24 22:28:14] Total gradient norm stats for 262 steps: 0.1615 <= 0.1881 + 0.01481z <= 0.2458 |
|
[[38;5;39m INFO[0m][28-Jun-24 22:28:14] Trained chunk 61 in 478.5s at 4486noun/s: lr=1.43e-03, loss=1.47e+00, top1=68.82%/68.413% |
|
[[38;5;39m INFO[0m][28-Jun-24 22:28:14] Chunk 62 = Batch 255713 = Sample 130924545 |
|
[[38;5;39m INFO[0m][28-Jun-24 22:36:13] Total gradient norm stats for 262 steps: 0.1637 <= 0.1898 + 0.01498z <= 0.2563 |
|
[[38;5;39m INFO[0m][28-Jun-24 22:36:13] Trained chunk 62 in 478.8s at 4482noun/s: lr=1.43e-03, loss=1.47e+00, top1=68.89%/68.440% |
|
[[38;5;39m INFO[0m][28-Jun-24 22:36:13] Chunk 63 = Batch 259905 = Sample 133070849 |
|
[[38;5;39m INFO[0m][28-Jun-24 22:44:12] Total gradient norm stats for 262 steps: 0.1633 <= 0.1884 + 0.01379z <= 0.2502 |
|
[[38;5;39m INFO[0m][28-Jun-24 22:44:12] Trained chunk 63 in 479.1s at 4480noun/s: lr=1.43e-03, loss=1.47e+00, top1=69.33%/68.466% |
|
[[38;5;39m INFO[0m][28-Jun-24 22:44:12] Chunk 64 = Batch 264097 = Sample 135217153 |
|
[[38;5;39m INFO[0m][28-Jun-24 22:52:10] Total gradient norm stats for 262 steps: 0.1677 <= 0.1909 + 0.01438z <= 0.2544 |
|
[[38;5;39m INFO[0m][28-Jun-24 22:52:10] Trained chunk 64 in 477.8s at 4492noun/s: lr=1.42e-03, loss=1.46e+00, top1=68.19%/68.487% |
|
[[38;5;39m INFO[0m][28-Jun-24 22:52:10] Chunk 65 = Batch 268289 = Sample 137363457 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:00:09] Total gradient norm stats for 262 steps: 0.1659 <= 0.1935 + 0.01565z <= 0.2617 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:00:09] Trained chunk 65 in 479.0s at 4481noun/s: lr=1.42e-03, loss=1.46e+00, top1=69.43%/68.509% |
|
[[38;5;39m INFO[0m][28-Jun-24 23:00:09] Chunk 66 = Batch 272481 = Sample 139509761 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:08:08] Total gradient norm stats for 262 steps: 0.1673 <= 0.1917 + 0.01385z <= 0.2551 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:08:08] Trained chunk 66 in 479.0s at 4480noun/s: lr=1.42e-03, loss=1.46e+00, top1=68.32%/68.528% |
|
[[38;5;39m INFO[0m][28-Jun-24 23:08:08] Chunk 67 = Batch 276673 = Sample 141656065 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:16:07] Total gradient norm stats for 262 steps: 0.1699 <= 0.1936 + 0.01621z <= 0.2722 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:16:07] Trained chunk 67 in 478.7s at 4483noun/s: lr=1.42e-03, loss=1.46e+00, top1=68.52%/68.545% |
|
[[38;5;39m INFO[0m][28-Jun-24 23:16:07] Chunk 68 = Batch 280865 = Sample 143802369 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:24:06] Total gradient norm stats for 262 steps: 0.1682 <= 0.1923 + 0.01344z <= 0.2378 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:24:06] Trained chunk 68 in 478.8s at 4483noun/s: lr=1.41e-03, loss=1.46e+00, top1=68.05%/68.569% |
|
[[38;5;39m INFO[0m][28-Jun-24 23:24:06] Chunk 69 = Batch 285057 = Sample 145948673 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:32:04] Total gradient norm stats for 262 steps: 0.1649 <= 0.1922 + 0.01315z <= 0.2611 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:32:04] Trained chunk 69 in 478.2s at 4488noun/s: lr=1.41e-03, loss=1.46e+00, top1=68.88%/68.586% |
|
[[38;5;39m INFO[0m][28-Jun-24 23:32:04] Chunk 70 = Batch 289249 = Sample 148094977 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:40:02] Total gradient norm stats for 262 steps: 0.1704 <= 0.195 + 0.0133z <= 0.2413 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:40:02] Trained chunk 70 in 478.4s at 4486noun/s: lr=1.41e-03, loss=1.46e+00, top1=67.65%/68.603% |
|
[[38;5;39m INFO[0m][28-Jun-24 23:40:02] Chunk 71 = Batch 293441 = Sample 150241281 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:48:01] Total gradient norm stats for 262 steps: 0.1691 <= 0.1939 + 0.01429z <= 0.2702 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:48:01] Trained chunk 71 in 478.3s at 4487noun/s: lr=1.41e-03, loss=1.46e+00, top1=68.25%/68.617% |
|
[[38;5;39m INFO[0m][28-Jun-24 23:48:01] Chunk 72 = Batch 297633 = Sample 152387585 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:55:59] Total gradient norm stats for 262 steps: 0.1728 <= 0.1961 + 0.01397z <= 0.2505 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:55:59] Trained chunk 72 in 478.5s at 4486noun/s: lr=1.40e-03, loss=1.46e+00, top1=68.81%/68.631% |
|
[[38;5;39m INFO[0m][28-Jun-24 23:55:59] Chunk 73 = Batch 301825 = Sample 154533889 |
|
[[38;5;39m INFO[0m][28-Jun-24 23:58:18] Epoch 3 finished in 11536.6s |
|
[[38;5;39m INFO[0m][28-Jun-24 23:58:18] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][28-Jun-24 23:58:18] Epoch 4 = Batch 303025 = Sample 155148289 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:04:00] Total gradient norm stats for 262 steps: 0.1729 <= 0.1967 + 0.01242z <= 0.2486 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:04:00] Trained chunk 73 in 480.5s at 4467noun/s: lr=1.40e-03, loss=1.45e+00, top1=68.52%/68.645% |
|
[[38;5;39m INFO[0m][29-Jun-24 00:04:00] Chunk 74 = Batch 306017 = Sample 156680193 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:11:58] Total gradient norm stats for 262 steps: 0.1741 <= 0.1973 + 0.01249z <= 0.2497 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:11:58] Trained chunk 74 in 478.4s at 4486noun/s: lr=1.40e-03, loss=1.45e+00, top1=68.25%/68.662% |
|
[[38;5;39m INFO[0m][29-Jun-24 00:11:58] Chunk 75 = Batch 310209 = Sample 158826497 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:19:57] Total gradient norm stats for 262 steps: 0.1701 <= 0.1984 + 0.01375z <= 0.2492 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:19:57] Trained chunk 75 in 479.1s at 4479noun/s: lr=1.39e-03, loss=1.45e+00, top1=68.03%/68.681% |
|
[[38;5;39m INFO[0m][29-Jun-24 00:19:57] Chunk 76 = Batch 314401 = Sample 160972801 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:27:55] Total gradient norm stats for 262 steps: 0.1729 <= 0.2021 + 0.03453z <= 0.7111 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:27:55] Trained chunk 76 in 478.2s at 4488noun/s: lr=1.39e-03, loss=1.45e+00, top1=69.68%/68.694% |
|
[[38;5;39m INFO[0m][29-Jun-24 00:27:55] Chunk 77 = Batch 318593 = Sample 163119105 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:35:54] Total gradient norm stats for 262 steps: 0.1742 <= 0.1972 + 0.0107z <= 0.2455 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:35:54] Trained chunk 77 in 478.2s at 4488noun/s: lr=1.39e-03, loss=1.45e+00, top1=68.72%/68.704% |
|
[[38;5;39m INFO[0m][29-Jun-24 00:35:54] Chunk 78 = Batch 322785 = Sample 165265409 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:43:52] Total gradient norm stats for 262 steps: 0.1755 <= 0.1989 + 0.01337z <= 0.2522 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:43:52] Trained chunk 78 in 478.6s at 4484noun/s: lr=1.39e-03, loss=1.45e+00, top1=68.67%/68.716% |
|
[[38;5;39m INFO[0m][29-Jun-24 00:43:52] Chunk 79 = Batch 326977 = Sample 167411713 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:51:51] Total gradient norm stats for 262 steps: 0.1768 <= 0.2012 + 0.01319z <= 0.283 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:51:51] Trained chunk 79 in 478.5s at 4485noun/s: lr=1.38e-03, loss=1.45e+00, top1=68.83%/68.728% |
|
[[38;5;39m INFO[0m][29-Jun-24 00:51:51] Chunk 80 = Batch 331169 = Sample 169558017 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:59:49] Total gradient norm stats for 262 steps: 0.1763 <= 0.2034 + 0.0172z <= 0.3867 |
|
[[38;5;39m INFO[0m][29-Jun-24 00:59:49] Trained chunk 80 in 478.5s at 4485noun/s: lr=1.38e-03, loss=1.45e+00, top1=69.87%/68.741% |
|
[[38;5;39m INFO[0m][29-Jun-24 00:59:49] Chunk 81 = Batch 335361 = Sample 171704321 |
|
[[38;5;39m INFO[0m][29-Jun-24 01:07:48] Total gradient norm stats for 262 steps: 0.1789 <= 0.202 + 0.03363z <= 0.7079 |
|
[[38;5;39m INFO[0m][29-Jun-24 01:07:48] Trained chunk 81 in 479.0s at 4481noun/s: lr=1.38e-03, loss=1.45e+00, top1=68.31%/68.753% |
|
[[38;5;39m INFO[0m][29-Jun-24 01:07:48] Chunk 82 = Batch 339553 = Sample 173850625 |
|
[[38;5;39m INFO[0m][29-Jun-24 01:15:46] Total gradient norm stats for 262 steps: 0.1798 <= 0.201 + 0.01177z <= 0.2487 |
|
[[38;5;39m INFO[0m][29-Jun-24 01:15:46] Trained chunk 82 in 478.0s at 4490noun/s: lr=1.37e-03, loss=1.45e+00, top1=69.82%/68.761% |
|
[[38;5;39m INFO[0m][29-Jun-24 01:15:46] Chunk 83 = Batch 343745 = Sample 175996929 |
|
[[38;5;39m INFO[0m][29-Jun-24 01:23:45] Total gradient norm stats for 262 steps: 0.18 <= 0.2013 + 0.01555z <= 0.3763 |
|
[[38;5;39m INFO[0m][29-Jun-24 01:23:45] Trained chunk 83 in 478.3s at 4488noun/s: lr=1.37e-03, loss=1.45e+00, top1=69.15%/68.772% |
|
[[38;5;39m INFO[0m][29-Jun-24 01:23:45] Chunk 84 = Batch 347937 = Sample 178143233 |
|
[[38;5;39m INFO[0m][29-Jun-24 01:31:44] Total gradient norm stats for 262 steps: 0.1776 <= 0.2012 + 0.01142z <= 0.2384 |
|
[[38;5;39m INFO[0m][29-Jun-24 01:31:44] Trained chunk 84 in 479.1s at 4480noun/s: lr=1.37e-03, loss=1.45e+00, top1=69.65%/68.784% |
|
[[38;5;39m INFO[0m][29-Jun-24 01:31:44] Chunk 85 = Batch 352129 = Sample 180289537 |
|
[[38;5;39m INFO[0m][29-Jun-24 01:39:42] Total gradient norm stats for 262 steps: 0.1778 <= 0.2033 + 0.01412z <= 0.2526 |
|
[[38;5;39m INFO[0m][29-Jun-24 01:39:42] Trained chunk 85 in 477.9s at 4491noun/s: lr=1.36e-03, loss=1.45e+00, top1=69.05%/68.796% |
|
[[38;5;39m INFO[0m][29-Jun-24 01:39:42] Chunk 86 = Batch 356321 = Sample 182435841 |
|
[[38;5;39m INFO[0m][29-Jun-24 01:47:39] Total gradient norm stats for 262 steps: 0.1808 <= 0.2031 + 0.0127z <= 0.3049 |
|
[[38;5;39m INFO[0m][29-Jun-24 01:47:39] Trained chunk 86 in 477.4s at 4496noun/s: lr=1.36e-03, loss=1.45e+00, top1=68.63%/68.808% |
|
[[38;5;39m INFO[0m][29-Jun-24 01:47:39] Chunk 87 = Batch 360513 = Sample 184582145 |
|
[[38;5;39m INFO[0m][29-Jun-24 01:55:38] Total gradient norm stats for 262 steps: 0.1811 <= 0.2028 + 0.01124z <= 0.2392 |
|
[[38;5;39m INFO[0m][29-Jun-24 01:55:38] Trained chunk 87 in 478.5s at 4485noun/s: lr=1.36e-03, loss=1.44e+00, top1=68.78%/68.818% |
|
[[38;5;39m INFO[0m][29-Jun-24 01:55:38] Chunk 88 = Batch 364705 = Sample 186728449 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:03:36] Total gradient norm stats for 262 steps: 0.1821 <= 0.2025 + 0.01111z <= 0.2496 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:03:36] Trained chunk 88 in 478.7s at 4483noun/s: lr=1.36e-03, loss=1.44e+00, top1=68.55%/68.828% |
|
[[38;5;39m INFO[0m][29-Jun-24 02:03:36] Chunk 89 = Batch 368897 = Sample 188874753 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:11:35] Total gradient norm stats for 262 steps: 0.1831 <= 0.2018 + 0.01055z <= 0.2372 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:11:35] Trained chunk 89 in 478.5s at 4486noun/s: lr=1.35e-03, loss=1.44e+00, top1=68.58%/68.837% |
|
[[38;5;39m INFO[0m][29-Jun-24 02:11:35] Chunk 90 = Batch 373089 = Sample 191021057 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:19:32] Total gradient norm stats for 262 steps: 0.1834 <= 0.2045 + 0.01093z <= 0.2632 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:19:32] Trained chunk 90 in 477.5s at 4495noun/s: lr=1.35e-03, loss=1.44e+00, top1=70.10%/68.845% |
|
[[38;5;39m INFO[0m][29-Jun-24 02:19:32] Chunk 91 = Batch 377281 = Sample 193167361 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:27:31] Total gradient norm stats for 262 steps: 0.1833 <= 0.2039 + 0.01088z <= 0.2655 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:27:31] Trained chunk 91 in 478.9s at 4482noun/s: lr=1.35e-03, loss=1.44e+00, top1=70.02%/68.853% |
|
[[38;5;39m INFO[0m][29-Jun-24 02:27:31] Chunk 92 = Batch 381473 = Sample 195313665 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:35:29] Total gradient norm stats for 262 steps: 0.183 <= 0.203 + 0.01468z <= 0.3703 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:35:29] Trained chunk 92 in 478.1s at 4489noun/s: lr=1.34e-03, loss=1.44e+00, top1=70.12%/68.861% |
|
[[38;5;39m INFO[0m][29-Jun-24 02:35:29] Chunk 93 = Batch 385665 = Sample 197459969 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:43:27] Total gradient norm stats for 262 steps: 0.1847 <= 0.2057 + 0.03306z <= 0.716 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:43:27] Trained chunk 93 in 477.8s at 4492noun/s: lr=1.34e-03, loss=1.44e+00, top1=69.13%/68.872% |
|
[[38;5;39m INFO[0m][29-Jun-24 02:43:27] Chunk 94 = Batch 389857 = Sample 199606273 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:51:25] Total gradient norm stats for 262 steps: 0.1862 <= 0.2046 + 0.0112z <= 0.2609 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:51:25] Trained chunk 94 in 478.2s at 4488noun/s: lr=1.34e-03, loss=1.44e+00, top1=69.11%/68.882% |
|
[[38;5;39m INFO[0m][29-Jun-24 02:51:25] Chunk 95 = Batch 394049 = Sample 201752577 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:59:24] Total gradient norm stats for 262 steps: 0.1814 <= 0.2048 + 0.01233z <= 0.3096 |
|
[[38;5;39m INFO[0m][29-Jun-24 02:59:24] Trained chunk 95 in 478.7s at 4484noun/s: lr=1.33e-03, loss=1.44e+00, top1=69.57%/68.891% |
|
[[38;5;39m INFO[0m][29-Jun-24 02:59:24] Chunk 96 = Batch 398241 = Sample 203898881 |
|
[[38;5;39m INFO[0m][29-Jun-24 03:07:23] Total gradient norm stats for 262 steps: 0.185 <= 0.2051 + 0.01012z <= 0.2432 |
|
[[38;5;39m INFO[0m][29-Jun-24 03:07:23] Trained chunk 96 in 478.7s at 4484noun/s: lr=1.33e-03, loss=1.44e+00, top1=68.22%/68.899% |
|
[[38;5;39m INFO[0m][29-Jun-24 03:07:23] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240628_142131/ovod_chunk0096_20240629_030723.train |
|
[[38;5;39m INFO[0m][29-Jun-24 03:07:23] Chunk 97 = Batch 402433 = Sample 206045185 |
|
[[38;5;39m INFO[0m][29-Jun-24 03:10:27] Epoch 4 finished in 11529.7s |
|
[[38;5;39m INFO[0m][29-Jun-24 03:10:27] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][29-Jun-24 03:10:27] Epoch 5 = Batch 404033 = Sample 206864385 |
|
[[38;5;39m INFO[0m][29-Jun-24 03:15:23] Total gradient norm stats for 262 steps: 0.1858 <= 0.2063 + 0.01119z <= 0.2525 |
|
[[38;5;39m INFO[0m][29-Jun-24 03:15:23] Trained chunk 97 in 480.0s at 4472noun/s: lr=1.33e-03, loss=1.44e+00, top1=69.28%/68.908% |
|
[[38;5;39m INFO[0m][29-Jun-24 03:15:23] Chunk 98 = Batch 406625 = Sample 208191489 |
|
[[38;5;39m INFO[0m][29-Jun-24 03:23:22] Total gradient norm stats for 262 steps: 0.1875 <= 0.2075 + 0.01096z <= 0.2616 |
|
[[38;5;39m INFO[0m][29-Jun-24 03:23:22] Trained chunk 98 in 479.1s at 4480noun/s: lr=1.32e-03, loss=1.44e+00, top1=69.28%/68.919% |
|
[[38;5;39m INFO[0m][29-Jun-24 03:23:22] Chunk 99 = Batch 410817 = Sample 210337793 |
|
[[38;5;39m INFO[0m][29-Jun-24 03:31:21] Total gradient norm stats for 262 steps: 0.1871 <= 0.2056 + 0.01017z <= 0.2683 |
|
[[38;5;39m INFO[0m][29-Jun-24 03:31:21] Trained chunk 99 in 479.3s at 4478noun/s: lr=1.32e-03, loss=1.44e+00, top1=69.79%/68.930% |
|
[[38;5;39m INFO[0m][29-Jun-24 03:31:21] Chunk 100 = Batch 415009 = Sample 212484097 |
|
[[38;5;39m INFO[0m][29-Jun-24 03:39:20] Total gradient norm stats for 262 steps: 0.1867 <= 0.2045 + 0.009637z <= 0.2502 |
|
[[38;5;39m INFO[0m][29-Jun-24 03:39:20] Trained chunk 100 in 478.9s at 4482noun/s: lr=1.31e-03, loss=1.44e+00, top1=68.56%/68.938% |
|
[[38;5;39m INFO[0m][29-Jun-24 03:39:20] Chunk 101 = Batch 419201 = Sample 214630401 |
|
[[38;5;39m INFO[0m][29-Jun-24 03:47:19] Total gradient norm stats for 262 steps: 0.187 <= 0.207 + 0.009394z <= 0.2425 |
|
[[38;5;39m INFO[0m][29-Jun-24 03:47:19] Trained chunk 101 in 478.4s at 4486noun/s: lr=1.31e-03, loss=1.44e+00, top1=68.53%/68.945% |
|
[[38;5;39m INFO[0m][29-Jun-24 03:47:19] Chunk 102 = Batch 423393 = Sample 216776705 |
|
[[38;5;39m INFO[0m][29-Jun-24 03:55:17] Total gradient norm stats for 262 steps: 0.1875 <= 0.2085 + 0.01053z <= 0.2632 |
|
[[38;5;39m INFO[0m][29-Jun-24 03:55:17] Trained chunk 102 in 478.2s at 4488noun/s: lr=1.31e-03, loss=1.44e+00, top1=69.17%/68.956% |
|
[[38;5;39m INFO[0m][29-Jun-24 03:55:17] Chunk 103 = Batch 427585 = Sample 218923009 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:03:16] Total gradient norm stats for 262 steps: 0.1891 <= 0.209 + 0.02332z <= 0.5538 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:03:16] Trained chunk 103 in 479.5s at 4476noun/s: lr=1.30e-03, loss=1.44e+00, top1=68.71%/68.964% |
|
[[38;5;39m INFO[0m][29-Jun-24 04:03:16] Chunk 104 = Batch 431777 = Sample 221069313 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:11:15] Total gradient norm stats for 262 steps: 0.1863 <= 0.208 + 0.01017z <= 0.2434 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:11:15] Trained chunk 104 in 478.5s at 4486noun/s: lr=1.30e-03, loss=1.44e+00, top1=69.61%/68.975% |
|
[[38;5;39m INFO[0m][29-Jun-24 04:11:15] Chunk 105 = Batch 435969 = Sample 223215617 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:19:14] Total gradient norm stats for 262 steps: 0.1924 <= 0.2097 + 0.02757z <= 0.6295 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:19:14] Trained chunk 105 in 478.7s at 4484noun/s: lr=1.30e-03, loss=1.44e+00, top1=70.30%/68.984% |
|
[[38;5;39m INFO[0m][29-Jun-24 04:19:14] Chunk 106 = Batch 440161 = Sample 225361921 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:27:13] Total gradient norm stats for 262 steps: 0.1894 <= 0.2079 + 0.01167z <= 0.3327 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:27:13] Trained chunk 106 in 479.1s at 4480noun/s: lr=1.29e-03, loss=1.44e+00, top1=69.79%/68.991% |
|
[[38;5;39m INFO[0m][29-Jun-24 04:27:13] Chunk 107 = Batch 444353 = Sample 227508225 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:35:12] Total gradient norm stats for 262 steps: 0.1886 <= 0.2078 + 0.009306z <= 0.2382 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:35:12] Trained chunk 107 in 479.3s at 4478noun/s: lr=1.29e-03, loss=1.43e+00, top1=69.52%/69.002% |
|
[[38;5;39m INFO[0m][29-Jun-24 04:35:12] Chunk 108 = Batch 448545 = Sample 229654529 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:43:10] Total gradient norm stats for 262 steps: 0.1875 <= 0.2088 + 0.009296z <= 0.2555 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:43:10] Trained chunk 108 in 477.9s at 4491noun/s: lr=1.29e-03, loss=1.43e+00, top1=69.47%/69.012% |
|
[[38;5;39m INFO[0m][29-Jun-24 04:43:10] Chunk 109 = Batch 452737 = Sample 231800833 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:51:09] Total gradient norm stats for 262 steps: 0.1904 <= 0.2076 + 0.008772z <= 0.242 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:51:09] Trained chunk 109 in 478.8s at 4482noun/s: lr=1.28e-03, loss=1.43e+00, top1=68.95%/69.024% |
|
[[38;5;39m INFO[0m][29-Jun-24 04:51:09] Chunk 110 = Batch 456929 = Sample 233947137 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:59:08] Total gradient norm stats for 262 steps: 0.1892 <= 0.2084 + 0.009687z <= 0.2476 |
|
[[38;5;39m INFO[0m][29-Jun-24 04:59:08] Trained chunk 110 in 479.0s at 4481noun/s: lr=1.28e-03, loss=1.43e+00, top1=68.46%/69.033% |
|
[[38;5;39m INFO[0m][29-Jun-24 04:59:08] Chunk 111 = Batch 461121 = Sample 236093441 |
|
[[38;5;39m INFO[0m][29-Jun-24 05:07:07] Total gradient norm stats for 262 steps: 0.1906 <= 0.2068 + 0.007874z <= 0.2429 |
|
[[38;5;39m INFO[0m][29-Jun-24 05:07:07] Trained chunk 111 in 478.6s at 4485noun/s: lr=1.27e-03, loss=1.43e+00, top1=68.79%/69.042% |
|
[[38;5;39m INFO[0m][29-Jun-24 05:07:07] Chunk 112 = Batch 465313 = Sample 238239745 |
|
[[38;5;39m INFO[0m][29-Jun-24 05:15:05] Total gradient norm stats for 262 steps: 0.1874 <= 0.2101 + 0.02908z <= 0.6542 |
|
[[38;5;39m INFO[0m][29-Jun-24 05:15:05] Trained chunk 112 in 478.8s at 4483noun/s: lr=1.27e-03, loss=1.43e+00, top1=68.22%/69.048% |
|
[[38;5;39m INFO[0m][29-Jun-24 05:15:05] Chunk 113 = Batch 469505 = Sample 240386049 |
|
[[38;5;39m INFO[0m][29-Jun-24 05:23:04] Total gradient norm stats for 262 steps: 0.1908 <= 0.2118 + 0.04417z <= 0.9087 |
|
[[38;5;39m INFO[0m][29-Jun-24 05:23:04] Trained chunk 113 in 478.5s at 4485noun/s: lr=1.27e-03, loss=1.43e+00, top1=68.65%/69.054% |
|
[[38;5;39m INFO[0m][29-Jun-24 05:23:04] Chunk 114 = Batch 473697 = Sample 242532353 |
|
[[38;5;39m INFO[0m][29-Jun-24 05:31:01] Total gradient norm stats for 262 steps: 0.1913 <= 0.211 + 0.0105z <= 0.2701 |
|
[[38;5;39m INFO[0m][29-Jun-24 05:31:01] Trained chunk 114 in 477.4s at 4495noun/s: lr=1.26e-03, loss=1.43e+00, top1=68.07%/69.061% |
|
[[38;5;39m INFO[0m][29-Jun-24 05:31:01] Chunk 115 = Batch 477889 = Sample 244678657 |
|
[[38;5;39m INFO[0m][29-Jun-24 05:39:01] Total gradient norm stats for 262 steps: 0.1926 <= 0.2084 + 0.008341z <= 0.2441 |
|
[[38;5;39m INFO[0m][29-Jun-24 05:39:01] Trained chunk 115 in 479.3s at 4478noun/s: lr=1.26e-03, loss=1.43e+00, top1=69.78%/69.066% |
|
[[38;5;39m INFO[0m][29-Jun-24 05:39:01] Chunk 116 = Batch 482081 = Sample 246824961 |
|
[[38;5;39m INFO[0m][29-Jun-24 05:46:59] Total gradient norm stats for 262 steps: 0.1939 <= 0.2086 + 0.008585z <= 0.2408 |
|
[[38;5;39m INFO[0m][29-Jun-24 05:46:59] Trained chunk 116 in 478.1s at 4489noun/s: lr=1.25e-03, loss=1.43e+00, top1=68.06%/69.076% |
|
[[38;5;39m INFO[0m][29-Jun-24 05:46:59] Chunk 117 = Batch 486273 = Sample 248971265 |
|
[[38;5;39m INFO[0m][29-Jun-24 05:54:56] Total gradient norm stats for 262 steps: 0.1951 <= 0.2112 + 0.01084z <= 0.2608 |
|
[[38;5;39m INFO[0m][29-Jun-24 05:54:56] Trained chunk 117 in 477.2s at 4498noun/s: lr=1.25e-03, loss=1.43e+00, top1=69.34%/69.084% |
|
[[38;5;39m INFO[0m][29-Jun-24 05:54:56] Chunk 118 = Batch 490465 = Sample 251117569 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:02:54] Total gradient norm stats for 262 steps: 0.1909 <= 0.2101 + 0.009136z <= 0.2781 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:02:54] Trained chunk 118 in 477.8s at 4492noun/s: lr=1.25e-03, loss=1.43e+00, top1=69.25%/69.096% |
|
[[38;5;39m INFO[0m][29-Jun-24 06:02:54] Chunk 119 = Batch 494657 = Sample 253263873 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:10:52] Total gradient norm stats for 262 steps: 0.1909 <= 0.2083 + 0.008476z <= 0.2407 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:10:52] Trained chunk 119 in 478.6s at 4485noun/s: lr=1.24e-03, loss=1.43e+00, top1=68.80%/69.105% |
|
[[38;5;39m INFO[0m][29-Jun-24 06:10:52] Chunk 120 = Batch 498849 = Sample 255410177 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:18:51] Total gradient norm stats for 262 steps: 0.1936 <= 0.2107 + 0.009989z <= 0.2614 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:18:51] Trained chunk 120 in 479.2s at 4479noun/s: lr=1.24e-03, loss=1.43e+00, top1=68.69%/69.114% |
|
[[38;5;39m INFO[0m][29-Jun-24 06:18:51] Chunk 121 = Batch 503041 = Sample 257556481 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:22:41] Epoch 5 finished in 11534.1s |
|
[[38;5;39m INFO[0m][29-Jun-24 06:22:41] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][29-Jun-24 06:22:41] Epoch 6 = Batch 505041 = Sample 258580481 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:26:52] Total gradient norm stats for 262 steps: 0.192 <= 0.2126 + 0.02034z <= 0.4957 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:26:52] Trained chunk 121 in 480.1s at 4471noun/s: lr=1.23e-03, loss=1.43e+00, top1=70.10%/69.120% |
|
[[38;5;39m INFO[0m][29-Jun-24 06:26:52] Chunk 122 = Batch 507233 = Sample 259702785 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:34:50] Total gradient norm stats for 262 steps: 0.1941 <= 0.2107 + 0.009949z <= 0.2922 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:34:50] Trained chunk 122 in 478.6s at 4485noun/s: lr=1.23e-03, loss=1.43e+00, top1=69.61%/69.129% |
|
[[38;5;39m INFO[0m][29-Jun-24 06:34:50] Chunk 123 = Batch 511425 = Sample 261849089 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:42:48] Total gradient norm stats for 262 steps: 0.1947 <= 0.2133 + 0.0467z <= 0.9547 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:42:48] Trained chunk 123 in 478.0s at 4491noun/s: lr=1.22e-03, loss=1.43e+00, top1=69.00%/69.137% |
|
[[38;5;39m INFO[0m][29-Jun-24 06:42:48] Chunk 124 = Batch 515617 = Sample 263995393 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:50:46] Total gradient norm stats for 262 steps: 0.197 <= 0.2115 + 0.008111z <= 0.2423 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:50:46] Trained chunk 124 in 477.8s at 4492noun/s: lr=1.22e-03, loss=1.43e+00, top1=69.06%/69.145% |
|
[[38;5;39m INFO[0m][29-Jun-24 06:50:46] Chunk 125 = Batch 519809 = Sample 266141697 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:58:43] Total gradient norm stats for 262 steps: 0.1951 <= 0.2107 + 0.007827z <= 0.2456 |
|
[[38;5;39m INFO[0m][29-Jun-24 06:58:43] Trained chunk 125 in 477.4s at 4496noun/s: lr=1.22e-03, loss=1.43e+00, top1=69.75%/69.154% |
|
[[38;5;39m INFO[0m][29-Jun-24 06:58:43] Chunk 126 = Batch 524001 = Sample 268288001 |
|
[[38;5;39m INFO[0m][29-Jun-24 07:06:43] Total gradient norm stats for 262 steps: 0.1905 <= 0.2116 + 0.008712z <= 0.2473 |
|
[[38;5;39m INFO[0m][29-Jun-24 07:06:43] Trained chunk 126 in 479.6s at 4475noun/s: lr=1.21e-03, loss=1.42e+00, top1=70.18%/69.164% |
|
[[38;5;39m INFO[0m][29-Jun-24 07:06:43] Chunk 127 = Batch 528193 = Sample 270434305 |
|
[[38;5;39m INFO[0m][29-Jun-24 07:14:42] Total gradient norm stats for 262 steps: 0.1946 <= 0.2115 + 0.009154z <= 0.2718 |
|
[[38;5;39m INFO[0m][29-Jun-24 07:14:42] Trained chunk 127 in 478.9s at 4482noun/s: lr=1.21e-03, loss=1.42e+00, top1=67.43%/69.170% |
|
[[38;5;39m INFO[0m][29-Jun-24 07:14:42] Chunk 128 = Batch 532385 = Sample 272580609 |
|
[[38;5;39m INFO[0m][29-Jun-24 07:22:40] Total gradient norm stats for 262 steps: 0.1939 <= 0.2139 + 0.02925z <= 0.6632 |
|
[[38;5;39m INFO[0m][29-Jun-24 07:22:40] Trained chunk 128 in 478.0s at 4490noun/s: lr=1.20e-03, loss=1.42e+00, top1=69.85%/69.179% |
|
[[38;5;39m INFO[0m][29-Jun-24 07:22:40] Chunk 129 = Batch 536577 = Sample 274726913 |
|
[[38;5;39m INFO[0m][29-Jun-24 07:30:38] Total gradient norm stats for 262 steps: 0.1939 <= 0.2102 + 0.007557z <= 0.2473 |
|
[[38;5;39m INFO[0m][29-Jun-24 07:30:38] Trained chunk 129 in 478.6s at 4485noun/s: lr=1.20e-03, loss=1.42e+00, top1=68.66%/69.185% |
|
[[38;5;39m INFO[0m][29-Jun-24 07:30:38] Chunk 130 = Batch 540769 = Sample 276873217 |
|
[[38;5;39m INFO[0m][29-Jun-24 07:38:37] Total gradient norm stats for 262 steps: 0.1957 <= 0.2124 + 0.01001z <= 0.3083 |
|
[[38;5;39m INFO[0m][29-Jun-24 07:38:37] Trained chunk 130 in 478.5s at 4485noun/s: lr=1.19e-03, loss=1.42e+00, top1=69.72%/69.197% |
|
[[38;5;39m INFO[0m][29-Jun-24 07:38:37] Chunk 131 = Batch 544961 = Sample 279019521 |
|
[[38;5;39m INFO[0m][29-Jun-24 07:46:35] Total gradient norm stats for 262 steps: 0.1948 <= 0.2117 + 0.0084z <= 0.242 |
|
[[38;5;39m INFO[0m][29-Jun-24 07:46:35] Trained chunk 131 in 478.4s at 4486noun/s: lr=1.19e-03, loss=1.42e+00, top1=70.30%/69.201% |
|
[[38;5;39m INFO[0m][29-Jun-24 07:46:35] Chunk 132 = Batch 549153 = Sample 281165825 |
|
[[38;5;39m INFO[0m][29-Jun-24 07:54:33] Total gradient norm stats for 262 steps: 0.1946 <= 0.2127 + 0.01388z <= 0.3797 |
|
[[38;5;39m INFO[0m][29-Jun-24 07:54:33] Trained chunk 132 in 478.1s at 4489noun/s: lr=1.19e-03, loss=1.42e+00, top1=69.92%/69.208% |
|
[[38;5;39m INFO[0m][29-Jun-24 07:54:33] Chunk 133 = Batch 553345 = Sample 283312129 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:02:32] Total gradient norm stats for 262 steps: 0.1971 <= 0.2123 + 0.01227z <= 0.3669 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:02:32] Trained chunk 133 in 479.1s at 4480noun/s: lr=1.18e-03, loss=1.42e+00, top1=69.95%/69.219% |
|
[[38;5;39m INFO[0m][29-Jun-24 08:02:32] Chunk 134 = Batch 557537 = Sample 285458433 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:10:31] Total gradient norm stats for 262 steps: 0.1953 <= 0.2116 + 0.007585z <= 0.2508 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:10:31] Trained chunk 134 in 478.1s at 4489noun/s: lr=1.18e-03, loss=1.42e+00, top1=69.62%/69.226% |
|
[[38;5;39m INFO[0m][29-Jun-24 08:10:31] Chunk 135 = Batch 561729 = Sample 287604737 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:18:29] Total gradient norm stats for 262 steps: 0.1973 <= 0.2128 + 0.01134z <= 0.3451 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:18:29] Trained chunk 135 in 478.3s at 4487noun/s: lr=1.17e-03, loss=1.42e+00, top1=68.70%/69.234% |
|
[[38;5;39m INFO[0m][29-Jun-24 08:18:29] Chunk 136 = Batch 565921 = Sample 289751041 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:26:28] Total gradient norm stats for 262 steps: 0.1955 <= 0.2142 + 0.01116z <= 0.3386 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:26:28] Trained chunk 136 in 478.8s at 4483noun/s: lr=1.17e-03, loss=1.42e+00, top1=71.41%/69.243% |
|
[[38;5;39m INFO[0m][29-Jun-24 08:26:28] Chunk 137 = Batch 570113 = Sample 291897345 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:34:26] Total gradient norm stats for 262 steps: 0.1969 <= 0.2143 + 0.007899z <= 0.2366 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:34:26] Trained chunk 137 in 478.3s at 4488noun/s: lr=1.16e-03, loss=1.42e+00, top1=70.18%/69.249% |
|
[[38;5;39m INFO[0m][29-Jun-24 08:34:26] Chunk 138 = Batch 574305 = Sample 294043649 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:42:25] Total gradient norm stats for 262 steps: 0.1972 <= 0.2136 + 0.01002z <= 0.3213 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:42:25] Trained chunk 138 in 478.6s at 4484noun/s: lr=1.16e-03, loss=1.42e+00, top1=69.90%/69.258% |
|
[[38;5;39m INFO[0m][29-Jun-24 08:42:25] Chunk 139 = Batch 578497 = Sample 296189953 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:50:24] Total gradient norm stats for 262 steps: 0.1977 <= 0.2137 + 0.00744z <= 0.2445 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:50:24] Trained chunk 139 in 478.9s at 4481noun/s: lr=1.15e-03, loss=1.42e+00, top1=68.99%/69.265% |
|
[[38;5;39m INFO[0m][29-Jun-24 08:50:24] Chunk 140 = Batch 582689 = Sample 298336257 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:58:21] Total gradient norm stats for 262 steps: 0.1952 <= 0.2144 + 0.007332z <= 0.2418 |
|
[[38;5;39m INFO[0m][29-Jun-24 08:58:21] Trained chunk 140 in 477.7s at 4493noun/s: lr=1.15e-03, loss=1.42e+00, top1=70.33%/69.275% |
|
[[38;5;39m INFO[0m][29-Jun-24 08:58:21] Chunk 141 = Batch 586881 = Sample 300482561 |
|
[[38;5;39m INFO[0m][29-Jun-24 09:06:20] Total gradient norm stats for 262 steps: 0.1983 <= 0.2141 + 0.008102z <= 0.2552 |
|
[[38;5;39m INFO[0m][29-Jun-24 09:06:20] Trained chunk 141 in 478.4s at 4486noun/s: lr=1.15e-03, loss=1.42e+00, top1=69.26%/69.282% |
|
[[38;5;39m INFO[0m][29-Jun-24 09:06:20] Chunk 142 = Batch 591073 = Sample 302628865 |
|
[[38;5;39m INFO[0m][29-Jun-24 09:14:18] Total gradient norm stats for 262 steps: 0.1946 <= 0.2145 + 0.008773z <= 0.2513 |
|
[[38;5;39m INFO[0m][29-Jun-24 09:14:18] Trained chunk 142 in 478.4s at 4486noun/s: lr=1.14e-03, loss=1.42e+00, top1=69.59%/69.290% |
|
[[38;5;39m INFO[0m][29-Jun-24 09:14:18] Chunk 143 = Batch 595265 = Sample 304775169 |
|
[[38;5;39m INFO[0m][29-Jun-24 09:22:16] Total gradient norm stats for 262 steps: 0.1976 <= 0.2149 + 0.008098z <= 0.2424 |
|
[[38;5;39m INFO[0m][29-Jun-24 09:22:16] Trained chunk 143 in 477.5s at 4495noun/s: lr=1.14e-03, loss=1.42e+00, top1=69.31%/69.299% |
|
[[38;5;39m INFO[0m][29-Jun-24 09:22:16] Chunk 144 = Batch 599457 = Sample 306921473 |
|
[[38;5;39m INFO[0m][29-Jun-24 09:30:14] Total gradient norm stats for 262 steps: 0.1998 <= 0.2166 + 0.008533z <= 0.2489 |
|
[[38;5;39m INFO[0m][29-Jun-24 09:30:14] Trained chunk 144 in 478.9s at 4482noun/s: lr=1.13e-03, loss=1.42e+00, top1=70.12%/69.306% |
|
[[38;5;39m INFO[0m][29-Jun-24 09:30:15] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240628_142131/ovod_chunk0144_20240629_093014.train |
|
[[38;5;39m INFO[0m][29-Jun-24 09:30:15] Chunk 145 = Batch 603649 = Sample 309067777 |
|
[[38;5;39m INFO[0m][29-Jun-24 09:34:50] Epoch 6 finished in 11528.3s |
|
[[38;5;39m INFO[0m][29-Jun-24 09:34:50] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][29-Jun-24 09:34:50] Epoch 7 = Batch 606049 = Sample 310296577 |
|
[[38;5;39m INFO[0m][29-Jun-24 09:38:14] Total gradient norm stats for 262 steps: 0.2004 <= 0.2159 + 0.008103z <= 0.2464 |
|
[[38;5;39m INFO[0m][29-Jun-24 09:38:14] Trained chunk 145 in 479.5s at 4476noun/s: lr=1.13e-03, loss=1.42e+00, top1=69.20%/69.316% |
|
[[38;5;39m INFO[0m][29-Jun-24 09:38:14] Chunk 146 = Batch 607841 = Sample 311214081 |
|
[[38;5;39m INFO[0m][29-Jun-24 09:46:13] Total gradient norm stats for 262 steps: 0.1992 <= 0.2153 + 0.007724z <= 0.2463 |
|
[[38;5;39m INFO[0m][29-Jun-24 09:46:13] Trained chunk 146 in 478.4s at 4487noun/s: lr=1.12e-03, loss=1.41e+00, top1=69.29%/69.324% |
|
[[38;5;39m INFO[0m][29-Jun-24 09:46:13] Chunk 147 = Batch 612033 = Sample 313360385 |
|
[[38;5;39m INFO[0m][29-Jun-24 09:54:09] Total gradient norm stats for 262 steps: 0.1969 <= 0.2159 + 0.008203z <= 0.261 |
|
[[38;5;39m INFO[0m][29-Jun-24 09:54:09] Trained chunk 147 in 476.5s at 4504noun/s: lr=1.12e-03, loss=1.41e+00, top1=68.95%/69.333% |
|
[[38;5;39m INFO[0m][29-Jun-24 09:54:09] Chunk 148 = Batch 616225 = Sample 315506689 |
|
[[38;5;39m INFO[0m][29-Jun-24 10:02:07] Total gradient norm stats for 262 steps: 0.2003 <= 0.2158 + 0.008089z <= 0.2475 |
|
[[38;5;39m INFO[0m][29-Jun-24 10:02:07] Trained chunk 148 in 477.5s at 4495noun/s: lr=1.11e-03, loss=1.41e+00, top1=70.19%/69.341% |
|
[[38;5;39m INFO[0m][29-Jun-24 10:02:07] Chunk 149 = Batch 620417 = Sample 317652993 |
|
[[38;5;39m INFO[0m][29-Jun-24 10:10:04] Total gradient norm stats for 262 steps: 0.2015 <= 0.2204 + 0.05091z <= 1.005 (clipped to 1) |
|
[[38;5;39m INFO[0m][29-Jun-24 10:10:04] Trained chunk 149 in 477.4s at 4496noun/s: lr=1.11e-03, loss=1.41e+00, top1=69.48%/69.347% |
|
[[38;5;39m INFO[0m][29-Jun-24 10:10:04] Chunk 150 = Batch 624609 = Sample 319799297 |
|
[[38;5;39m INFO[0m][29-Jun-24 10:18:02] Total gradient norm stats for 262 steps: 0.1955 <= 0.2156 + 0.007388z <= 0.2543 |
|
[[38;5;39m INFO[0m][29-Jun-24 10:18:02] Trained chunk 150 in 478.2s at 4488noun/s: lr=1.10e-03, loss=1.41e+00, top1=69.63%/69.354% |
|
[[38;5;39m INFO[0m][29-Jun-24 10:18:02] Chunk 151 = Batch 628801 = Sample 321945601 |
|
[[38;5;39m INFO[0m][29-Jun-24 10:26:01] Total gradient norm stats for 262 steps: 0.2018 <= 0.2174 + 0.007771z <= 0.2493 |
|
[[38;5;39m INFO[0m][29-Jun-24 10:26:01] Trained chunk 151 in 478.7s at 4484noun/s: lr=1.10e-03, loss=1.41e+00, top1=68.85%/69.360% |
|
[[38;5;39m INFO[0m][29-Jun-24 10:26:01] Chunk 152 = Batch 632993 = Sample 324091905 |
|
[[38;5;39m INFO[0m][29-Jun-24 10:33:59] Total gradient norm stats for 262 steps: 0.2008 <= 0.2168 + 0.008012z <= 0.2498 |
|
[[38;5;39m INFO[0m][29-Jun-24 10:33:59] Trained chunk 152 in 478.4s at 4487noun/s: lr=1.09e-03, loss=1.41e+00, top1=69.47%/69.367% |
|
[[38;5;39m INFO[0m][29-Jun-24 10:33:59] Chunk 153 = Batch 637185 = Sample 326238209 |
|
[[38;5;39m INFO[0m][29-Jun-24 10:41:58] Total gradient norm stats for 262 steps: 0.1991 <= 0.2159 + 0.007463z <= 0.242 |
|
[[38;5;39m INFO[0m][29-Jun-24 10:41:58] Trained chunk 153 in 478.5s at 4485noun/s: lr=1.09e-03, loss=1.41e+00, top1=70.26%/69.374% |
|
[[38;5;39m INFO[0m][29-Jun-24 10:41:58] Chunk 154 = Batch 641377 = Sample 328384513 |
|
[[38;5;39m INFO[0m][29-Jun-24 10:49:55] Total gradient norm stats for 262 steps: 0.2022 <= 0.2188 + 0.01003z <= 0.3129 |
|
[[38;5;39m INFO[0m][29-Jun-24 10:49:55] Trained chunk 154 in 477.0s at 4500noun/s: lr=1.08e-03, loss=1.41e+00, top1=68.76%/69.384% |
|
[[38;5;39m INFO[0m][29-Jun-24 10:49:55] Chunk 155 = Batch 645569 = Sample 330530817 |
|
[[38;5;39m INFO[0m][29-Jun-24 10:57:53] Total gradient norm stats for 262 steps: 0.2056 <= 0.2179 + 0.007413z <= 0.2465 |
|
[[38;5;39m INFO[0m][29-Jun-24 10:57:53] Trained chunk 155 in 478.1s at 4489noun/s: lr=1.08e-03, loss=1.41e+00, top1=69.01%/69.390% |
|
[[38;5;39m INFO[0m][29-Jun-24 10:57:53] Chunk 156 = Batch 649761 = Sample 332677121 |
|
[[38;5;39m INFO[0m][29-Jun-24 11:05:51] Total gradient norm stats for 262 steps: 0.205 <= 0.2187 + 0.00731z <= 0.2587 |
|
[[38;5;39m INFO[0m][29-Jun-24 11:05:51] Trained chunk 156 in 478.1s at 4489noun/s: lr=1.07e-03, loss=1.41e+00, top1=70.42%/69.396% |
|
[[38;5;39m INFO[0m][29-Jun-24 11:05:51] Chunk 157 = Batch 653953 = Sample 334823425 |
|
[[38;5;39m INFO[0m][29-Jun-24 11:13:49] Total gradient norm stats for 262 steps: 0.2052 <= 0.2194 + 0.008225z <= 0.2614 |
|
[[38;5;39m INFO[0m][29-Jun-24 11:13:49] Trained chunk 157 in 477.8s at 4492noun/s: lr=1.07e-03, loss=1.41e+00, top1=70.10%/69.403% |
|
[[38;5;39m INFO[0m][29-Jun-24 11:13:49] Chunk 158 = Batch 658145 = Sample 336969729 |
|
[[38;5;39m INFO[0m][29-Jun-24 11:21:47] Total gradient norm stats for 262 steps: 0.2037 <= 0.2194 + 0.00849z <= 0.253 |
|
[[38;5;39m INFO[0m][29-Jun-24 11:21:47] Trained chunk 158 in 478.4s at 4486noun/s: lr=1.06e-03, loss=1.41e+00, top1=69.86%/69.411% |
|
[[38;5;39m INFO[0m][29-Jun-24 11:21:47] Chunk 159 = Batch 662337 = Sample 339116033 |
|
[[38;5;39m INFO[0m][29-Jun-24 11:29:46] Total gradient norm stats for 262 steps: 0.2036 <= 0.2188 + 0.006987z <= 0.2446 |
|
[[38;5;39m INFO[0m][29-Jun-24 11:29:46] Trained chunk 159 in 478.7s at 4483noun/s: lr=1.06e-03, loss=1.41e+00, top1=69.49%/69.419% |
|
[[38;5;39m INFO[0m][29-Jun-24 11:29:46] Chunk 160 = Batch 666529 = Sample 341262337 |
|
[[38;5;39m INFO[0m][29-Jun-24 11:37:44] Total gradient norm stats for 262 steps: 0.2046 <= 0.2197 + 0.007455z <= 0.2513 |
|
[[38;5;39m INFO[0m][29-Jun-24 11:37:44] Trained chunk 160 in 478.1s at 4489noun/s: lr=1.05e-03, loss=1.41e+00, top1=70.86%/69.430% |
|
[[38;5;39m INFO[0m][29-Jun-24 11:37:44] Chunk 161 = Batch 670721 = Sample 343408641 |
|
[[38;5;39m INFO[0m][29-Jun-24 11:45:43] Total gradient norm stats for 262 steps: 0.2042 <= 0.2194 + 0.007932z <= 0.2612 |
|
[[38;5;39m INFO[0m][29-Jun-24 11:45:43] Trained chunk 161 in 478.4s at 4487noun/s: lr=1.05e-03, loss=1.41e+00, top1=69.91%/69.437% |
|
[[38;5;39m INFO[0m][29-Jun-24 11:45:43] Chunk 162 = Batch 674913 = Sample 345554945 |
|
[[38;5;39m INFO[0m][29-Jun-24 11:53:41] Total gradient norm stats for 262 steps: 0.2051 <= 0.2207 + 0.007849z <= 0.2753 |
|
[[38;5;39m INFO[0m][29-Jun-24 11:53:41] Trained chunk 162 in 478.1s at 4489noun/s: lr=1.04e-03, loss=1.41e+00, top1=68.96%/69.445% |
|
[[38;5;39m INFO[0m][29-Jun-24 11:53:41] Chunk 163 = Batch 679105 = Sample 347701249 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:01:39] Total gradient norm stats for 262 steps: 0.2074 <= 0.2221 + 0.02478z <= 0.5984 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:01:39] Trained chunk 163 in 477.9s at 4491noun/s: lr=1.04e-03, loss=1.41e+00, top1=69.08%/69.450% |
|
[[38;5;39m INFO[0m][29-Jun-24 12:01:39] Chunk 164 = Batch 683297 = Sample 349847553 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:09:36] Total gradient norm stats for 262 steps: 0.2016 <= 0.2205 + 0.007474z <= 0.255 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:09:36] Trained chunk 164 in 477.8s at 4492noun/s: lr=1.03e-03, loss=1.41e+00, top1=70.45%/69.459% |
|
[[38;5;39m INFO[0m][29-Jun-24 12:09:36] Chunk 165 = Batch 687489 = Sample 351993857 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:17:34] Total gradient norm stats for 262 steps: 0.2058 <= 0.2217 + 0.009324z <= 0.3133 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:17:34] Trained chunk 165 in 477.8s at 4492noun/s: lr=1.03e-03, loss=1.41e+00, top1=68.10%/69.466% |
|
[[38;5;39m INFO[0m][29-Jun-24 12:17:34] Chunk 166 = Batch 691681 = Sample 354140161 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:25:32] Total gradient norm stats for 262 steps: 0.2029 <= 0.2214 + 0.007564z <= 0.2452 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:25:32] Trained chunk 166 in 478.1s at 4489noun/s: lr=1.02e-03, loss=1.41e+00, top1=70.15%/69.476% |
|
[[38;5;39m INFO[0m][29-Jun-24 12:25:32] Chunk 167 = Batch 695873 = Sample 356286465 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:33:30] Total gradient norm stats for 262 steps: 0.2061 <= 0.2226 + 0.008735z <= 0.2766 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:33:30] Trained chunk 167 in 477.7s at 4493noun/s: lr=1.02e-03, loss=1.41e+00, top1=69.38%/69.484% |
|
[[38;5;39m INFO[0m][29-Jun-24 12:33:30] Chunk 168 = Batch 700065 = Sample 358432769 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:41:29] Total gradient norm stats for 262 steps: 0.2069 <= 0.2222 + 0.007678z <= 0.2531 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:41:29] Trained chunk 168 in 478.6s at 4485noun/s: lr=1.01e-03, loss=1.40e+00, top1=69.62%/69.491% |
|
[[38;5;39m INFO[0m][29-Jun-24 12:41:29] Chunk 169 = Batch 704257 = Sample 360579073 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:46:50] Epoch 7 finished in 11519.9s |
|
[[38;5;39m INFO[0m][29-Jun-24 12:46:50] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][29-Jun-24 12:46:50] Epoch 8 = Batch 707057 = Sample 362012673 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:49:29] Total gradient norm stats for 262 steps: 0.2077 <= 0.2213 + 0.008234z <= 0.2533 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:49:29] Trained chunk 169 in 480.1s at 4470noun/s: lr=1.01e-03, loss=1.40e+00, top1=70.04%/69.499% |
|
[[38;5;39m INFO[0m][29-Jun-24 12:49:29] Chunk 170 = Batch 708449 = Sample 362725377 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:57:27] Total gradient norm stats for 262 steps: 0.2064 <= 0.2217 + 0.007699z <= 0.2581 |
|
[[38;5;39m INFO[0m][29-Jun-24 12:57:27] Trained chunk 170 in 478.1s at 4490noun/s: lr=1.00e-03, loss=1.40e+00, top1=69.90%/69.512% |
|
[[38;5;39m INFO[0m][29-Jun-24 12:57:27] Chunk 171 = Batch 712641 = Sample 364871681 |
|
[[38;5;39m INFO[0m][29-Jun-24 13:05:25] Total gradient norm stats for 262 steps: 0.2048 <= 0.2221 + 0.006935z <= 0.2494 |
|
[[38;5;39m INFO[0m][29-Jun-24 13:05:25] Trained chunk 171 in 478.5s at 4485noun/s: lr=9.98e-04, loss=1.40e+00, top1=70.46%/69.520% |
|
[[38;5;39m INFO[0m][29-Jun-24 13:05:25] Chunk 172 = Batch 716833 = Sample 367017985 |
|
[[38;5;39m INFO[0m][29-Jun-24 13:13:24] Total gradient norm stats for 262 steps: 0.2084 <= 0.224 + 0.02061z <= 0.5352 |
|
[[38;5;39m INFO[0m][29-Jun-24 13:13:24] Trained chunk 172 in 478.2s at 4488noun/s: lr=9.93e-04, loss=1.40e+00, top1=71.25%/69.527% |
|
[[38;5;39m INFO[0m][29-Jun-24 13:13:24] Chunk 173 = Batch 721025 = Sample 369164289 |
|
[[38;5;39m INFO[0m][29-Jun-24 13:21:21] Total gradient norm stats for 262 steps: 0.208 <= 0.2226 + 0.007413z <= 0.2629 |
|
[[38;5;39m INFO[0m][29-Jun-24 13:21:21] Trained chunk 173 in 477.5s at 4495noun/s: lr=9.88e-04, loss=1.40e+00, top1=70.27%/69.534% |
|
[[38;5;39m INFO[0m][29-Jun-24 13:21:21] Chunk 174 = Batch 725217 = Sample 371310593 |
|
[[38;5;39m INFO[0m][29-Jun-24 13:29:19] Total gradient norm stats for 262 steps: 0.2081 <= 0.2236 + 0.007342z <= 0.2472 |
|
[[38;5;39m INFO[0m][29-Jun-24 13:29:19] Trained chunk 174 in 478.0s at 4490noun/s: lr=9.83e-04, loss=1.40e+00, top1=69.12%/69.541% |
|
[[38;5;39m INFO[0m][29-Jun-24 13:29:19] Chunk 175 = Batch 729409 = Sample 373456897 |
|
[[38;5;39m INFO[0m][29-Jun-24 13:37:17] Total gradient norm stats for 262 steps: 0.2081 <= 0.224 + 0.007344z <= 0.2563 |
|
[[38;5;39m INFO[0m][29-Jun-24 13:37:17] Trained chunk 175 in 478.3s at 4488noun/s: lr=9.78e-04, loss=1.40e+00, top1=69.77%/69.550% |
|
[[38;5;39m INFO[0m][29-Jun-24 13:37:17] Chunk 176 = Batch 733601 = Sample 375603201 |
|
[[38;5;39m INFO[0m][29-Jun-24 13:45:16] Total gradient norm stats for 262 steps: 0.2104 <= 0.2259 + 0.008266z <= 0.2625 |
|
[[38;5;39m INFO[0m][29-Jun-24 13:45:16] Trained chunk 176 in 478.2s at 4488noun/s: lr=9.72e-04, loss=1.40e+00, top1=70.23%/69.557% |
|
[[38;5;39m INFO[0m][29-Jun-24 13:45:16] Chunk 177 = Batch 737793 = Sample 377749505 |
|
[[38;5;39m INFO[0m][29-Jun-24 13:53:14] Total gradient norm stats for 262 steps: 0.2129 <= 0.2248 + 0.006669z <= 0.2518 |
|
[[38;5;39m INFO[0m][29-Jun-24 13:53:14] Trained chunk 177 in 478.0s at 4490noun/s: lr=9.67e-04, loss=1.40e+00, top1=70.08%/69.563% |
|
[[38;5;39m INFO[0m][29-Jun-24 13:53:14] Chunk 178 = Batch 741985 = Sample 379895809 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:01:12] Total gradient norm stats for 262 steps: 0.2099 <= 0.2247 + 0.00709z <= 0.2524 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:01:12] Trained chunk 178 in 478.5s at 4485noun/s: lr=9.62e-04, loss=1.40e+00, top1=70.61%/69.570% |
|
[[38;5;39m INFO[0m][29-Jun-24 14:01:12] Chunk 179 = Batch 746177 = Sample 382042113 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:09:10] Total gradient norm stats for 262 steps: 0.2109 <= 0.2247 + 0.007605z <= 0.2834 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:09:10] Trained chunk 179 in 477.9s at 4491noun/s: lr=9.57e-04, loss=1.40e+00, top1=70.48%/69.577% |
|
[[38;5;39m INFO[0m][29-Jun-24 14:09:10] Chunk 180 = Batch 750369 = Sample 384188417 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:17:08] Total gradient norm stats for 262 steps: 0.2098 <= 0.227 + 0.01351z <= 0.4105 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:17:08] Trained chunk 180 in 478.5s at 4485noun/s: lr=9.52e-04, loss=1.40e+00, top1=68.41%/69.586% |
|
[[38;5;39m INFO[0m][29-Jun-24 14:17:08] Chunk 181 = Batch 754561 = Sample 386334721 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:25:07] Total gradient norm stats for 262 steps: 0.2118 <= 0.228 + 0.008103z <= 0.261 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:25:07] Trained chunk 181 in 478.0s at 4490noun/s: lr=9.46e-04, loss=1.40e+00, top1=70.10%/69.591% |
|
[[38;5;39m INFO[0m][29-Jun-24 14:25:07] Chunk 182 = Batch 758753 = Sample 388481025 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:33:05] Total gradient norm stats for 262 steps: 0.2123 <= 0.2268 + 0.008807z <= 0.3092 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:33:05] Trained chunk 182 in 478.1s at 4490noun/s: lr=9.41e-04, loss=1.40e+00, top1=68.46%/69.596% |
|
[[38;5;39m INFO[0m][29-Jun-24 14:33:05] Chunk 183 = Batch 762945 = Sample 390627329 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:41:03] Total gradient norm stats for 262 steps: 0.2106 <= 0.2274 + 0.00951z <= 0.3044 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:41:03] Trained chunk 183 in 478.2s at 4489noun/s: lr=9.36e-04, loss=1.40e+00, top1=69.17%/69.605% |
|
[[38;5;39m INFO[0m][29-Jun-24 14:41:03] Chunk 184 = Batch 767137 = Sample 392773633 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:49:01] Total gradient norm stats for 262 steps: 0.2084 <= 0.2281 + 0.007908z <= 0.255 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:49:01] Trained chunk 184 in 478.1s at 4489noun/s: lr=9.31e-04, loss=1.40e+00, top1=70.04%/69.613% |
|
[[38;5;39m INFO[0m][29-Jun-24 14:49:01] Chunk 185 = Batch 771329 = Sample 394919937 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:56:59] Total gradient norm stats for 262 steps: 0.2117 <= 0.2283 + 0.007603z <= 0.2591 |
|
[[38;5;39m INFO[0m][29-Jun-24 14:56:59] Trained chunk 185 in 478.0s at 4490noun/s: lr=9.25e-04, loss=1.40e+00, top1=69.78%/69.621% |
|
[[38;5;39m INFO[0m][29-Jun-24 14:56:59] Chunk 186 = Batch 775521 = Sample 397066241 |
|
[[38;5;39m INFO[0m][29-Jun-24 15:04:57] Total gradient norm stats for 262 steps: 0.2135 <= 0.2283 + 0.007492z <= 0.2691 |
|
[[38;5;39m INFO[0m][29-Jun-24 15:04:57] Trained chunk 186 in 477.6s at 4494noun/s: lr=9.20e-04, loss=1.40e+00, top1=70.17%/69.632% |
|
[[38;5;39m INFO[0m][29-Jun-24 15:04:57] Chunk 187 = Batch 779713 = Sample 399212545 |
|
[[38;5;39m INFO[0m][29-Jun-24 15:12:55] Total gradient norm stats for 262 steps: 0.214 <= 0.227 + 0.006531z <= 0.2508 |
|
[[38;5;39m INFO[0m][29-Jun-24 15:12:55] Trained chunk 187 in 478.0s at 4490noun/s: lr=9.15e-04, loss=1.40e+00, top1=69.24%/69.639% |
|
[[38;5;39m INFO[0m][29-Jun-24 15:12:55] Chunk 188 = Batch 783905 = Sample 401358849 |
|
[[38;5;39m INFO[0m][29-Jun-24 15:20:52] Total gradient norm stats for 262 steps: 0.2136 <= 0.227 + 0.006833z <= 0.2692 |
|
[[38;5;39m INFO[0m][29-Jun-24 15:20:52] Trained chunk 188 in 477.7s at 4493noun/s: lr=9.09e-04, loss=1.39e+00, top1=69.29%/69.649% |
|
[[38;5;39m INFO[0m][29-Jun-24 15:20:52] Chunk 189 = Batch 788097 = Sample 403505153 |
|
[[38;5;39m INFO[0m][29-Jun-24 15:28:51] Total gradient norm stats for 262 steps: 0.2145 <= 0.2286 + 0.007674z <= 0.2817 |
|
[[38;5;39m INFO[0m][29-Jun-24 15:28:51] Trained chunk 189 in 478.6s at 4485noun/s: lr=9.04e-04, loss=1.39e+00, top1=71.65%/69.654% |
|
[[38;5;39m INFO[0m][29-Jun-24 15:28:51] Chunk 190 = Batch 792289 = Sample 405651457 |
|
[[38;5;39m INFO[0m][29-Jun-24 15:36:50] Total gradient norm stats for 262 steps: 0.2155 <= 0.229 + 0.01118z <= 0.3746 |
|
[[38;5;39m INFO[0m][29-Jun-24 15:36:50] Trained chunk 190 in 478.7s at 4484noun/s: lr=8.99e-04, loss=1.39e+00, top1=69.49%/69.662% |
|
[[38;5;39m INFO[0m][29-Jun-24 15:36:50] Chunk 191 = Batch 796481 = Sample 407797761 |
|
[[38;5;39m INFO[0m][29-Jun-24 15:44:48] Total gradient norm stats for 262 steps: 0.2164 <= 0.2285 + 0.008088z <= 0.3024 |
|
[[38;5;39m INFO[0m][29-Jun-24 15:44:48] Trained chunk 191 in 478.0s at 4490noun/s: lr=8.93e-04, loss=1.39e+00, top1=69.91%/69.669% |
|
[[38;5;39m INFO[0m][29-Jun-24 15:44:48] Chunk 192 = Batch 800673 = Sample 409944065 |
|
[[38;5;39m INFO[0m][29-Jun-24 15:52:46] Total gradient norm stats for 262 steps: 0.2146 <= 0.2301 + 0.009673z <= 0.3322 |
|
[[38;5;39m INFO[0m][29-Jun-24 15:52:46] Trained chunk 192 in 478.6s at 4485noun/s: lr=8.88e-04, loss=1.39e+00, top1=70.23%/69.678% |
|
[[38;5;39m INFO[0m][29-Jun-24 15:52:47] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240628_142131/ovod_chunk0192_20240629_155246.train |
|
[[38;5;39m INFO[0m][29-Jun-24 15:52:47] Chunk 193 = Batch 804865 = Sample 412090369 |
|
[[38;5;39m INFO[0m][29-Jun-24 15:58:53] Epoch 8 finished in 11522.9s |
|
[[38;5;39m INFO[0m][29-Jun-24 15:58:53] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][29-Jun-24 15:58:53] Epoch 9 = Batch 808065 = Sample 413728769 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:00:46] Total gradient norm stats for 262 steps: 0.2176 <= 0.2298 + 0.006413z <= 0.2519 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:00:46] Trained chunk 193 in 479.9s at 4473noun/s: lr=8.83e-04, loss=1.39e+00, top1=68.85%/69.686% |
|
[[38;5;39m INFO[0m][29-Jun-24 16:00:46] Chunk 194 = Batch 809057 = Sample 414236673 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:08:45] Total gradient norm stats for 262 steps: 0.2183 <= 0.2304 + 0.006363z <= 0.2643 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:08:45] Trained chunk 194 in 479.0s at 4481noun/s: lr=8.77e-04, loss=1.39e+00, top1=69.81%/69.697% |
|
[[38;5;39m INFO[0m][29-Jun-24 16:08:45] Chunk 195 = Batch 813249 = Sample 416382977 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:16:45] Total gradient norm stats for 262 steps: 0.2195 <= 0.2307 + 0.007058z <= 0.2826 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:16:45] Trained chunk 195 in 479.6s at 4475noun/s: lr=8.72e-04, loss=1.39e+00, top1=69.26%/69.706% |
|
[[38;5;39m INFO[0m][29-Jun-24 16:16:45] Chunk 196 = Batch 817441 = Sample 418529281 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:24:42] Total gradient norm stats for 262 steps: 0.216 <= 0.2317 + 0.01281z <= 0.4033 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:24:42] Trained chunk 196 in 477.3s at 4497noun/s: lr=8.67e-04, loss=1.39e+00, top1=69.73%/69.709% |
|
[[38;5;39m INFO[0m][29-Jun-24 16:24:42] Chunk 197 = Batch 821633 = Sample 420675585 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:32:39] Total gradient norm stats for 262 steps: 0.2168 <= 0.2318 + 0.01185z <= 0.3813 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:32:39] Trained chunk 197 in 476.7s at 4502noun/s: lr=8.61e-04, loss=1.39e+00, top1=69.94%/69.717% |
|
[[38;5;39m INFO[0m][29-Jun-24 16:32:39] Chunk 198 = Batch 825825 = Sample 422821889 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:40:37] Total gradient norm stats for 262 steps: 0.2186 <= 0.2323 + 0.007534z <= 0.2607 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:40:37] Trained chunk 198 in 478.1s at 4489noun/s: lr=8.56e-04, loss=1.39e+00, top1=69.98%/69.726% |
|
[[38;5;39m INFO[0m][29-Jun-24 16:40:37] Chunk 199 = Batch 830017 = Sample 424968193 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:48:36] Total gradient norm stats for 262 steps: 0.2195 <= 0.2336 + 0.007728z <= 0.2605 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:48:36] Trained chunk 199 in 479.1s at 4479noun/s: lr=8.50e-04, loss=1.39e+00, top1=69.22%/69.734% |
|
[[38;5;39m INFO[0m][29-Jun-24 16:48:36] Chunk 200 = Batch 834209 = Sample 427114497 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:56:33] Total gradient norm stats for 262 steps: 0.2161 <= 0.235 + 0.02918z <= 0.6896 |
|
[[38;5;39m INFO[0m][29-Jun-24 16:56:33] Trained chunk 200 in 477.3s at 4497noun/s: lr=8.45e-04, loss=1.39e+00, top1=69.76%/69.747% |
|
[[38;5;39m INFO[0m][29-Jun-24 16:56:33] Chunk 201 = Batch 838401 = Sample 429260801 |
|
[[38;5;39m INFO[0m][29-Jun-24 17:04:32] Total gradient norm stats for 262 steps: 0.2192 <= 0.2339 + 0.008834z <= 0.2796 |
|
[[38;5;39m INFO[0m][29-Jun-24 17:04:32] Trained chunk 201 in 478.2s at 4488noun/s: lr=8.40e-04, loss=1.39e+00, top1=69.73%/69.754% |
|
[[38;5;39m INFO[0m][29-Jun-24 17:04:32] Chunk 202 = Batch 842593 = Sample 431407105 |
|
[[38;5;39m INFO[0m][29-Jun-24 17:12:30] Total gradient norm stats for 262 steps: 0.22 <= 0.233 + 0.00677z <= 0.2667 |
|
[[38;5;39m INFO[0m][29-Jun-24 17:12:30] Trained chunk 202 in 478.7s at 4484noun/s: lr=8.34e-04, loss=1.39e+00, top1=68.93%/69.761% |
|
[[38;5;39m INFO[0m][29-Jun-24 17:12:30] Chunk 203 = Batch 846785 = Sample 433553409 |
|
[[38;5;39m INFO[0m][29-Jun-24 17:20:29] Total gradient norm stats for 262 steps: 0.2205 <= 0.2326 + 0.005549z <= 0.256 |
|
[[38;5;39m INFO[0m][29-Jun-24 17:20:29] Trained chunk 203 in 478.3s at 4488noun/s: lr=8.29e-04, loss=1.39e+00, top1=71.01%/69.769% |
|
[[38;5;39m INFO[0m][29-Jun-24 17:20:29] Chunk 204 = Batch 850977 = Sample 435699713 |
|
[[38;5;39m INFO[0m][29-Jun-24 17:28:27] Total gradient norm stats for 262 steps: 0.2195 <= 0.2355 + 0.03809z <= 0.841 |
|
[[38;5;39m INFO[0m][29-Jun-24 17:28:27] Trained chunk 204 in 478.5s at 4486noun/s: lr=8.23e-04, loss=1.39e+00, top1=70.09%/69.775% |
|
[[38;5;39m INFO[0m][29-Jun-24 17:28:27] Chunk 205 = Batch 855169 = Sample 437846017 |
|
[[38;5;39m INFO[0m][29-Jun-24 17:36:25] Total gradient norm stats for 262 steps: 0.2215 <= 0.2366 + 0.01305z <= 0.4186 |
|
[[38;5;39m INFO[0m][29-Jun-24 17:36:25] Trained chunk 205 in 478.1s at 4489noun/s: lr=8.18e-04, loss=1.39e+00, top1=69.43%/69.783% |
|
[[38;5;39m INFO[0m][29-Jun-24 17:36:25] Chunk 206 = Batch 859361 = Sample 439992321 |
|
[[38;5;39m INFO[0m][29-Jun-24 17:44:23] Total gradient norm stats for 262 steps: 0.2214 <= 0.236 + 0.007383z <= 0.2814 |
|
[[38;5;39m INFO[0m][29-Jun-24 17:44:23] Trained chunk 206 in 477.5s at 4495noun/s: lr=8.13e-04, loss=1.39e+00, top1=70.24%/69.794% |
|
[[38;5;39m INFO[0m][29-Jun-24 17:44:23] Chunk 207 = Batch 863553 = Sample 442138625 |
|
[[38;5;39m INFO[0m][29-Jun-24 17:52:21] Total gradient norm stats for 262 steps: 0.2221 <= 0.2364 + 0.006912z <= 0.2634 |
|
[[38;5;39m INFO[0m][29-Jun-24 17:52:21] Trained chunk 207 in 478.4s at 4487noun/s: lr=8.07e-04, loss=1.39e+00, top1=69.80%/69.800% |
|
[[38;5;39m INFO[0m][29-Jun-24 17:52:21] Chunk 208 = Batch 867745 = Sample 444284929 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:00:19] Total gradient norm stats for 262 steps: 0.2228 <= 0.2364 + 0.006673z <= 0.2595 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:00:19] Trained chunk 208 in 478.2s at 4488noun/s: lr=8.02e-04, loss=1.39e+00, top1=69.81%/69.810% |
|
[[38;5;39m INFO[0m][29-Jun-24 18:00:19] Chunk 209 = Batch 871937 = Sample 446431233 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:08:18] Total gradient norm stats for 262 steps: 0.2244 <= 0.2374 + 0.01944z <= 0.5272 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:08:18] Trained chunk 209 in 478.7s at 4484noun/s: lr=7.96e-04, loss=1.38e+00, top1=69.55%/69.816% |
|
[[38;5;39m INFO[0m][29-Jun-24 18:08:18] Chunk 210 = Batch 876129 = Sample 448577537 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:16:17] Total gradient norm stats for 262 steps: 0.2219 <= 0.2363 + 0.008857z <= 0.3325 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:16:17] Trained chunk 210 in 478.5s at 4486noun/s: lr=7.91e-04, loss=1.38e+00, top1=70.36%/69.823% |
|
[[38;5;39m INFO[0m][29-Jun-24 18:16:17] Chunk 211 = Batch 880321 = Sample 450723841 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:24:14] Total gradient norm stats for 262 steps: 0.2245 <= 0.2389 + 0.008201z <= 0.2665 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:24:14] Trained chunk 211 in 477.4s at 4496noun/s: lr=7.85e-04, loss=1.38e+00, top1=70.25%/69.832% |
|
[[38;5;39m INFO[0m][29-Jun-24 18:24:14] Chunk 212 = Batch 884513 = Sample 452870145 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:32:12] Total gradient norm stats for 262 steps: 0.2247 <= 0.2391 + 0.01557z <= 0.4596 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:32:12] Trained chunk 212 in 477.8s at 4492noun/s: lr=7.80e-04, loss=1.38e+00, top1=69.61%/69.837% |
|
[[38;5;39m INFO[0m][29-Jun-24 18:32:12] Chunk 213 = Batch 888705 = Sample 455016449 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:40:10] Total gradient norm stats for 262 steps: 0.2256 <= 0.2403 + 0.0266z <= 0.5625 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:40:10] Trained chunk 213 in 478.3s at 4488noun/s: lr=7.74e-04, loss=1.38e+00, top1=70.23%/69.845% |
|
[[38;5;39m INFO[0m][29-Jun-24 18:40:10] Chunk 214 = Batch 892897 = Sample 457162753 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:48:08] Total gradient norm stats for 262 steps: 0.226 <= 0.2392 + 0.007734z <= 0.3119 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:48:08] Trained chunk 214 in 477.4s at 4496noun/s: lr=7.69e-04, loss=1.38e+00, top1=70.11%/69.856% |
|
[[38;5;39m INFO[0m][29-Jun-24 18:48:08] Chunk 215 = Batch 897089 = Sample 459309057 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:56:05] Total gradient norm stats for 262 steps: 0.2268 <= 0.2393 + 0.007044z <= 0.2668 |
|
[[38;5;39m INFO[0m][29-Jun-24 18:56:05] Trained chunk 215 in 477.4s at 4496noun/s: lr=7.64e-04, loss=1.38e+00, top1=69.25%/69.864% |
|
[[38;5;39m INFO[0m][29-Jun-24 18:56:05] Chunk 216 = Batch 901281 = Sample 461455361 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:04:03] Total gradient norm stats for 262 steps: 0.2281 <= 0.2411 + 0.009625z <= 0.3439 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:04:03] Trained chunk 216 in 477.7s at 4493noun/s: lr=7.58e-04, loss=1.38e+00, top1=69.04%/69.871% |
|
[[38;5;39m INFO[0m][29-Jun-24 19:04:03] Chunk 217 = Batch 905473 = Sample 463601665 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:10:55] Epoch 9 finished in 11522.5s |
|
[[38;5;39m INFO[0m][29-Jun-24 19:10:55] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][29-Jun-24 19:10:55] Epoch 10 = Batch 909073 = Sample 465444865 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:12:03] Total gradient norm stats for 262 steps: 0.2291 <= 0.2404 + 0.005867z <= 0.2584 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:12:03] Trained chunk 217 in 480.0s at 4471noun/s: lr=7.53e-04, loss=1.38e+00, top1=70.63%/69.877% |
|
[[38;5;39m INFO[0m][29-Jun-24 19:12:03] Chunk 218 = Batch 909665 = Sample 465747969 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:20:01] Total gradient norm stats for 262 steps: 0.2259 <= 0.241 + 0.008368z <= 0.3325 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:20:01] Trained chunk 218 in 478.6s at 4485noun/s: lr=7.47e-04, loss=1.38e+00, top1=69.58%/69.888% |
|
[[38;5;39m INFO[0m][29-Jun-24 19:20:01] Chunk 219 = Batch 913857 = Sample 467894273 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:27:59] Total gradient norm stats for 262 steps: 0.2272 <= 0.242 + 0.0228z <= 0.5923 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:27:59] Trained chunk 219 in 478.0s at 4490noun/s: lr=7.42e-04, loss=1.38e+00, top1=69.82%/69.897% |
|
[[38;5;39m INFO[0m][29-Jun-24 19:27:59] Chunk 220 = Batch 918049 = Sample 470040577 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:35:57] Total gradient norm stats for 262 steps: 0.2274 <= 0.2431 + 0.00687z <= 0.267 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:35:57] Trained chunk 220 in 477.8s at 4492noun/s: lr=7.36e-04, loss=1.38e+00, top1=69.95%/69.906% |
|
[[38;5;39m INFO[0m][29-Jun-24 19:35:57] Chunk 221 = Batch 922241 = Sample 472186881 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:43:54] Total gradient norm stats for 262 steps: 0.2277 <= 0.2425 + 0.007408z <= 0.3037 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:43:54] Trained chunk 221 in 476.7s at 4503noun/s: lr=7.31e-04, loss=1.38e+00, top1=69.38%/69.915% |
|
[[38;5;39m INFO[0m][29-Jun-24 19:43:54] Chunk 222 = Batch 926433 = Sample 474333185 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:51:51] Total gradient norm stats for 262 steps: 0.2287 <= 0.2428 + 0.007213z <= 0.2711 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:51:51] Trained chunk 222 in 477.4s at 4496noun/s: lr=7.26e-04, loss=1.38e+00, top1=71.29%/69.923% |
|
[[38;5;39m INFO[0m][29-Jun-24 19:51:51] Chunk 223 = Batch 930625 = Sample 476479489 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:59:49] Total gradient norm stats for 262 steps: 0.229 <= 0.2446 + 0.008886z <= 0.3354 |
|
[[38;5;39m INFO[0m][29-Jun-24 19:59:49] Trained chunk 223 in 477.9s at 4491noun/s: lr=7.20e-04, loss=1.38e+00, top1=68.51%/69.930% |
|
[[38;5;39m INFO[0m][29-Jun-24 19:59:49] Chunk 224 = Batch 934817 = Sample 478625793 |
|
[[38;5;39m INFO[0m][29-Jun-24 20:07:47] Total gradient norm stats for 262 steps: 0.2275 <= 0.2445 + 0.00733z <= 0.2736 |
|
[[38;5;39m INFO[0m][29-Jun-24 20:07:47] Trained chunk 224 in 478.0s at 4490noun/s: lr=7.15e-04, loss=1.38e+00, top1=69.04%/69.937% |
|
[[38;5;39m INFO[0m][29-Jun-24 20:07:47] Chunk 225 = Batch 939009 = Sample 480772097 |
|
[[38;5;39m INFO[0m][29-Jun-24 20:15:45] Total gradient norm stats for 262 steps: 0.229 <= 0.2446 + 0.01025z <= 0.3593 |
|
[[38;5;39m INFO[0m][29-Jun-24 20:15:45] Trained chunk 225 in 478.2s at 4488noun/s: lr=7.09e-04, loss=1.38e+00, top1=69.32%/69.947% |
|
[[38;5;39m INFO[0m][29-Jun-24 20:15:45] Chunk 226 = Batch 943201 = Sample 482918401 |
|
[[38;5;39m INFO[0m][29-Jun-24 20:23:43] Total gradient norm stats for 262 steps: 0.2314 <= 0.2456 + 0.006863z <= 0.2729 |
|
[[38;5;39m INFO[0m][29-Jun-24 20:23:43] Trained chunk 226 in 477.5s at 4494noun/s: lr=7.04e-04, loss=1.38e+00, top1=69.67%/69.956% |
|
[[38;5;39m INFO[0m][29-Jun-24 20:23:43] Chunk 227 = Batch 947393 = Sample 485064705 |
|
[[38;5;39m INFO[0m][29-Jun-24 20:31:42] Total gradient norm stats for 262 steps: 0.2332 <= 0.2456 + 0.006541z <= 0.2692 |
|
[[38;5;39m INFO[0m][29-Jun-24 20:31:42] Trained chunk 227 in 478.8s at 4482noun/s: lr=6.98e-04, loss=1.38e+00, top1=69.48%/69.963% |
|
[[38;5;39m INFO[0m][29-Jun-24 20:31:42] Chunk 228 = Batch 951585 = Sample 487211009 |
|
[[38;5;39m INFO[0m][29-Jun-24 20:39:39] Total gradient norm stats for 262 steps: 0.2318 <= 0.2478 + 0.008251z <= 0.3027 |
|
[[38;5;39m INFO[0m][29-Jun-24 20:39:39] Trained chunk 228 in 477.7s at 4493noun/s: lr=6.93e-04, loss=1.38e+00, top1=68.59%/69.969% |
|
[[38;5;39m INFO[0m][29-Jun-24 20:39:39] Chunk 229 = Batch 955777 = Sample 489357313 |
|
[[38;5;39m INFO[0m][29-Jun-24 20:47:37] Total gradient norm stats for 262 steps: 0.2334 <= 0.2483 + 0.007832z <= 0.2986 |
|
[[38;5;39m INFO[0m][29-Jun-24 20:47:37] Trained chunk 229 in 477.8s at 4492noun/s: lr=6.87e-04, loss=1.37e+00, top1=69.94%/69.980% |
|
[[38;5;39m INFO[0m][29-Jun-24 20:47:37] Chunk 230 = Batch 959969 = Sample 491503617 |
|
[[38;5;39m INFO[0m][29-Jun-24 20:55:35] Total gradient norm stats for 262 steps: 0.2342 <= 0.252 + 0.04491z <= 0.9601 |
|
[[38;5;39m INFO[0m][29-Jun-24 20:55:35] Trained chunk 230 in 477.9s at 4491noun/s: lr=6.82e-04, loss=1.37e+00, top1=69.86%/69.987% |
|
[[38;5;39m INFO[0m][29-Jun-24 20:55:35] Chunk 231 = Batch 964161 = Sample 493649921 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:03:33] Total gradient norm stats for 262 steps: 0.2358 <= 0.2501 + 0.01628z <= 0.48 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:03:33] Trained chunk 231 in 477.7s at 4493noun/s: lr=6.77e-04, loss=1.37e+00, top1=70.14%/69.997% |
|
[[38;5;39m INFO[0m][29-Jun-24 21:03:33] Chunk 232 = Batch 968353 = Sample 495796225 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:11:31] Total gradient norm stats for 262 steps: 0.2333 <= 0.2497 + 0.008185z <= 0.2785 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:11:31] Trained chunk 232 in 478.7s at 4484noun/s: lr=6.71e-04, loss=1.37e+00, top1=70.41%/70.006% |
|
[[38;5;39m INFO[0m][29-Jun-24 21:11:31] Chunk 233 = Batch 972545 = Sample 497942529 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:19:29] Total gradient norm stats for 262 steps: 0.2352 <= 0.2513 + 0.009022z <= 0.3115 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:19:29] Trained chunk 233 in 478.0s at 4490noun/s: lr=6.66e-04, loss=1.37e+00, top1=68.59%/70.016% |
|
[[38;5;39m INFO[0m][29-Jun-24 21:19:29] Chunk 234 = Batch 976737 = Sample 500088833 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:27:27] Total gradient norm stats for 262 steps: 0.2363 <= 0.2506 + 0.008323z <= 0.3025 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:27:27] Trained chunk 234 in 478.0s at 4490noun/s: lr=6.60e-04, loss=1.37e+00, top1=68.91%/70.024% |
|
[[38;5;39m INFO[0m][29-Jun-24 21:27:27] Chunk 235 = Batch 980929 = Sample 502235137 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:35:26] Total gradient norm stats for 262 steps: 0.2369 <= 0.2515 + 0.01248z <= 0.4006 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:35:26] Trained chunk 235 in 478.8s at 4483noun/s: lr=6.55e-04, loss=1.37e+00, top1=70.00%/70.034% |
|
[[38;5;39m INFO[0m][29-Jun-24 21:35:26] Chunk 236 = Batch 985121 = Sample 504381441 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:43:25] Total gradient norm stats for 262 steps: 0.2376 <= 0.2516 + 0.006931z <= 0.2864 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:43:25] Trained chunk 236 in 478.4s at 4486noun/s: lr=6.50e-04, loss=1.37e+00, top1=69.77%/70.043% |
|
[[38;5;39m INFO[0m][29-Jun-24 21:43:25] Chunk 237 = Batch 989313 = Sample 506527745 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:51:22] Total gradient norm stats for 262 steps: 0.2386 <= 0.2539 + 0.01561z <= 0.4689 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:51:22] Trained chunk 237 in 477.8s at 4492noun/s: lr=6.44e-04, loss=1.37e+00, top1=71.15%/70.054% |
|
[[38;5;39m INFO[0m][29-Jun-24 21:51:22] Chunk 238 = Batch 993505 = Sample 508674049 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:59:21] Total gradient norm stats for 262 steps: 0.2383 <= 0.2531 + 0.00904z <= 0.2921 |
|
[[38;5;39m INFO[0m][29-Jun-24 21:59:21] Trained chunk 238 in 478.1s at 4489noun/s: lr=6.39e-04, loss=1.37e+00, top1=69.48%/70.061% |
|
[[38;5;39m INFO[0m][29-Jun-24 21:59:21] Chunk 239 = Batch 997697 = Sample 510820353 |
|
[[38;5;39m INFO[0m][29-Jun-24 22:07:19] Total gradient norm stats for 262 steps: 0.2383 <= 0.253 + 0.007313z <= 0.2793 |
|
[[38;5;39m INFO[0m][29-Jun-24 22:07:19] Trained chunk 239 in 478.3s at 4487noun/s: lr=6.33e-04, loss=1.37e+00, top1=70.61%/70.067% |
|
[[38;5;39m INFO[0m][29-Jun-24 22:07:19] Chunk 240 = Batch 1001889 = Sample 512966657 |
|
[[38;5;39m INFO[0m][29-Jun-24 22:15:18] Total gradient norm stats for 262 steps: 0.2386 <= 0.2546 + 0.008644z <= 0.2895 |
|
[[38;5;39m INFO[0m][29-Jun-24 22:15:18] Trained chunk 240 in 478.9s at 4482noun/s: lr=6.28e-04, loss=1.37e+00, top1=70.50%/70.073% |
|
[[38;5;39m INFO[0m][29-Jun-24 22:15:18] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240628_142131/ovod_chunk0240_20240629_221518.train |
|
[[38;5;39m INFO[0m][29-Jun-24 22:15:18] Chunk 241 = Batch 1006081 = Sample 515112961 |
|
[[38;5;39m INFO[0m][29-Jun-24 22:22:56] Epoch 10 finished in 11521.2s |
|
[[38;5;39m INFO[0m][29-Jun-24 22:22:56] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][29-Jun-24 22:22:56] Epoch 11 = Batch 1010081 = Sample 517160961 |
|
[[38;5;39m INFO[0m][29-Jun-24 22:23:19] Total gradient norm stats for 262 steps: 0.2406 <= 0.2556 + 0.009099z <= 0.3027 |
|
[[38;5;39m INFO[0m][29-Jun-24 22:23:19] Trained chunk 241 in 480.5s at 4467noun/s: lr=6.23e-04, loss=1.37e+00, top1=70.60%/70.081% |
|
[[38;5;39m INFO[0m][29-Jun-24 22:23:19] Chunk 242 = Batch 1010273 = Sample 517259265 |
|
[[38;5;39m INFO[0m][29-Jun-24 22:31:18] Total gradient norm stats for 262 steps: 0.2436 <= 0.2557 + 0.00778z <= 0.2952 |
|
[[38;5;39m INFO[0m][29-Jun-24 22:31:18] Trained chunk 242 in 479.5s at 4476noun/s: lr=6.17e-04, loss=1.37e+00, top1=70.35%/70.092% |
|
[[38;5;39m INFO[0m][29-Jun-24 22:31:18] Chunk 243 = Batch 1014465 = Sample 519405569 |
|
[[38;5;39m INFO[0m][29-Jun-24 22:39:18] Total gradient norm stats for 262 steps: 0.2425 <= 0.2576 + 0.009126z <= 0.3038 |
|
[[38;5;39m INFO[0m][29-Jun-24 22:39:18] Trained chunk 243 in 479.3s at 4478noun/s: lr=6.12e-04, loss=1.37e+00, top1=70.39%/70.100% |
|
[[38;5;39m INFO[0m][29-Jun-24 22:39:18] Chunk 244 = Batch 1018657 = Sample 521551873 |
|
[[38;5;39m INFO[0m][29-Jun-24 22:47:17] Total gradient norm stats for 262 steps: 0.2422 <= 0.259 + 0.03092z <= 0.7391 |
|
[[38;5;39m INFO[0m][29-Jun-24 22:47:17] Trained chunk 244 in 478.9s at 4481noun/s: lr=6.07e-04, loss=1.37e+00, top1=69.96%/70.110% |
|
[[38;5;39m INFO[0m][29-Jun-24 22:47:17] Chunk 245 = Batch 1022849 = Sample 523698177 |
|
[[38;5;39m INFO[0m][29-Jun-24 22:55:15] Total gradient norm stats for 262 steps: 0.2438 <= 0.263 + 0.091z <= 1.725 (clipped to 1) |
|
[[38;5;39m INFO[0m][29-Jun-24 22:55:15] Trained chunk 245 in 478.4s at 4487noun/s: lr=6.01e-04, loss=1.37e+00, top1=69.09%/70.120% |
|
[[38;5;39m INFO[0m][29-Jun-24 22:55:15] Chunk 246 = Batch 1027041 = Sample 525844481 |
|
[[38;5;39m INFO[0m][29-Jun-24 23:03:13] Total gradient norm stats for 262 steps: 0.2451 <= 0.2593 + 0.009851z <= 0.3228 |
|
[[38;5;39m INFO[0m][29-Jun-24 23:03:13] Trained chunk 246 in 478.4s at 4486noun/s: lr=5.96e-04, loss=1.37e+00, top1=70.38%/70.128% |
|
[[38;5;39m INFO[0m][29-Jun-24 23:03:13] Chunk 247 = Batch 1031233 = Sample 527990785 |
|
[[38;5;39m INFO[0m][29-Jun-24 23:11:12] Total gradient norm stats for 262 steps: 0.2429 <= 0.2589 + 0.008579z <= 0.3219 |
|
[[38;5;39m INFO[0m][29-Jun-24 23:11:12] Trained chunk 247 in 478.9s at 4482noun/s: lr=5.91e-04, loss=1.36e+00, top1=70.79%/70.138% |
|
[[38;5;39m INFO[0m][29-Jun-24 23:11:12] Chunk 248 = Batch 1035425 = Sample 530137089 |
|
[[38;5;39m INFO[0m][29-Jun-24 23:19:11] Total gradient norm stats for 262 steps: 0.2441 <= 0.2607 + 0.02253z <= 0.5842 |
|
[[38;5;39m INFO[0m][29-Jun-24 23:19:11] Trained chunk 248 in 478.5s at 4486noun/s: lr=5.85e-04, loss=1.36e+00, top1=70.13%/70.146% |
|
[[38;5;39m INFO[0m][29-Jun-24 23:19:11] Chunk 249 = Batch 1039617 = Sample 532283393 |
|
[[38;5;39m INFO[0m][29-Jun-24 23:27:10] Total gradient norm stats for 262 steps: 0.2432 <= 0.2591 + 0.007653z <= 0.2937 |
|
[[38;5;39m INFO[0m][29-Jun-24 23:27:10] Trained chunk 249 in 479.0s at 4481noun/s: lr=5.80e-04, loss=1.36e+00, top1=69.56%/70.157% |
|
[[38;5;39m INFO[0m][29-Jun-24 23:27:10] Chunk 250 = Batch 1043809 = Sample 534429697 |
|
[[38;5;39m INFO[0m][29-Jun-24 23:35:08] Total gradient norm stats for 262 steps: 0.2472 <= 0.2601 + 0.007927z <= 0.2949 |
|
[[38;5;39m INFO[0m][29-Jun-24 23:35:08] Trained chunk 250 in 478.6s at 4485noun/s: lr=5.75e-04, loss=1.36e+00, top1=70.59%/70.168% |
|
[[38;5;39m INFO[0m][29-Jun-24 23:35:08] Chunk 251 = Batch 1048001 = Sample 536576001 |
|
[[38;5;39m INFO[0m][29-Jun-24 23:43:08] Total gradient norm stats for 262 steps: 0.2485 <= 0.2607 + 0.007072z <= 0.2862 |
|
[[38;5;39m INFO[0m][29-Jun-24 23:43:08] Trained chunk 251 in 479.5s at 4476noun/s: lr=5.69e-04, loss=1.36e+00, top1=70.19%/70.176% |
|
[[38;5;39m INFO[0m][29-Jun-24 23:43:08] Chunk 252 = Batch 1052193 = Sample 538722305 |
|
[[38;5;39m INFO[0m][29-Jun-24 23:51:07] Total gradient norm stats for 262 steps: 0.2482 <= 0.2646 + 0.0547z <= 1.138 (clipped to 1) |
|
[[38;5;39m INFO[0m][29-Jun-24 23:51:07] Trained chunk 252 in 479.1s at 4480noun/s: lr=5.64e-04, loss=1.36e+00, top1=70.31%/70.188% |
|
[[38;5;39m INFO[0m][29-Jun-24 23:51:07] Chunk 253 = Batch 1056385 = Sample 540868609 |
|
[[38;5;39m INFO[0m][29-Jun-24 23:59:05] Total gradient norm stats for 262 steps: 0.2485 <= 0.2643 + 0.03071z <= 0.7424 |
|
[[38;5;39m INFO[0m][29-Jun-24 23:59:05] Trained chunk 253 in 478.5s at 4486noun/s: lr=5.59e-04, loss=1.36e+00, top1=70.46%/70.197% |
|
[[38;5;39m INFO[0m][29-Jun-24 23:59:05] Chunk 254 = Batch 1060577 = Sample 543014913 |
|
[[38;5;39m INFO[0m][30-Jun-24 00:07:04] Total gradient norm stats for 262 steps: 0.2465 <= 0.2618 + 0.007016z <= 0.2972 |
|
[[38;5;39m INFO[0m][30-Jun-24 00:07:04] Trained chunk 254 in 478.3s at 4487noun/s: lr=5.54e-04, loss=1.36e+00, top1=69.92%/70.204% |
|
[[38;5;39m INFO[0m][30-Jun-24 00:07:04] Chunk 255 = Batch 1064769 = Sample 545161217 |
|
[[38;5;39m INFO[0m][30-Jun-24 00:15:02] Total gradient norm stats for 262 steps: 0.247 <= 0.2649 + 0.01918z <= 0.538 |
|
[[38;5;39m INFO[0m][30-Jun-24 00:15:02] Trained chunk 255 in 478.6s at 4484noun/s: lr=5.48e-04, loss=1.36e+00, top1=71.06%/70.212% |
|
[[38;5;39m INFO[0m][30-Jun-24 00:15:02] Chunk 256 = Batch 1068961 = Sample 547307521 |
|
[[38;5;39m INFO[0m][30-Jun-24 00:23:00] Total gradient norm stats for 262 steps: 0.2507 <= 0.2637 + 0.006754z <= 0.2928 |
|
[[38;5;39m INFO[0m][30-Jun-24 00:23:00] Trained chunk 256 in 477.9s at 4491noun/s: lr=5.43e-04, loss=1.36e+00, top1=70.45%/70.220% |
|
[[38;5;39m INFO[0m][30-Jun-24 00:23:00] Chunk 257 = Batch 1073153 = Sample 549453825 |
|
[[38;5;39m INFO[0m][30-Jun-24 00:30:59] Total gradient norm stats for 262 steps: 0.2523 <= 0.2657 + 0.01651z <= 0.4979 |
|
[[38;5;39m INFO[0m][30-Jun-24 00:30:59] Trained chunk 257 in 478.9s at 4482noun/s: lr=5.38e-04, loss=1.36e+00, top1=70.91%/70.230% |
|
[[38;5;39m INFO[0m][30-Jun-24 00:30:59] Chunk 258 = Batch 1077345 = Sample 551600129 |
|
[[38;5;39m INFO[0m][30-Jun-24 00:38:57] Total gradient norm stats for 262 steps: 0.2495 <= 0.2656 + 0.008565z <= 0.3266 |
|
[[38;5;39m INFO[0m][30-Jun-24 00:38:57] Trained chunk 258 in 478.0s at 4490noun/s: lr=5.33e-04, loss=1.36e+00, top1=69.50%/70.236% |
|
[[38;5;39m INFO[0m][30-Jun-24 00:38:57] Chunk 259 = Batch 1081537 = Sample 553746433 |
|
[[38;5;39m INFO[0m][30-Jun-24 00:46:55] Total gradient norm stats for 262 steps: 0.2535 <= 0.2676 + 0.009303z <= 0.3037 |
|
[[38;5;39m INFO[0m][30-Jun-24 00:46:55] Trained chunk 259 in 478.1s at 4490noun/s: lr=5.28e-04, loss=1.36e+00, top1=70.90%/70.247% |
|
[[38;5;39m INFO[0m][30-Jun-24 00:46:55] Chunk 260 = Batch 1085729 = Sample 555892737 |
|
[[38;5;39m INFO[0m][30-Jun-24 00:54:54] Total gradient norm stats for 262 steps: 0.2524 <= 0.2666 + 0.01281z <= 0.4179 |
|
[[38;5;39m INFO[0m][30-Jun-24 00:54:54] Trained chunk 260 in 479.3s at 4478noun/s: lr=5.22e-04, loss=1.36e+00, top1=69.58%/70.258% |
|
[[38;5;39m INFO[0m][30-Jun-24 00:54:54] Chunk 261 = Batch 1089921 = Sample 558039041 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:02:53] Total gradient norm stats for 262 steps: 0.2536 <= 0.2669 + 0.007289z <= 0.2944 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:02:53] Trained chunk 261 in 478.6s at 4484noun/s: lr=5.17e-04, loss=1.36e+00, top1=71.06%/70.267% |
|
[[38;5;39m INFO[0m][30-Jun-24 01:02:53] Chunk 262 = Batch 1094113 = Sample 560185345 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:10:51] Total gradient norm stats for 262 steps: 0.2536 <= 0.2698 + 0.02267z <= 0.6097 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:10:51] Trained chunk 262 in 477.7s at 4493noun/s: lr=5.12e-04, loss=1.36e+00, top1=70.89%/70.276% |
|
[[38;5;39m INFO[0m][30-Jun-24 01:10:51] Chunk 263 = Batch 1098305 = Sample 562331649 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:18:50] Total gradient norm stats for 262 steps: 0.2536 <= 0.2672 + 0.007316z <= 0.3065 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:18:50] Trained chunk 263 in 479.4s at 4477noun/s: lr=5.07e-04, loss=1.36e+00, top1=70.73%/70.285% |
|
[[38;5;39m INFO[0m][30-Jun-24 01:18:50] Chunk 264 = Batch 1102497 = Sample 564477953 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:26:48] Total gradient norm stats for 262 steps: 0.2561 <= 0.2749 + 0.09318z <= 1.772 (clipped to 1) |
|
[[38;5;39m INFO[0m][30-Jun-24 01:26:48] Trained chunk 264 in 478.2s at 4488noun/s: lr=5.02e-04, loss=1.36e+00, top1=70.56%/70.296% |
|
[[38;5;39m INFO[0m][30-Jun-24 01:26:48] Chunk 265 = Batch 1106689 = Sample 566624257 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:34:47] Total gradient norm stats for 262 steps: 0.2562 <= 0.2703 + 0.008159z <= 0.3042 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:34:47] Trained chunk 265 in 478.1s at 4489noun/s: lr=4.97e-04, loss=1.35e+00, top1=70.77%/70.307% |
|
[[38;5;39m INFO[0m][30-Jun-24 01:34:47] Chunk 266 = Batch 1110881 = Sample 568770561 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:35:12] Epoch 11 finished in 11535.6s |
|
[[38;5;39m INFO[0m][30-Jun-24 01:35:12] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][30-Jun-24 01:35:12] Epoch 12 = Batch 1111089 = Sample 568877057 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:42:48] Total gradient norm stats for 262 steps: 0.2588 <= 0.2714 + 0.008353z <= 0.3207 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:42:48] Trained chunk 266 in 481.5s at 4458noun/s: lr=4.91e-04, loss=1.35e+00, top1=70.29%/70.321% |
|
[[38;5;39m INFO[0m][30-Jun-24 01:42:48] Chunk 267 = Batch 1115073 = Sample 570916865 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:50:48] Total gradient norm stats for 262 steps: 0.2575 <= 0.2717 + 0.007424z <= 0.2986 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:50:48] Trained chunk 267 in 479.7s at 4474noun/s: lr=4.86e-04, loss=1.35e+00, top1=69.86%/70.331% |
|
[[38;5;39m INFO[0m][30-Jun-24 01:50:48] Chunk 268 = Batch 1119265 = Sample 573063169 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:58:47] Total gradient norm stats for 262 steps: 0.2574 <= 0.2721 + 0.01186z <= 0.4211 |
|
[[38;5;39m INFO[0m][30-Jun-24 01:58:47] Trained chunk 268 in 479.1s at 4480noun/s: lr=4.81e-04, loss=1.35e+00, top1=69.98%/70.340% |
|
[[38;5;39m INFO[0m][30-Jun-24 01:58:47] Chunk 269 = Batch 1123457 = Sample 575209473 |
|
[[38;5;39m INFO[0m][30-Jun-24 02:06:45] Total gradient norm stats for 262 steps: 0.2589 <= 0.2732 + 0.008775z <= 0.3485 |
|
[[38;5;39m INFO[0m][30-Jun-24 02:06:45] Trained chunk 269 in 478.4s at 4487noun/s: lr=4.76e-04, loss=1.35e+00, top1=70.60%/70.350% |
|
[[38;5;39m INFO[0m][30-Jun-24 02:06:45] Chunk 270 = Batch 1127649 = Sample 577355777 |
|
[[38;5;39m INFO[0m][30-Jun-24 02:14:44] Total gradient norm stats for 262 steps: 0.2596 <= 0.2729 + 0.00705z <= 0.302 |
|
[[38;5;39m INFO[0m][30-Jun-24 02:14:44] Trained chunk 270 in 479.0s at 4481noun/s: lr=4.71e-04, loss=1.35e+00, top1=70.30%/70.362% |
|
[[38;5;39m INFO[0m][30-Jun-24 02:14:44] Chunk 271 = Batch 1131841 = Sample 579502081 |
|
[[38;5;39m INFO[0m][30-Jun-24 02:22:43] Total gradient norm stats for 262 steps: 0.2614 <= 0.2742 + 0.01006z <= 0.3735 |
|
[[38;5;39m INFO[0m][30-Jun-24 02:22:43] Trained chunk 271 in 479.2s at 4479noun/s: lr=4.66e-04, loss=1.35e+00, top1=71.05%/70.371% |
|
[[38;5;39m INFO[0m][30-Jun-24 02:22:43] Chunk 272 = Batch 1136033 = Sample 581648385 |
|
[[38;5;39m INFO[0m][30-Jun-24 02:30:42] Total gradient norm stats for 262 steps: 0.2608 <= 0.2752 + 0.009783z <= 0.3492 |
|
[[38;5;39m INFO[0m][30-Jun-24 02:30:42] Trained chunk 272 in 478.3s at 4487noun/s: lr=4.61e-04, loss=1.35e+00, top1=71.75%/70.380% |
|
[[38;5;39m INFO[0m][30-Jun-24 02:30:42] Chunk 273 = Batch 1140225 = Sample 583794689 |
|
[[38;5;39m INFO[0m][30-Jun-24 02:38:41] Total gradient norm stats for 262 steps: 0.2625 <= 0.2766 + 0.02211z <= 0.6121 |
|
[[38;5;39m INFO[0m][30-Jun-24 02:38:41] Trained chunk 273 in 479.1s at 4480noun/s: lr=4.56e-04, loss=1.35e+00, top1=70.37%/70.390% |
|
[[38;5;39m INFO[0m][30-Jun-24 02:38:41] Chunk 274 = Batch 1144417 = Sample 585940993 |
|
[[38;5;39m INFO[0m][30-Jun-24 02:46:39] Total gradient norm stats for 262 steps: 0.2622 <= 0.276 + 0.006946z <= 0.3165 |
|
[[38;5;39m INFO[0m][30-Jun-24 02:46:39] Trained chunk 274 in 478.1s at 4489noun/s: lr=4.51e-04, loss=1.35e+00, top1=70.81%/70.396% |
|
[[38;5;39m INFO[0m][30-Jun-24 02:46:39] Chunk 275 = Batch 1148609 = Sample 588087297 |
|
[[38;5;39m INFO[0m][30-Jun-24 02:54:37] Total gradient norm stats for 262 steps: 0.264 <= 0.2772 + 0.008567z <= 0.3331 |
|
[[38;5;39m INFO[0m][30-Jun-24 02:54:37] Trained chunk 275 in 477.7s at 4493noun/s: lr=4.46e-04, loss=1.35e+00, top1=69.97%/70.405% |
|
[[38;5;39m INFO[0m][30-Jun-24 02:54:37] Chunk 276 = Batch 1152801 = Sample 590233601 |
|
[[38;5;39m INFO[0m][30-Jun-24 03:02:35] Total gradient norm stats for 262 steps: 0.2652 <= 0.2763 + 0.006443z <= 0.3131 |
|
[[38;5;39m INFO[0m][30-Jun-24 03:02:35] Trained chunk 276 in 478.5s at 4486noun/s: lr=4.41e-04, loss=1.35e+00, top1=69.92%/70.412% |
|
[[38;5;39m INFO[0m][30-Jun-24 03:02:35] Chunk 277 = Batch 1156993 = Sample 592379905 |
|
[[38;5;39m INFO[0m][30-Jun-24 03:10:35] Total gradient norm stats for 262 steps: 0.2656 <= 0.2773 + 0.007438z <= 0.3329 |
|
[[38;5;39m INFO[0m][30-Jun-24 03:10:35] Trained chunk 277 in 479.7s at 4474noun/s: lr=4.36e-04, loss=1.35e+00, top1=71.54%/70.420% |
|
[[38;5;39m INFO[0m][30-Jun-24 03:10:35] Chunk 278 = Batch 1161185 = Sample 594526209 |
|
[[38;5;39m INFO[0m][30-Jun-24 03:18:35] Total gradient norm stats for 262 steps: 0.2651 <= 0.2791 + 0.007937z <= 0.3154 |
|
[[38;5;39m INFO[0m][30-Jun-24 03:18:35] Trained chunk 278 in 479.7s at 4474noun/s: lr=4.31e-04, loss=1.35e+00, top1=69.93%/70.431% |
|
[[38;5;39m INFO[0m][30-Jun-24 03:18:35] Chunk 279 = Batch 1165377 = Sample 596672513 |
|
[[38;5;39m INFO[0m][30-Jun-24 03:26:33] Total gradient norm stats for 262 steps: 0.2674 <= 0.2796 + 0.008057z <= 0.3349 |
|
[[38;5;39m INFO[0m][30-Jun-24 03:26:33] Trained chunk 279 in 478.4s at 4487noun/s: lr=4.26e-04, loss=1.35e+00, top1=69.38%/70.442% |
|
[[38;5;39m INFO[0m][30-Jun-24 03:26:33] Chunk 280 = Batch 1169569 = Sample 598818817 |
|
[[38;5;39m INFO[0m][30-Jun-24 03:34:31] Total gradient norm stats for 262 steps: 0.2668 <= 0.2807 + 0.009233z <= 0.363 |
|
[[38;5;39m INFO[0m][30-Jun-24 03:34:31] Trained chunk 280 in 478.2s at 4488noun/s: lr=4.21e-04, loss=1.35e+00, top1=70.59%/70.451% |
|
[[38;5;39m INFO[0m][30-Jun-24 03:34:31] Chunk 281 = Batch 1173761 = Sample 600965121 |
|
[[38;5;39m INFO[0m][30-Jun-24 03:42:29] Total gradient norm stats for 262 steps: 0.2689 <= 0.2817 + 0.01317z <= 0.4142 |
|
[[38;5;39m INFO[0m][30-Jun-24 03:42:29] Trained chunk 281 in 477.4s at 4496noun/s: lr=4.17e-04, loss=1.34e+00, top1=70.03%/70.459% |
|
[[38;5;39m INFO[0m][30-Jun-24 03:42:29] Chunk 282 = Batch 1177953 = Sample 603111425 |
|
[[38;5;39m INFO[0m][30-Jun-24 03:50:27] Total gradient norm stats for 262 steps: 0.2683 <= 0.2804 + 0.005932z <= 0.3092 |
|
[[38;5;39m INFO[0m][30-Jun-24 03:50:27] Trained chunk 282 in 478.2s at 4489noun/s: lr=4.12e-04, loss=1.34e+00, top1=69.42%/70.470% |
|
[[38;5;39m INFO[0m][30-Jun-24 03:50:27] Chunk 283 = Batch 1182145 = Sample 605257729 |
|
[[38;5;39m INFO[0m][30-Jun-24 03:58:25] Total gradient norm stats for 262 steps: 0.2697 <= 0.2837 + 0.0462z <= 1.02 (clipped to 1) |
|
[[38;5;39m INFO[0m][30-Jun-24 03:58:25] Trained chunk 283 in 478.6s at 4484noun/s: lr=4.07e-04, loss=1.34e+00, top1=69.56%/70.479% |
|
[[38;5;39m INFO[0m][30-Jun-24 03:58:25] Chunk 284 = Batch 1186337 = Sample 607404033 |
|
[[38;5;39m INFO[0m][30-Jun-24 04:06:24] Total gradient norm stats for 262 steps: 0.2712 <= 0.2843 + 0.01136z <= 0.4248 |
|
[[38;5;39m INFO[0m][30-Jun-24 04:06:24] Trained chunk 284 in 478.8s at 4482noun/s: lr=4.02e-04, loss=1.34e+00, top1=70.21%/70.487% |
|
[[38;5;39m INFO[0m][30-Jun-24 04:06:24] Chunk 285 = Batch 1190529 = Sample 609550337 |
|
[[38;5;39m INFO[0m][30-Jun-24 04:14:23] Total gradient norm stats for 262 steps: 0.2712 <= 0.2836 + 0.008247z <= 0.3632 |
|
[[38;5;39m INFO[0m][30-Jun-24 04:14:23] Trained chunk 285 in 478.6s at 4484noun/s: lr=3.97e-04, loss=1.34e+00, top1=70.64%/70.499% |
|
[[38;5;39m INFO[0m][30-Jun-24 04:14:23] Chunk 286 = Batch 1194721 = Sample 611696641 |
|
[[38;5;39m INFO[0m][30-Jun-24 04:22:21] Total gradient norm stats for 262 steps: 0.2718 <= 0.2835 + 0.006175z <= 0.3043 |
|
[[38;5;39m INFO[0m][30-Jun-24 04:22:21] Trained chunk 286 in 477.7s at 4493noun/s: lr=3.92e-04, loss=1.34e+00, top1=69.94%/70.509% |
|
[[38;5;39m INFO[0m][30-Jun-24 04:22:21] Chunk 287 = Batch 1198913 = Sample 613842945 |
|
[[38;5;39m INFO[0m][30-Jun-24 04:30:18] Total gradient norm stats for 262 steps: 0.2715 <= 0.2853 + 0.009343z <= 0.3874 |
|
[[38;5;39m INFO[0m][30-Jun-24 04:30:18] Trained chunk 287 in 477.7s at 4493noun/s: lr=3.88e-04, loss=1.34e+00, top1=70.27%/70.520% |
|
[[38;5;39m INFO[0m][30-Jun-24 04:30:18] Chunk 288 = Batch 1203105 = Sample 615989249 |
|
[[38;5;39m INFO[0m][30-Jun-24 04:38:16] Total gradient norm stats for 262 steps: 0.2709 <= 0.2858 + 0.007408z <= 0.3386 |
|
[[38;5;39m INFO[0m][30-Jun-24 04:38:16] Trained chunk 288 in 477.9s at 4491noun/s: lr=3.83e-04, loss=1.34e+00, top1=71.02%/70.528% |
|
[[38;5;39m INFO[0m][30-Jun-24 04:38:17] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240628_142131/ovod_chunk0288_20240630_043816.train |
|
[[38;5;39m INFO[0m][30-Jun-24 04:38:17] Chunk 289 = Batch 1207297 = Sample 618135553 |
|
[[38;5;39m INFO[0m][30-Jun-24 04:46:16] Total gradient norm stats for 262 steps: 0.2738 <= 0.2863 + 0.006011z <= 0.3042 |
|
[[38;5;39m INFO[0m][30-Jun-24 04:46:16] Trained chunk 289 in 479.4s at 4477noun/s: lr=3.78e-04, loss=1.34e+00, top1=69.97%/70.541% |
|
[[38;5;39m INFO[0m][30-Jun-24 04:46:16] Chunk 290 = Batch 1211489 = Sample 620281857 |
|
[[38;5;39m INFO[0m][30-Jun-24 04:47:27] Epoch 12 finished in 11534.7s |
|
[[38;5;39m INFO[0m][30-Jun-24 04:47:27] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][30-Jun-24 04:47:27] Epoch 13 = Batch 1212097 = Sample 620593153 |
|
[[38;5;39m INFO[0m][30-Jun-24 04:54:16] Total gradient norm stats for 262 steps: 0.2735 <= 0.2873 + 0.008925z <= 0.3903 |
|
[[38;5;39m INFO[0m][30-Jun-24 04:54:16] Trained chunk 290 in 480.0s at 4472noun/s: lr=3.73e-04, loss=1.34e+00, top1=70.39%/70.552% |
|
[[38;5;39m INFO[0m][30-Jun-24 04:54:16] Chunk 291 = Batch 1215681 = Sample 622428161 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:02:14] Total gradient norm stats for 262 steps: 0.2749 <= 0.289 + 0.03795z <= 0.8934 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:02:14] Trained chunk 291 in 477.9s at 4491noun/s: lr=3.69e-04, loss=1.34e+00, top1=70.91%/70.564% |
|
[[38;5;39m INFO[0m][30-Jun-24 05:02:14] Chunk 292 = Batch 1219873 = Sample 624574465 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:10:12] Total gradient norm stats for 262 steps: 0.276 <= 0.2883 + 0.006686z <= 0.3332 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:10:12] Trained chunk 292 in 477.8s at 4492noun/s: lr=3.64e-04, loss=1.34e+00, top1=69.91%/70.574% |
|
[[38;5;39m INFO[0m][30-Jun-24 05:10:12] Chunk 293 = Batch 1224065 = Sample 626720769 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:18:10] Total gradient norm stats for 262 steps: 0.2773 <= 0.2898 + 0.006928z <= 0.3285 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:18:10] Trained chunk 293 in 478.5s at 4485noun/s: lr=3.59e-04, loss=1.34e+00, top1=70.20%/70.585% |
|
[[38;5;39m INFO[0m][30-Jun-24 05:18:10] Chunk 294 = Batch 1228257 = Sample 628867073 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:26:08] Total gradient norm stats for 262 steps: 0.2756 <= 0.2894 + 0.00621z <= 0.3136 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:26:08] Trained chunk 294 in 478.2s at 4489noun/s: lr=3.55e-04, loss=1.34e+00, top1=70.49%/70.597% |
|
[[38;5;39m INFO[0m][30-Jun-24 05:26:08] Chunk 295 = Batch 1232449 = Sample 631013377 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:34:06] Total gradient norm stats for 262 steps: 0.278 <= 0.2913 + 0.03024z <= 0.7701 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:34:06] Trained chunk 295 in 477.3s at 4497noun/s: lr=3.50e-04, loss=1.34e+00, top1=71.41%/70.607% |
|
[[38;5;39m INFO[0m][30-Jun-24 05:34:06] Chunk 296 = Batch 1236641 = Sample 633159681 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:42:04] Total gradient norm stats for 262 steps: 0.2796 <= 0.2909 + 0.008893z <= 0.4072 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:42:04] Trained chunk 296 in 478.4s at 4487noun/s: lr=3.46e-04, loss=1.34e+00, top1=70.91%/70.617% |
|
[[38;5;39m INFO[0m][30-Jun-24 05:42:04] Chunk 297 = Batch 1240833 = Sample 635305985 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:50:02] Total gradient norm stats for 262 steps: 0.281 <= 0.2907 + 0.004795z <= 0.3075 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:50:02] Trained chunk 297 in 478.4s at 4486noun/s: lr=3.41e-04, loss=1.33e+00, top1=69.84%/70.629% |
|
[[38;5;39m INFO[0m][30-Jun-24 05:50:02] Chunk 298 = Batch 1245025 = Sample 637452289 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:58:02] Total gradient norm stats for 262 steps: 0.2811 <= 0.2923 + 0.01202z <= 0.4615 |
|
[[38;5;39m INFO[0m][30-Jun-24 05:58:02] Trained chunk 298 in 479.2s at 4479noun/s: lr=3.36e-04, loss=1.33e+00, top1=70.40%/70.636% |
|
[[38;5;39m INFO[0m][30-Jun-24 05:58:02] Chunk 299 = Batch 1249217 = Sample 639598593 |
|
[[38;5;39m INFO[0m][30-Jun-24 06:06:00] Total gradient norm stats for 262 steps: 0.2821 <= 0.2943 + 0.01841z <= 0.5754 |
|
[[38;5;39m INFO[0m][30-Jun-24 06:06:00] Trained chunk 299 in 478.3s at 4488noun/s: lr=3.32e-04, loss=1.33e+00, top1=71.07%/70.647% |
|
[[38;5;39m INFO[0m][30-Jun-24 06:06:00] Chunk 300 = Batch 1253409 = Sample 641744897 |
|
[[38;5;39m INFO[0m][30-Jun-24 06:13:58] Total gradient norm stats for 262 steps: 0.2804 <= 0.2937 + 0.00611z <= 0.3256 |
|
[[38;5;39m INFO[0m][30-Jun-24 06:13:58] Trained chunk 300 in 478.3s at 4488noun/s: lr=3.27e-04, loss=1.33e+00, top1=70.82%/70.659% |
|
[[38;5;39m INFO[0m][30-Jun-24 06:13:58] Chunk 301 = Batch 1257601 = Sample 643891201 |
|
[[38;5;39m INFO[0m][30-Jun-24 06:21:57] Total gradient norm stats for 262 steps: 0.2847 <= 0.2954 + 0.007456z <= 0.3359 |
|
[[38;5;39m INFO[0m][30-Jun-24 06:21:57] Trained chunk 301 in 478.3s at 4487noun/s: lr=3.23e-04, loss=1.33e+00, top1=70.28%/70.667% |
|
[[38;5;39m INFO[0m][30-Jun-24 06:21:57] Chunk 302 = Batch 1261793 = Sample 646037505 |
|
[[38;5;39m INFO[0m][30-Jun-24 06:29:55] Total gradient norm stats for 262 steps: 0.2838 <= 0.2962 + 0.006729z <= 0.3371 |
|
[[38;5;39m INFO[0m][30-Jun-24 06:29:55] Trained chunk 302 in 478.6s at 4484noun/s: lr=3.18e-04, loss=1.33e+00, top1=71.14%/70.679% |
|
[[38;5;39m INFO[0m][30-Jun-24 06:29:55] Chunk 303 = Batch 1265985 = Sample 648183809 |
|
[[38;5;39m INFO[0m][30-Jun-24 06:37:53] Total gradient norm stats for 262 steps: 0.2828 <= 0.2968 + 0.006458z <= 0.3231 |
|
[[38;5;39m INFO[0m][30-Jun-24 06:37:53] Trained chunk 303 in 477.7s at 4493noun/s: lr=3.14e-04, loss=1.33e+00, top1=71.39%/70.690% |
|
[[38;5;39m INFO[0m][30-Jun-24 06:37:53] Chunk 304 = Batch 1270177 = Sample 650330113 |
|
[[38;5;39m INFO[0m][30-Jun-24 06:45:51] Total gradient norm stats for 262 steps: 0.2845 <= 0.2973 + 0.008881z <= 0.4051 |
|
[[38;5;39m INFO[0m][30-Jun-24 06:45:51] Trained chunk 304 in 478.1s at 4489noun/s: lr=3.10e-04, loss=1.33e+00, top1=70.52%/70.703% |
|
[[38;5;39m INFO[0m][30-Jun-24 06:45:51] Chunk 305 = Batch 1274369 = Sample 652476417 |
|
[[38;5;39m INFO[0m][30-Jun-24 06:53:48] Total gradient norm stats for 262 steps: 0.2875 <= 0.2988 + 0.01898z <= 0.5892 |
|
[[38;5;39m INFO[0m][30-Jun-24 06:53:48] Trained chunk 305 in 477.4s at 4496noun/s: lr=3.05e-04, loss=1.33e+00, top1=70.11%/70.711% |
|
[[38;5;39m INFO[0m][30-Jun-24 06:53:48] Chunk 306 = Batch 1278561 = Sample 654622721 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:01:46] Total gradient norm stats for 262 steps: 0.2874 <= 0.3043 + 0.08093z <= 1.595 (clipped to 1) |
|
[[38;5;39m INFO[0m][30-Jun-24 07:01:46] Trained chunk 306 in 477.8s at 4492noun/s: lr=3.01e-04, loss=1.33e+00, top1=71.76%/70.721% |
|
[[38;5;39m INFO[0m][30-Jun-24 07:01:46] Chunk 307 = Batch 1282753 = Sample 656769025 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:09:44] Total gradient norm stats for 262 steps: 0.2877 <= 0.2995 + 0.009571z <= 0.4215 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:09:44] Trained chunk 307 in 477.5s at 4495noun/s: lr=2.96e-04, loss=1.33e+00, top1=71.43%/70.731% |
|
[[38;5;39m INFO[0m][30-Jun-24 07:09:44] Chunk 308 = Batch 1286945 = Sample 658915329 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:17:41] Total gradient norm stats for 262 steps: 0.2908 <= 0.302 + 0.01469z <= 0.5058 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:17:41] Trained chunk 308 in 477.7s at 4493noun/s: lr=2.92e-04, loss=1.33e+00, top1=69.73%/70.740% |
|
[[38;5;39m INFO[0m][30-Jun-24 07:17:41] Chunk 309 = Batch 1291137 = Sample 661061633 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:25:39] Total gradient norm stats for 262 steps: 0.2905 <= 0.3007 + 0.006791z <= 0.3796 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:25:39] Trained chunk 309 in 477.3s at 4497noun/s: lr=2.88e-04, loss=1.33e+00, top1=71.11%/70.752% |
|
[[38;5;39m INFO[0m][30-Jun-24 07:25:39] Chunk 310 = Batch 1295329 = Sample 663207937 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:33:37] Total gradient norm stats for 262 steps: 0.2891 <= 0.3015 + 0.006493z <= 0.3421 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:33:37] Trained chunk 310 in 478.0s at 4490noun/s: lr=2.84e-04, loss=1.33e+00, top1=71.30%/70.760% |
|
[[38;5;39m INFO[0m][30-Jun-24 07:33:37] Chunk 311 = Batch 1299521 = Sample 665354241 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:41:36] Total gradient norm stats for 262 steps: 0.292 <= 0.3019 + 0.005011z <= 0.3253 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:41:36] Trained chunk 311 in 479.8s at 4473noun/s: lr=2.79e-04, loss=1.33e+00, top1=70.68%/70.772% |
|
[[38;5;39m INFO[0m][30-Jun-24 07:41:36] Chunk 312 = Batch 1303713 = Sample 667500545 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:49:35] Total gradient norm stats for 262 steps: 0.2899 <= 0.3038 + 0.006162z <= 0.33 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:49:35] Trained chunk 312 in 478.4s at 4486noun/s: lr=2.75e-04, loss=1.33e+00, top1=71.06%/70.785% |
|
[[38;5;39m INFO[0m][30-Jun-24 07:49:35] Chunk 313 = Batch 1307905 = Sample 669646849 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:57:33] Total gradient norm stats for 262 steps: 0.2932 <= 0.305 + 0.02531z <= 0.6997 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:57:33] Trained chunk 313 in 478.3s at 4487noun/s: lr=2.71e-04, loss=1.32e+00, top1=70.67%/70.797% |
|
[[38;5;39m INFO[0m][30-Jun-24 07:57:33] Chunk 314 = Batch 1312097 = Sample 671793153 |
|
[[38;5;39m INFO[0m][30-Jun-24 07:59:29] Epoch 13 finished in 11522.5s |
|
[[38;5;39m INFO[0m][30-Jun-24 07:59:29] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][30-Jun-24 07:59:29] Epoch 14 = Batch 1313105 = Sample 672309249 |
|
[[38;5;39m INFO[0m][30-Jun-24 08:05:33] Total gradient norm stats for 262 steps: 0.2917 <= 0.3046 + 0.004834z <= 0.3216 |
|
[[38;5;39m INFO[0m][30-Jun-24 08:05:33] Trained chunk 314 in 479.9s at 4473noun/s: lr=2.67e-04, loss=1.32e+00, top1=72.30%/70.809% |
|
[[38;5;39m INFO[0m][30-Jun-24 08:05:33] Chunk 315 = Batch 1316289 = Sample 673939457 |
|
[[38;5;39m INFO[0m][30-Jun-24 08:13:31] Total gradient norm stats for 262 steps: 0.2951 <= 0.3065 + 0.01411z <= 0.494 |
|
[[38;5;39m INFO[0m][30-Jun-24 08:13:31] Trained chunk 315 in 478.0s at 4490noun/s: lr=2.63e-04, loss=1.32e+00, top1=70.88%/70.822% |
|
[[38;5;39m INFO[0m][30-Jun-24 08:13:31] Chunk 316 = Batch 1320481 = Sample 676085761 |
|
[[38;5;39m INFO[0m][30-Jun-24 08:21:30] Total gradient norm stats for 262 steps: 0.2964 <= 0.3063 + 0.004818z <= 0.3201 |
|
[[38;5;39m INFO[0m][30-Jun-24 08:21:30] Trained chunk 316 in 478.9s at 4482noun/s: lr=2.58e-04, loss=1.32e+00, top1=70.41%/70.832% |
|
[[38;5;39m INFO[0m][30-Jun-24 08:21:30] Chunk 317 = Batch 1324673 = Sample 678232065 |
|
[[38;5;39m INFO[0m][30-Jun-24 08:29:30] Total gradient norm stats for 262 steps: 0.297 <= 0.3077 + 0.005263z <= 0.323 |
|
[[38;5;39m INFO[0m][30-Jun-24 08:29:30] Trained chunk 317 in 479.7s at 4474noun/s: lr=2.54e-04, loss=1.32e+00, top1=70.17%/70.841% |
|
[[38;5;39m INFO[0m][30-Jun-24 08:29:30] Chunk 318 = Batch 1328865 = Sample 680378369 |
|
[[38;5;39m INFO[0m][30-Jun-24 08:37:29] Total gradient norm stats for 262 steps: 0.2992 <= 0.3101 + 0.01471z <= 0.5302 |
|
[[38;5;39m INFO[0m][30-Jun-24 08:37:29] Trained chunk 318 in 479.1s at 4480noun/s: lr=2.50e-04, loss=1.32e+00, top1=70.40%/70.850% |
|
[[38;5;39m INFO[0m][30-Jun-24 08:37:29] Chunk 319 = Batch 1333057 = Sample 682524673 |
|
[[38;5;39m INFO[0m][30-Jun-24 08:45:27] Total gradient norm stats for 262 steps: 0.2982 <= 0.3087 + 0.005028z <= 0.3239 |
|
[[38;5;39m INFO[0m][30-Jun-24 08:45:27] Trained chunk 319 in 478.6s at 4485noun/s: lr=2.46e-04, loss=1.32e+00, top1=69.73%/70.857% |
|
[[38;5;39m INFO[0m][30-Jun-24 08:45:27] Chunk 320 = Batch 1337249 = Sample 684670977 |
|
[[38;5;39m INFO[0m][30-Jun-24 08:53:26] Total gradient norm stats for 262 steps: 0.2997 <= 0.311 + 0.007088z <= 0.3599 |
|
[[38;5;39m INFO[0m][30-Jun-24 08:53:26] Trained chunk 320 in 478.2s at 4488noun/s: lr=2.42e-04, loss=1.32e+00, top1=70.95%/70.868% |
|
[[38;5;39m INFO[0m][30-Jun-24 08:53:26] Chunk 321 = Batch 1341441 = Sample 686817281 |
|
[[38;5;39m INFO[0m][30-Jun-24 09:01:24] Total gradient norm stats for 262 steps: 0.2998 <= 0.3102 + 0.005316z <= 0.328 |
|
[[38;5;39m INFO[0m][30-Jun-24 09:01:24] Trained chunk 321 in 478.2s at 4488noun/s: lr=2.38e-04, loss=1.32e+00, top1=70.56%/70.878% |
|
[[38;5;39m INFO[0m][30-Jun-24 09:01:24] Chunk 322 = Batch 1345633 = Sample 688963585 |
|
[[38;5;39m INFO[0m][30-Jun-24 09:09:22] Total gradient norm stats for 262 steps: 0.3014 <= 0.3149 + 0.03732z <= 0.8826 |
|
[[38;5;39m INFO[0m][30-Jun-24 09:09:22] Trained chunk 322 in 478.3s at 4487noun/s: lr=2.34e-04, loss=1.32e+00, top1=71.56%/70.889% |
|
[[38;5;39m INFO[0m][30-Jun-24 09:09:22] Chunk 323 = Batch 1349825 = Sample 691109889 |
|
[[38;5;39m INFO[0m][30-Jun-24 09:17:21] Total gradient norm stats for 262 steps: 0.303 <= 0.3148 + 0.03054z <= 0.7985 |
|
[[38;5;39m INFO[0m][30-Jun-24 09:17:21] Trained chunk 323 in 478.3s at 4487noun/s: lr=2.30e-04, loss=1.32e+00, top1=71.84%/70.900% |
|
[[38;5;39m INFO[0m][30-Jun-24 09:17:21] Chunk 324 = Batch 1354017 = Sample 693256193 |
|
[[38;5;39m INFO[0m][30-Jun-24 09:25:19] Total gradient norm stats for 262 steps: 0.3026 <= 0.3164 + 0.06691z <= 1.392 (clipped to 1) |
|
[[38;5;39m INFO[0m][30-Jun-24 09:25:19] Trained chunk 324 in 478.1s at 4489noun/s: lr=2.26e-04, loss=1.32e+00, top1=70.82%/70.910% |
|
[[38;5;39m INFO[0m][30-Jun-24 09:25:19] Chunk 325 = Batch 1358209 = Sample 695402497 |
|
[[38;5;39m INFO[0m][30-Jun-24 09:33:18] Total gradient norm stats for 262 steps: 0.3026 <= 0.314 + 0.005792z <= 0.3376 |
|
[[38;5;39m INFO[0m][30-Jun-24 09:33:18] Trained chunk 325 in 479.4s at 4477noun/s: lr=2.23e-04, loss=1.32e+00, top1=72.25%/70.924% |
|
[[38;5;39m INFO[0m][30-Jun-24 09:33:18] Chunk 326 = Batch 1362401 = Sample 697548801 |
|
[[38;5;39m INFO[0m][30-Jun-24 09:41:17] Total gradient norm stats for 262 steps: 0.3028 <= 0.3143 + 0.007594z <= 0.4014 |
|
[[38;5;39m INFO[0m][30-Jun-24 09:41:17] Trained chunk 326 in 479.2s at 4479noun/s: lr=2.19e-04, loss=1.32e+00, top1=70.33%/70.934% |
|
[[38;5;39m INFO[0m][30-Jun-24 09:41:17] Chunk 327 = Batch 1366593 = Sample 699695105 |
|
[[38;5;39m INFO[0m][30-Jun-24 09:49:16] Total gradient norm stats for 262 steps: 0.3036 <= 0.3156 + 0.01497z <= 0.5399 |
|
[[38;5;39m INFO[0m][30-Jun-24 09:49:16] Trained chunk 327 in 478.7s at 4484noun/s: lr=2.15e-04, loss=1.32e+00, top1=71.04%/70.944% |
|
[[38;5;39m INFO[0m][30-Jun-24 09:49:16] Chunk 328 = Batch 1370785 = Sample 701841409 |
|
[[38;5;39m INFO[0m][30-Jun-24 09:57:15] Total gradient norm stats for 262 steps: 0.3048 <= 0.3154 + 0.01211z <= 0.4954 |
|
[[38;5;39m INFO[0m][30-Jun-24 09:57:15] Trained chunk 328 in 479.1s at 4480noun/s: lr=2.11e-04, loss=1.32e+00, top1=69.85%/70.953% |
|
[[38;5;39m INFO[0m][30-Jun-24 09:57:15] Chunk 329 = Batch 1374977 = Sample 703987713 |
|
[[38;5;39m INFO[0m][30-Jun-24 10:05:13] Total gradient norm stats for 262 steps: 0.3038 <= 0.3175 + 0.01008z <= 0.4543 |
|
[[38;5;39m INFO[0m][30-Jun-24 10:05:13] Trained chunk 329 in 478.3s at 4487noun/s: lr=2.07e-04, loss=1.31e+00, top1=71.67%/70.963% |
|
[[38;5;39m INFO[0m][30-Jun-24 10:05:13] Chunk 330 = Batch 1379169 = Sample 706134017 |
|
[[38;5;39m INFO[0m][30-Jun-24 10:13:12] Total gradient norm stats for 262 steps: 0.3067 <= 0.3176 + 0.00574z <= 0.3572 |
|
[[38;5;39m INFO[0m][30-Jun-24 10:13:12] Trained chunk 330 in 478.2s at 4488noun/s: lr=2.04e-04, loss=1.31e+00, top1=70.50%/70.975% |
|
[[38;5;39m INFO[0m][30-Jun-24 10:13:12] Chunk 331 = Batch 1383361 = Sample 708280321 |
|
[[38;5;39m INFO[0m][30-Jun-24 10:21:11] Total gradient norm stats for 262 steps: 0.3073 <= 0.3182 + 0.005245z <= 0.3396 |
|
[[38;5;39m INFO[0m][30-Jun-24 10:21:11] Trained chunk 331 in 478.9s at 4481noun/s: lr=2.00e-04, loss=1.31e+00, top1=70.25%/70.985% |
|
[[38;5;39m INFO[0m][30-Jun-24 10:21:11] Chunk 332 = Batch 1387553 = Sample 710426625 |
|
[[38;5;39m INFO[0m][30-Jun-24 10:29:09] Total gradient norm stats for 262 steps: 0.3076 <= 0.3202 + 0.009827z <= 0.4429 |
|
[[38;5;39m INFO[0m][30-Jun-24 10:29:09] Trained chunk 332 in 478.1s at 4489noun/s: lr=1.96e-04, loss=1.31e+00, top1=71.33%/70.994% |
|
[[38;5;39m INFO[0m][30-Jun-24 10:29:09] Chunk 333 = Batch 1391745 = Sample 712572929 |
|
[[38;5;39m INFO[0m][30-Jun-24 10:37:07] Total gradient norm stats for 262 steps: 0.3084 <= 0.3211 + 0.01462z <= 0.5374 |
|
[[38;5;39m INFO[0m][30-Jun-24 10:37:07] Trained chunk 333 in 478.4s at 4487noun/s: lr=1.93e-04, loss=1.31e+00, top1=71.30%/71.004% |
|
[[38;5;39m INFO[0m][30-Jun-24 10:37:07] Chunk 334 = Batch 1395937 = Sample 714719233 |
|
[[38;5;39m INFO[0m][30-Jun-24 10:45:06] Total gradient norm stats for 262 steps: 0.309 <= 0.3202 + 0.004861z <= 0.3367 |
|
[[38;5;39m INFO[0m][30-Jun-24 10:45:06] Trained chunk 334 in 478.7s at 4483noun/s: lr=1.89e-04, loss=1.31e+00, top1=70.13%/71.013% |
|
[[38;5;39m INFO[0m][30-Jun-24 10:45:06] Chunk 335 = Batch 1400129 = Sample 716865537 |
|
[[38;5;39m INFO[0m][30-Jun-24 10:53:03] Total gradient norm stats for 262 steps: 0.3097 <= 0.3205 + 0.004587z <= 0.3363 |
|
[[38;5;39m INFO[0m][30-Jun-24 10:53:03] Trained chunk 335 in 477.5s at 4495noun/s: lr=1.85e-04, loss=1.31e+00, top1=71.60%/71.024% |
|
[[38;5;39m INFO[0m][30-Jun-24 10:53:03] Chunk 336 = Batch 1404321 = Sample 719011841 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:01:03] Total gradient norm stats for 262 steps: 0.311 <= 0.3226 + 0.005981z <= 0.3557 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:01:03] Trained chunk 336 in 479.2s at 4479noun/s: lr=1.82e-04, loss=1.31e+00, top1=70.96%/71.037% |
|
[[38;5;39m INFO[0m][30-Jun-24 11:01:03] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240628_142131/ovod_chunk0336_20240630_110103.train |
|
[[38;5;39m INFO[0m][30-Jun-24 11:01:03] Chunk 337 = Batch 1408513 = Sample 721158145 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:09:01] Total gradient norm stats for 262 steps: 0.3103 <= 0.3223 + 0.005367z <= 0.34 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:09:01] Trained chunk 337 in 478.0s at 4490noun/s: lr=1.78e-04, loss=1.31e+00, top1=70.63%/71.048% |
|
[[38;5;39m INFO[0m][30-Jun-24 11:09:01] Chunk 338 = Batch 1412705 = Sample 723304449 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:11:43] Epoch 14 finished in 11534.1s |
|
[[38;5;39m INFO[0m][30-Jun-24 11:11:43] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][30-Jun-24 11:11:43] Epoch 15 = Batch 1414113 = Sample 724025345 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:17:01] Total gradient norm stats for 262 steps: 0.3123 <= 0.3233 + 0.009821z <= 0.4587 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:17:01] Trained chunk 338 in 479.6s at 4475noun/s: lr=1.75e-04, loss=1.31e+00, top1=71.55%/71.060% |
|
[[38;5;39m INFO[0m][30-Jun-24 11:17:01] Chunk 339 = Batch 1416897 = Sample 725450753 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:24:59] Total gradient norm stats for 262 steps: 0.311 <= 0.323 + 0.004955z <= 0.3487 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:24:59] Trained chunk 339 in 478.1s at 4489noun/s: lr=1.71e-04, loss=1.31e+00, top1=71.01%/71.071% |
|
[[38;5;39m INFO[0m][30-Jun-24 11:24:59] Chunk 340 = Batch 1421089 = Sample 727597057 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:32:58] Total gradient norm stats for 262 steps: 0.3143 <= 0.3254 + 0.014z <= 0.5071 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:32:58] Trained chunk 340 in 479.0s at 4481noun/s: lr=1.68e-04, loss=1.31e+00, top1=70.76%/71.081% |
|
[[38;5;39m INFO[0m][30-Jun-24 11:32:58] Chunk 341 = Batch 1425281 = Sample 729743361 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:40:56] Total gradient norm stats for 262 steps: 0.3134 <= 0.3257 + 0.006885z <= 0.4039 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:40:56] Trained chunk 341 in 478.0s at 4491noun/s: lr=1.64e-04, loss=1.31e+00, top1=72.20%/71.091% |
|
[[38;5;39m INFO[0m][30-Jun-24 11:40:56] Chunk 342 = Batch 1429473 = Sample 731889665 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:48:54] Total gradient norm stats for 262 steps: 0.3159 <= 0.3259 + 0.005389z <= 0.3577 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:48:54] Trained chunk 342 in 478.3s at 4487noun/s: lr=1.61e-04, loss=1.31e+00, top1=71.52%/71.101% |
|
[[38;5;39m INFO[0m][30-Jun-24 11:48:54] Chunk 343 = Batch 1433665 = Sample 734035969 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:56:52] Total gradient norm stats for 262 steps: 0.316 <= 0.3269 + 0.004703z <= 0.3506 |
|
[[38;5;39m INFO[0m][30-Jun-24 11:56:52] Trained chunk 343 in 477.6s at 4494noun/s: lr=1.58e-04, loss=1.31e+00, top1=70.95%/71.113% |
|
[[38;5;39m INFO[0m][30-Jun-24 11:56:52] Chunk 344 = Batch 1437857 = Sample 736182273 |
|
[[38;5;39m INFO[0m][30-Jun-24 12:04:50] Total gradient norm stats for 262 steps: 0.3174 <= 0.327 + 0.004745z <= 0.3421 |
|
[[38;5;39m INFO[0m][30-Jun-24 12:04:50] Trained chunk 344 in 478.0s at 4490noun/s: lr=1.54e-04, loss=1.31e+00, top1=71.87%/71.122% |
|
[[38;5;39m INFO[0m][30-Jun-24 12:04:50] Chunk 345 = Batch 1442049 = Sample 738328577 |
|
[[38;5;39m INFO[0m][30-Jun-24 12:12:48] Total gradient norm stats for 262 steps: 0.3164 <= 0.3276 + 0.00472z <= 0.3414 |
|
[[38;5;39m INFO[0m][30-Jun-24 12:12:48] Trained chunk 345 in 478.3s at 4487noun/s: lr=1.51e-04, loss=1.30e+00, top1=69.26%/71.135% |
|
[[38;5;39m INFO[0m][30-Jun-24 12:12:48] Chunk 346 = Batch 1446241 = Sample 740474881 |
|
[[38;5;39m INFO[0m][30-Jun-24 12:20:46] Total gradient norm stats for 262 steps: 0.3183 <= 0.3288 + 0.005466z <= 0.3681 |
|
[[38;5;39m INFO[0m][30-Jun-24 12:20:46] Trained chunk 346 in 477.8s at 4492noun/s: lr=1.48e-04, loss=1.30e+00, top1=71.34%/71.145% |
|
[[38;5;39m INFO[0m][30-Jun-24 12:20:46] Chunk 347 = Batch 1450433 = Sample 742621185 |
|
[[38;5;39m INFO[0m][30-Jun-24 12:28:44] Total gradient norm stats for 262 steps: 0.3191 <= 0.3291 + 0.01103z <= 0.4899 |
|
[[38;5;39m INFO[0m][30-Jun-24 12:28:44] Trained chunk 347 in 478.2s at 4489noun/s: lr=1.45e-04, loss=1.30e+00, top1=70.87%/71.155% |
|
[[38;5;39m INFO[0m][30-Jun-24 12:28:44] Chunk 348 = Batch 1454625 = Sample 744767489 |
|
[[38;5;39m INFO[0m][30-Jun-24 12:36:43] Total gradient norm stats for 262 steps: 0.3199 <= 0.3302 + 0.005691z <= 0.3609 |
|
[[38;5;39m INFO[0m][30-Jun-24 12:36:43] Trained chunk 348 in 478.7s at 4484noun/s: lr=1.41e-04, loss=1.30e+00, top1=70.67%/71.163% |
|
[[38;5;39m INFO[0m][30-Jun-24 12:36:43] Chunk 349 = Batch 1458817 = Sample 746913793 |
|
[[38;5;39m INFO[0m][30-Jun-24 12:44:40] Total gradient norm stats for 262 steps: 0.3185 <= 0.3301 + 0.004359z <= 0.3453 |
|
[[38;5;39m INFO[0m][30-Jun-24 12:44:40] Trained chunk 349 in 478.0s at 4491noun/s: lr=1.38e-04, loss=1.30e+00, top1=71.34%/71.173% |
|
[[38;5;39m INFO[0m][30-Jun-24 12:44:40] Chunk 350 = Batch 1463009 = Sample 749060097 |
|
[[38;5;39m INFO[0m][30-Jun-24 12:52:39] Total gradient norm stats for 262 steps: 0.3224 <= 0.3307 + 0.006142z <= 0.3963 |
|
[[38;5;39m INFO[0m][30-Jun-24 12:52:39] Trained chunk 350 in 478.1s at 4489noun/s: lr=1.35e-04, loss=1.30e+00, top1=71.94%/71.184% |
|
[[38;5;39m INFO[0m][30-Jun-24 12:52:39] Chunk 351 = Batch 1467201 = Sample 751206401 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:00:36] Total gradient norm stats for 262 steps: 0.3209 <= 0.3318 + 0.004938z <= 0.3553 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:00:36] Trained chunk 351 in 477.8s at 4492noun/s: lr=1.32e-04, loss=1.30e+00, top1=71.51%/71.193% |
|
[[38;5;39m INFO[0m][30-Jun-24 13:00:36] Chunk 352 = Batch 1471393 = Sample 753352705 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:08:35] Total gradient norm stats for 262 steps: 0.3214 <= 0.3317 + 0.004444z <= 0.3471 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:08:35] Trained chunk 352 in 478.1s at 4489noun/s: lr=1.29e-04, loss=1.30e+00, top1=71.47%/71.202% |
|
[[38;5;39m INFO[0m][30-Jun-24 13:08:35] Chunk 353 = Batch 1475585 = Sample 755499009 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:16:34] Total gradient norm stats for 262 steps: 0.3239 <= 0.3327 + 0.004674z <= 0.35 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:16:34] Trained chunk 353 in 479.0s at 4481noun/s: lr=1.26e-04, loss=1.30e+00, top1=71.36%/71.210% |
|
[[38;5;39m INFO[0m][30-Jun-24 13:16:34] Chunk 354 = Batch 1479777 = Sample 757645313 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:24:31] Total gradient norm stats for 262 steps: 0.3229 <= 0.3328 + 0.004729z <= 0.3543 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:24:31] Trained chunk 354 in 477.2s at 4498noun/s: lr=1.23e-04, loss=1.30e+00, top1=70.30%/71.222% |
|
[[38;5;39m INFO[0m][30-Jun-24 13:24:31] Chunk 355 = Batch 1483969 = Sample 759791617 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:32:28] Total gradient norm stats for 262 steps: 0.3231 <= 0.3343 + 0.01337z <= 0.5346 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:32:28] Trained chunk 355 in 477.5s at 4495noun/s: lr=1.20e-04, loss=1.30e+00, top1=70.17%/71.228% |
|
[[38;5;39m INFO[0m][30-Jun-24 13:32:28] Chunk 356 = Batch 1488161 = Sample 761937921 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:40:26] Total gradient norm stats for 262 steps: 0.3232 <= 0.3366 + 0.0363z <= 0.9123 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:40:26] Trained chunk 356 in 477.5s at 4495noun/s: lr=1.17e-04, loss=1.30e+00, top1=71.73%/71.238% |
|
[[38;5;39m INFO[0m][30-Jun-24 13:40:26] Chunk 357 = Batch 1492353 = Sample 764084225 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:48:24] Total gradient norm stats for 262 steps: 0.3263 <= 0.3348 + 0.005606z <= 0.3884 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:48:24] Trained chunk 357 in 478.3s at 4487noun/s: lr=1.14e-04, loss=1.30e+00, top1=70.74%/71.250% |
|
[[38;5;39m INFO[0m][30-Jun-24 13:48:24] Chunk 358 = Batch 1496545 = Sample 766230529 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:56:22] Total gradient norm stats for 262 steps: 0.3261 <= 0.3363 + 0.01697z <= 0.599 |
|
[[38;5;39m INFO[0m][30-Jun-24 13:56:22] Trained chunk 358 in 478.0s at 4490noun/s: lr=1.11e-04, loss=1.30e+00, top1=71.88%/71.261% |
|
[[38;5;39m INFO[0m][30-Jun-24 13:56:22] Chunk 359 = Batch 1500737 = Sample 768376833 |
|
[[38;5;39m INFO[0m][30-Jun-24 14:04:20] Total gradient norm stats for 262 steps: 0.3255 <= 0.3354 + 0.005481z <= 0.3797 |
|
[[38;5;39m INFO[0m][30-Jun-24 14:04:20] Trained chunk 359 in 478.4s at 4486noun/s: lr=1.08e-04, loss=1.30e+00, top1=71.63%/71.270% |
|
[[38;5;39m INFO[0m][30-Jun-24 14:04:20] Chunk 360 = Batch 1504929 = Sample 770523137 |
|
[[38;5;39m INFO[0m][30-Jun-24 14:12:19] Total gradient norm stats for 262 steps: 0.3248 <= 0.3376 + 0.02765z <= 0.7756 |
|
[[38;5;39m INFO[0m][30-Jun-24 14:12:19] Trained chunk 360 in 478.3s at 4488noun/s: lr=1.06e-04, loss=1.30e+00, top1=70.74%/71.275% |
|
[[38;5;39m INFO[0m][30-Jun-24 14:12:19] Chunk 361 = Batch 1509121 = Sample 772669441 |
|
[[38;5;39m INFO[0m][30-Jun-24 14:20:17] Total gradient norm stats for 262 steps: 0.3263 <= 0.3363 + 0.004197z <= 0.349 |
|
[[38;5;39m INFO[0m][30-Jun-24 14:20:17] Trained chunk 361 in 478.2s at 4488noun/s: lr=1.03e-04, loss=1.30e+00, top1=71.83%/71.285% |
|
[[38;5;39m INFO[0m][30-Jun-24 14:20:17] Chunk 362 = Batch 1513313 = Sample 774815745 |
|
[[38;5;39m INFO[0m][30-Jun-24 14:23:45] Epoch 15 finished in 11521.7s |
|
[[38;5;39m INFO[0m][30-Jun-24 14:23:45] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][30-Jun-24 14:23:45] Epoch 16 = Batch 1515121 = Sample 775741441 |
|
[[38;5;39m INFO[0m][30-Jun-24 14:28:16] Total gradient norm stats for 262 steps: 0.3272 <= 0.3374 + 0.004268z <= 0.3517 |
|
[[38;5;39m INFO[0m][30-Jun-24 14:28:16] Trained chunk 362 in 479.6s at 4475noun/s: lr=1.00e-04, loss=1.30e+00, top1=71.01%/71.293% |
|
[[38;5;39m INFO[0m][30-Jun-24 14:28:16] Chunk 363 = Batch 1517505 = Sample 776962049 |
|
[[38;5;39m INFO[0m][30-Jun-24 14:36:15] Total gradient norm stats for 262 steps: 0.3289 <= 0.3381 + 0.005091z <= 0.3726 |
|
[[38;5;39m INFO[0m][30-Jun-24 14:36:15] Trained chunk 363 in 478.4s at 4487noun/s: lr=9.73e-05, loss=1.30e+00, top1=71.07%/71.302% |
|
[[38;5;39m INFO[0m][30-Jun-24 14:36:15] Chunk 364 = Batch 1521697 = Sample 779108353 |
|
[[38;5;39m INFO[0m][30-Jun-24 14:44:12] Total gradient norm stats for 262 steps: 0.3301 <= 0.3381 + 0.00451z <= 0.3586 |
|
[[38;5;39m INFO[0m][30-Jun-24 14:44:12] Trained chunk 364 in 477.2s at 4498noun/s: lr=9.47e-05, loss=1.29e+00, top1=71.26%/71.311% |
|
[[38;5;39m INFO[0m][30-Jun-24 14:44:12] Chunk 365 = Batch 1525889 = Sample 781254657 |
|
[[38;5;39m INFO[0m][30-Jun-24 14:52:10] Total gradient norm stats for 262 steps: 0.329 <= 0.3426 + 0.06564z <= 1.398 (clipped to 1) |
|
[[38;5;39m INFO[0m][30-Jun-24 14:52:10] Trained chunk 365 in 477.7s at 4493noun/s: lr=9.20e-05, loss=1.29e+00, top1=71.56%/71.322% |
|
[[38;5;39m INFO[0m][30-Jun-24 14:52:10] Chunk 366 = Batch 1530081 = Sample 783400961 |
|
[[38;5;39m INFO[0m][30-Jun-24 15:00:08] Total gradient norm stats for 262 steps: 0.33 <= 0.3401 + 0.01807z <= 0.6216 |
|
[[38;5;39m INFO[0m][30-Jun-24 15:00:08] Trained chunk 366 in 478.3s at 4487noun/s: lr=8.94e-05, loss=1.29e+00, top1=71.77%/71.332% |
|
[[38;5;39m INFO[0m][30-Jun-24 15:00:08] Chunk 367 = Batch 1534273 = Sample 785547265 |
|
[[38;5;39m INFO[0m][30-Jun-24 15:08:06] Total gradient norm stats for 262 steps: 0.3302 <= 0.3393 + 0.006007z <= 0.4024 |
|
[[38;5;39m INFO[0m][30-Jun-24 15:08:06] Trained chunk 367 in 478.1s at 4489noun/s: lr=8.69e-05, loss=1.29e+00, top1=71.41%/71.342% |
|
[[38;5;39m INFO[0m][30-Jun-24 15:08:06] Chunk 368 = Batch 1538465 = Sample 787693569 |
|
[[38;5;39m INFO[0m][30-Jun-24 15:16:04] Total gradient norm stats for 262 steps: 0.3283 <= 0.34 + 0.004389z <= 0.3566 |
|
[[38;5;39m INFO[0m][30-Jun-24 15:16:04] Trained chunk 368 in 477.9s at 4491noun/s: lr=8.44e-05, loss=1.29e+00, top1=72.46%/71.350% |
|
[[38;5;39m INFO[0m][30-Jun-24 15:16:04] Chunk 369 = Batch 1542657 = Sample 789839873 |
|
[[38;5;39m INFO[0m][30-Jun-24 15:24:03] Total gradient norm stats for 262 steps: 0.3303 <= 0.34 + 0.003965z <= 0.3558 |
|
[[38;5;39m INFO[0m][30-Jun-24 15:24:03] Trained chunk 369 in 478.4s at 4487noun/s: lr=8.19e-05, loss=1.29e+00, top1=72.39%/71.357% |
|
[[38;5;39m INFO[0m][30-Jun-24 15:24:03] Chunk 370 = Batch 1546849 = Sample 791986177 |
|
[[38;5;39m INFO[0m][30-Jun-24 15:32:01] Total gradient norm stats for 262 steps: 0.3303 <= 0.3407 + 0.005044z <= 0.3731 |
|
[[38;5;39m INFO[0m][30-Jun-24 15:32:01] Trained chunk 370 in 478.6s at 4485noun/s: lr=7.94e-05, loss=1.29e+00, top1=71.24%/71.365% |
|
[[38;5;39m INFO[0m][30-Jun-24 15:32:01] Chunk 371 = Batch 1551041 = Sample 794132481 |
|
[[38;5;39m INFO[0m][30-Jun-24 15:39:59] Total gradient norm stats for 262 steps: 0.3325 <= 0.3412 + 0.004102z <= 0.3574 |
|
[[38;5;39m INFO[0m][30-Jun-24 15:39:59] Trained chunk 371 in 478.0s at 4490noun/s: lr=7.70e-05, loss=1.29e+00, top1=71.38%/71.374% |
|
[[38;5;39m INFO[0m][30-Jun-24 15:39:59] Chunk 372 = Batch 1555233 = Sample 796278785 |
|
[[38;5;39m INFO[0m][30-Jun-24 15:47:58] Total gradient norm stats for 262 steps: 0.3328 <= 0.342 + 0.01327z <= 0.5465 |
|
[[38;5;39m INFO[0m][30-Jun-24 15:47:58] Trained chunk 372 in 478.4s at 4486noun/s: lr=7.46e-05, loss=1.29e+00, top1=71.58%/71.382% |
|
[[38;5;39m INFO[0m][30-Jun-24 15:47:58] Chunk 373 = Batch 1559425 = Sample 798425089 |
|
[[38;5;39m INFO[0m][30-Jun-24 15:55:57] Total gradient norm stats for 262 steps: 0.3289 <= 0.3575 + 0.2586z <= 4.527 (clipped to 1) |
|
[[38;5;39m INFO[0m][30-Jun-24 15:55:57] Trained chunk 373 in 479.2s at 4479noun/s: lr=7.23e-05, loss=1.29e+00, top1=70.52%/71.391% |
|
[[38;5;39m INFO[0m][30-Jun-24 15:55:57] Chunk 374 = Batch 1563617 = Sample 800571393 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:03:55] Total gradient norm stats for 262 steps: 0.3334 <= 0.3429 + 0.01613z <= 0.594 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:03:55] Trained chunk 374 in 478.3s at 4487noun/s: lr=7.00e-05, loss=1.29e+00, top1=71.39%/71.397% |
|
[[38;5;39m INFO[0m][30-Jun-24 16:03:55] Chunk 375 = Batch 1567809 = Sample 802717697 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:11:54] Total gradient norm stats for 262 steps: 0.3323 <= 0.3425 + 0.004408z <= 0.3617 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:11:54] Trained chunk 375 in 478.7s at 4483noun/s: lr=6.77e-05, loss=1.29e+00, top1=71.55%/71.407% |
|
[[38;5;39m INFO[0m][30-Jun-24 16:11:54] Chunk 376 = Batch 1572001 = Sample 804864001 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:19:52] Total gradient norm stats for 262 steps: 0.3316 <= 0.3433 + 0.00838z <= 0.463 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:19:52] Trained chunk 376 in 478.3s at 4488noun/s: lr=6.54e-05, loss=1.29e+00, top1=71.82%/71.416% |
|
[[38;5;39m INFO[0m][30-Jun-24 16:19:52] Chunk 377 = Batch 1576193 = Sample 807010305 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:27:50] Total gradient norm stats for 262 steps: 0.3337 <= 0.3431 + 0.00377z <= 0.3582 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:27:50] Trained chunk 377 in 478.0s at 4490noun/s: lr=6.32e-05, loss=1.29e+00, top1=71.46%/71.424% |
|
[[38;5;39m INFO[0m][30-Jun-24 16:27:50] Chunk 378 = Batch 1580385 = Sample 809156609 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:35:48] Total gradient norm stats for 262 steps: 0.3346 <= 0.3436 + 0.004981z <= 0.3894 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:35:48] Trained chunk 378 in 477.9s at 4491noun/s: lr=6.11e-05, loss=1.29e+00, top1=72.14%/71.429% |
|
[[38;5;39m INFO[0m][30-Jun-24 16:35:48] Chunk 379 = Batch 1584577 = Sample 811302913 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:43:45] Total gradient norm stats for 262 steps: 0.3349 <= 0.3449 + 0.01659z <= 0.605 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:43:45] Trained chunk 379 in 477.5s at 4494noun/s: lr=5.89e-05, loss=1.29e+00, top1=70.39%/71.436% |
|
[[38;5;39m INFO[0m][30-Jun-24 16:43:45] Chunk 380 = Batch 1588769 = Sample 813449217 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:51:44] Total gradient norm stats for 262 steps: 0.3345 <= 0.3438 + 0.003773z <= 0.3576 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:51:44] Trained chunk 380 in 478.5s at 4485noun/s: lr=5.68e-05, loss=1.29e+00, top1=71.01%/71.445% |
|
[[38;5;39m INFO[0m][30-Jun-24 16:51:44] Chunk 381 = Batch 1592961 = Sample 815595521 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:59:42] Total gradient norm stats for 262 steps: 0.3332 <= 0.3444 + 0.003848z <= 0.3569 |
|
[[38;5;39m INFO[0m][30-Jun-24 16:59:42] Trained chunk 381 in 477.6s at 4493noun/s: lr=5.48e-05, loss=1.29e+00, top1=72.07%/71.450% |
|
[[38;5;39m INFO[0m][30-Jun-24 16:59:42] Chunk 382 = Batch 1597153 = Sample 817741825 |
|
[[38;5;39m INFO[0m][30-Jun-24 17:07:40] Total gradient norm stats for 262 steps: 0.3367 <= 0.3449 + 0.005552z <= 0.4043 |
|
[[38;5;39m INFO[0m][30-Jun-24 17:07:40] Trained chunk 382 in 478.4s at 4486noun/s: lr=5.27e-05, loss=1.29e+00, top1=72.22%/71.460% |
|
[[38;5;39m INFO[0m][30-Jun-24 17:07:40] Chunk 383 = Batch 1601345 = Sample 819888129 |
|
[[38;5;39m INFO[0m][30-Jun-24 17:15:38] Total gradient norm stats for 262 steps: 0.3356 <= 0.3448 + 0.004364z <= 0.3736 |
|
[[38;5;39m INFO[0m][30-Jun-24 17:15:38] Trained chunk 383 in 477.7s at 4493noun/s: lr=5.08e-05, loss=1.29e+00, top1=71.78%/71.469% |
|
[[38;5;39m INFO[0m][30-Jun-24 17:15:38] Chunk 384 = Batch 1605537 = Sample 822034433 |
|
[[38;5;39m INFO[0m][30-Jun-24 17:23:36] Total gradient norm stats for 262 steps: 0.3363 <= 0.3448 + 0.003883z <= 0.359 |
|
[[38;5;39m INFO[0m][30-Jun-24 17:23:36] Trained chunk 384 in 477.9s at 4491noun/s: lr=4.88e-05, loss=1.28e+00, top1=71.00%/71.478% |
|
[[38;5;39m INFO[0m][30-Jun-24 17:23:36] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240628_142131/ovod_chunk0384_20240630_172336.train |
|
[[38;5;39m INFO[0m][30-Jun-24 17:23:36] Chunk 385 = Batch 1609729 = Sample 824180737 |
|
[[38;5;39m INFO[0m][30-Jun-24 17:31:35] Total gradient norm stats for 262 steps: 0.3354 <= 0.3457 + 0.004679z <= 0.3773 |
|
[[38;5;39m INFO[0m][30-Jun-24 17:31:35] Trained chunk 385 in 478.6s at 4485noun/s: lr=4.69e-05, loss=1.28e+00, top1=70.80%/71.487% |
|
[[38;5;39m INFO[0m][30-Jun-24 17:31:35] Chunk 386 = Batch 1613921 = Sample 826327041 |
|
[[38;5;39m INFO[0m][30-Jun-24 17:35:48] Epoch 16 finished in 11523.6s |
|
[[38;5;39m INFO[0m][30-Jun-24 17:35:48] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][30-Jun-24 17:35:48] Epoch 17 = Batch 1616129 = Sample 827457537 |
|
[[38;5;39m INFO[0m][30-Jun-24 17:39:35] Total gradient norm stats for 262 steps: 0.3373 <= 0.3462 + 0.01106z <= 0.5153 |
|
[[38;5;39m INFO[0m][30-Jun-24 17:39:35] Trained chunk 386 in 480.8s at 4464noun/s: lr=4.50e-05, loss=1.28e+00, top1=72.24%/71.497% |
|
[[38;5;39m INFO[0m][30-Jun-24 17:39:35] Chunk 387 = Batch 1618113 = Sample 828473345 |
|
[[38;5;39m INFO[0m][30-Jun-24 17:47:35] Total gradient norm stats for 262 steps: 0.3379 <= 0.346 + 0.004237z <= 0.3627 |
|
[[38;5;39m INFO[0m][30-Jun-24 17:47:35] Trained chunk 387 in 479.1s at 4480noun/s: lr=4.32e-05, loss=1.28e+00, top1=70.21%/71.503% |
|
[[38;5;39m INFO[0m][30-Jun-24 17:47:35] Chunk 388 = Batch 1622305 = Sample 830619649 |
|
[[38;5;39m INFO[0m][30-Jun-24 17:55:34] Total gradient norm stats for 262 steps: 0.3373 <= 0.3464 + 0.007773z <= 0.4551 |
|
[[38;5;39m INFO[0m][30-Jun-24 17:55:34] Trained chunk 388 in 479.2s at 4479noun/s: lr=4.14e-05, loss=1.28e+00, top1=70.89%/71.505% |
|
[[38;5;39m INFO[0m][30-Jun-24 17:55:34] Chunk 389 = Batch 1626497 = Sample 832765953 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:03:32] Total gradient norm stats for 262 steps: 0.3365 <= 0.3477 + 0.01598z <= 0.5948 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:03:32] Trained chunk 389 in 478.4s at 4487noun/s: lr=3.96e-05, loss=1.28e+00, top1=70.41%/71.513% |
|
[[38;5;39m INFO[0m][30-Jun-24 18:03:32] Chunk 390 = Batch 1630689 = Sample 834912257 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:11:31] Total gradient norm stats for 262 steps: 0.3366 <= 0.3471 + 0.007612z <= 0.4344 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:11:31] Trained chunk 390 in 478.9s at 4482noun/s: lr=3.79e-05, loss=1.28e+00, top1=72.05%/71.523% |
|
[[38;5;39m INFO[0m][30-Jun-24 18:11:31] Chunk 391 = Batch 1634881 = Sample 837058561 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:19:30] Total gradient norm stats for 262 steps: 0.3345 <= 0.3492 + 0.03449z <= 0.9017 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:19:30] Trained chunk 391 in 478.5s at 4486noun/s: lr=3.62e-05, loss=1.28e+00, top1=71.71%/71.530% |
|
[[38;5;39m INFO[0m][30-Jun-24 18:19:30] Chunk 392 = Batch 1639073 = Sample 839204865 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:27:28] Total gradient norm stats for 262 steps: 0.3368 <= 0.3466 + 0.003455z <= 0.3597 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:27:28] Trained chunk 392 in 478.7s at 4483noun/s: lr=3.46e-05, loss=1.28e+00, top1=72.01%/71.536% |
|
[[38;5;39m INFO[0m][30-Jun-24 18:27:28] Chunk 393 = Batch 1643265 = Sample 841351169 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:35:27] Total gradient norm stats for 262 steps: 0.3398 <= 0.3471 + 0.006001z <= 0.4243 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:35:27] Trained chunk 393 in 479.2s at 4479noun/s: lr=3.29e-05, loss=1.28e+00, top1=70.16%/71.541% |
|
[[38;5;39m INFO[0m][30-Jun-24 18:35:27] Chunk 394 = Batch 1647457 = Sample 843497473 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:43:27] Total gradient norm stats for 262 steps: 0.3376 <= 0.3473 + 0.00511z <= 0.3952 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:43:27] Trained chunk 394 in 479.2s at 4479noun/s: lr=3.14e-05, loss=1.28e+00, top1=70.26%/71.549% |
|
[[38;5;39m INFO[0m][30-Jun-24 18:43:27] Chunk 395 = Batch 1651649 = Sample 845643777 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:51:25] Total gradient norm stats for 262 steps: 0.3385 <= 0.3474 + 0.004257z <= 0.3873 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:51:25] Trained chunk 395 in 478.7s at 4484noun/s: lr=2.98e-05, loss=1.28e+00, top1=71.51%/71.555% |
|
[[38;5;39m INFO[0m][30-Jun-24 18:51:25] Chunk 396 = Batch 1655841 = Sample 847790081 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:59:24] Total gradient norm stats for 262 steps: 0.3399 <= 0.3498 + 0.03563z <= 0.9213 |
|
[[38;5;39m INFO[0m][30-Jun-24 18:59:24] Trained chunk 396 in 478.7s at 4484noun/s: lr=2.83e-05, loss=1.28e+00, top1=70.36%/71.561% |
|
[[38;5;39m INFO[0m][30-Jun-24 18:59:24] Chunk 397 = Batch 1660033 = Sample 849936385 |
|
[[38;5;39m INFO[0m][30-Jun-24 19:07:22] Total gradient norm stats for 262 steps: 0.3377 <= 0.3483 + 0.006455z <= 0.4072 |
|
[[38;5;39m INFO[0m][30-Jun-24 19:07:22] Trained chunk 397 in 477.8s at 4492noun/s: lr=2.69e-05, loss=1.28e+00, top1=71.83%/71.566% |
|
[[38;5;39m INFO[0m][30-Jun-24 19:07:22] Chunk 398 = Batch 1664225 = Sample 852082689 |
|
[[38;5;39m INFO[0m][30-Jun-24 19:15:20] Total gradient norm stats for 262 steps: 0.341 <= 0.3493 + 0.02511z <= 0.751 |
|
[[38;5;39m INFO[0m][30-Jun-24 19:15:20] Trained chunk 398 in 478.1s at 4489noun/s: lr=2.54e-05, loss=1.28e+00, top1=71.95%/71.570% |
|
[[38;5;39m INFO[0m][30-Jun-24 19:15:20] Chunk 399 = Batch 1668417 = Sample 854228993 |
|
[[38;5;39m INFO[0m][30-Jun-24 19:23:17] Total gradient norm stats for 262 steps: 0.338 <= 0.3477 + 0.003791z <= 0.3708 |
|
[[38;5;39m INFO[0m][30-Jun-24 19:23:17] Trained chunk 399 in 477.4s at 4496noun/s: lr=2.41e-05, loss=1.28e+00, top1=71.24%/71.575% |
|
[[38;5;39m INFO[0m][30-Jun-24 19:23:17] Chunk 400 = Batch 1672609 = Sample 856375297 |
|
[[38;5;39m INFO[0m][30-Jun-24 19:31:15] Total gradient norm stats for 262 steps: 0.3373 <= 0.348 + 0.003845z <= 0.365 |
|
[[38;5;39m INFO[0m][30-Jun-24 19:31:15] Trained chunk 400 in 477.9s at 4491noun/s: lr=2.27e-05, loss=1.28e+00, top1=70.70%/71.579% |
|
[[38;5;39m INFO[0m][30-Jun-24 19:31:15] Chunk 401 = Batch 1676801 = Sample 858521601 |
|
[[38;5;39m INFO[0m][30-Jun-24 19:39:13] Total gradient norm stats for 262 steps: 0.339 <= 0.3483 + 0.00375z <= 0.3688 |
|
[[38;5;39m INFO[0m][30-Jun-24 19:39:13] Trained chunk 401 in 478.2s at 4488noun/s: lr=2.14e-05, loss=1.28e+00, top1=70.26%/71.585% |
|
[[38;5;39m INFO[0m][30-Jun-24 19:39:13] Chunk 402 = Batch 1680993 = Sample 860667905 |
|
[[38;5;39m INFO[0m][30-Jun-24 19:47:11] Total gradient norm stats for 262 steps: 0.338 <= 0.3484 + 0.004476z <= 0.3793 |
|
[[38;5;39m INFO[0m][30-Jun-24 19:47:11] Trained chunk 402 in 477.7s at 4493noun/s: lr=2.01e-05, loss=1.28e+00, top1=72.04%/71.592% |
|
[[38;5;39m INFO[0m][30-Jun-24 19:47:11] Chunk 403 = Batch 1685185 = Sample 862814209 |
|
[[38;5;39m INFO[0m][30-Jun-24 19:55:09] Total gradient norm stats for 262 steps: 0.3407 <= 0.3481 + 0.003438z <= 0.3593 |
|
[[38;5;39m INFO[0m][30-Jun-24 19:55:09] Trained chunk 403 in 478.1s at 4489noun/s: lr=1.89e-05, loss=1.28e+00, top1=72.06%/71.596% |
|
[[38;5;39m INFO[0m][30-Jun-24 19:55:09] Chunk 404 = Batch 1689377 = Sample 864960513 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:03:07] Total gradient norm stats for 262 steps: 0.3403 <= 0.3483 + 0.003413z <= 0.36 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:03:07] Trained chunk 404 in 477.3s at 4497noun/s: lr=1.77e-05, loss=1.28e+00, top1=72.50%/71.601% |
|
[[38;5;39m INFO[0m][30-Jun-24 20:03:07] Chunk 405 = Batch 1693569 = Sample 867106817 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:11:05] Total gradient norm stats for 262 steps: 0.3411 <= 0.3488 + 0.007407z <= 0.4416 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:11:05] Trained chunk 405 in 478.1s at 4490noun/s: lr=1.65e-05, loss=1.28e+00, top1=71.28%/71.606% |
|
[[38;5;39m INFO[0m][30-Jun-24 20:11:05] Chunk 406 = Batch 1697761 = Sample 869253121 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:19:03] Total gradient norm stats for 262 steps: 0.3404 <= 0.3489 + 0.004071z <= 0.3742 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:19:03] Trained chunk 406 in 478.0s at 4490noun/s: lr=1.54e-05, loss=1.28e+00, top1=71.53%/71.608% |
|
[[38;5;39m INFO[0m][30-Jun-24 20:19:03] Chunk 407 = Batch 1701953 = Sample 871399425 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:27:00] Total gradient norm stats for 262 steps: 0.3408 <= 0.3543 + 0.09086z <= 1.818 (clipped to 1) |
|
[[38;5;39m INFO[0m][30-Jun-24 20:27:00] Trained chunk 407 in 477.1s at 4499noun/s: lr=1.43e-05, loss=1.28e+00, top1=72.02%/71.610% |
|
[[38;5;39m INFO[0m][30-Jun-24 20:27:00] Chunk 408 = Batch 1706145 = Sample 873545729 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:34:57] Total gradient norm stats for 262 steps: 0.3394 <= 0.3487 + 0.006126z <= 0.4302 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:34:57] Trained chunk 408 in 477.4s at 4496noun/s: lr=1.33e-05, loss=1.28e+00, top1=71.16%/71.612% |
|
[[38;5;39m INFO[0m][30-Jun-24 20:34:57] Chunk 409 = Batch 1710337 = Sample 875692033 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:42:55] Total gradient norm stats for 262 steps: 0.3408 <= 0.3493 + 0.006479z <= 0.4138 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:42:55] Trained chunk 409 in 477.5s at 4495noun/s: lr=1.23e-05, loss=1.28e+00, top1=72.85%/71.615% |
|
[[38;5;39m INFO[0m][30-Jun-24 20:42:55] Chunk 410 = Batch 1714529 = Sample 877838337 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:47:54] Epoch 17 finished in 11525.2s |
|
[[38;5;39m INFO[0m][30-Jun-24 20:47:54] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][30-Jun-24 20:47:54] Epoch 18 = Batch 1717137 = Sample 879173633 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:50:54] Total gradient norm stats for 262 steps: 0.3415 <= 0.3489 + 0.00389z <= 0.3697 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:50:54] Trained chunk 410 in 479.3s at 4478noun/s: lr=1.13e-05, loss=1.28e+00, top1=71.70%/71.619% |
|
[[38;5;39m INFO[0m][30-Jun-24 20:50:54] Chunk 411 = Batch 1718721 = Sample 879984641 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:58:53] Total gradient norm stats for 262 steps: 0.3397 <= 0.3486 + 0.004093z <= 0.3748 |
|
[[38;5;39m INFO[0m][30-Jun-24 20:58:53] Trained chunk 411 in 479.0s at 4481noun/s: lr=1.04e-05, loss=1.28e+00, top1=70.65%/71.625% |
|
[[38;5;39m INFO[0m][30-Jun-24 20:58:53] Chunk 412 = Batch 1722913 = Sample 882130945 |
|
[[38;5;39m INFO[0m][30-Jun-24 21:06:51] Total gradient norm stats for 262 steps: 0.3394 <= 0.3486 + 0.003381z <= 0.3581 |
|
[[38;5;39m INFO[0m][30-Jun-24 21:06:51] Trained chunk 412 in 478.2s at 4489noun/s: lr=9.53e-06, loss=1.28e+00, top1=72.55%/71.629% |
|
[[38;5;39m INFO[0m][30-Jun-24 21:06:51] Chunk 413 = Batch 1727105 = Sample 884277249 |
|
[[38;5;39m INFO[0m][30-Jun-24 21:14:49] Total gradient norm stats for 262 steps: 0.3387 <= 0.3493 + 0.004778z <= 0.3897 |
|
[[38;5;39m INFO[0m][30-Jun-24 21:14:49] Trained chunk 413 in 477.8s at 4492noun/s: lr=8.69e-06, loss=1.28e+00, top1=71.76%/71.634% |
|
[[38;5;39m INFO[0m][30-Jun-24 21:14:49] Chunk 414 = Batch 1731297 = Sample 886423553 |
|
[[38;5;39m INFO[0m][30-Jun-24 21:22:48] Total gradient norm stats for 262 steps: 0.3416 <= 0.3489 + 0.003638z <= 0.3617 |
|
[[38;5;39m INFO[0m][30-Jun-24 21:22:48] Trained chunk 414 in 479.3s at 4478noun/s: lr=7.88e-06, loss=1.28e+00, top1=73.43%/71.637% |
|
[[38;5;39m INFO[0m][30-Jun-24 21:22:48] Chunk 415 = Batch 1735489 = Sample 888569857 |
|
[[38;5;39m INFO[0m][30-Jun-24 21:30:47] Total gradient norm stats for 262 steps: 0.3401 <= 0.349 + 0.004144z <= 0.3902 |
|
[[38;5;39m INFO[0m][30-Jun-24 21:30:47] Trained chunk 415 in 478.8s at 4483noun/s: lr=7.11e-06, loss=1.28e+00, top1=71.16%/71.640% |
|
[[38;5;39m INFO[0m][30-Jun-24 21:30:47] Chunk 416 = Batch 1739681 = Sample 890716161 |
|
[[38;5;39m INFO[0m][30-Jun-24 21:38:46] Total gradient norm stats for 262 steps: 0.3409 <= 0.3492 + 0.004167z <= 0.3716 |
|
[[38;5;39m INFO[0m][30-Jun-24 21:38:46] Trained chunk 416 in 478.8s at 4482noun/s: lr=6.39e-06, loss=1.28e+00, top1=71.63%/71.643% |
|
[[38;5;39m INFO[0m][30-Jun-24 21:38:46] Chunk 417 = Batch 1743873 = Sample 892862465 |
|
[[38;5;39m INFO[0m][30-Jun-24 21:46:44] Total gradient norm stats for 262 steps: 0.3414 <= 0.349 + 0.003658z <= 0.3721 |
|
[[38;5;39m INFO[0m][30-Jun-24 21:46:44] Trained chunk 417 in 478.1s at 4489noun/s: lr=5.70e-06, loss=1.28e+00, top1=72.23%/71.645% |
|
[[38;5;39m INFO[0m][30-Jun-24 21:46:44] Chunk 418 = Batch 1748065 = Sample 895008769 |
|
[[38;5;39m INFO[0m][30-Jun-24 21:54:43] Total gradient norm stats for 262 steps: 0.3409 <= 0.3492 + 0.003586z <= 0.361 |
|
[[38;5;39m INFO[0m][30-Jun-24 21:54:43] Trained chunk 418 in 478.7s at 4484noun/s: lr=5.05e-06, loss=1.27e+00, top1=71.79%/71.647% |
|
[[38;5;39m INFO[0m][30-Jun-24 21:54:43] Chunk 419 = Batch 1752257 = Sample 897155073 |
|
[[38;5;39m INFO[0m][30-Jun-24 22:02:40] Total gradient norm stats for 262 steps: 0.3415 <= 0.3493 + 0.003922z <= 0.3822 |
|
[[38;5;39m INFO[0m][30-Jun-24 22:02:40] Trained chunk 419 in 477.8s at 4492noun/s: lr=4.44e-06, loss=1.28e+00, top1=71.78%/71.647% |
|
[[38;5;39m INFO[0m][30-Jun-24 22:02:40] Chunk 420 = Batch 1756449 = Sample 899301377 |
|
[[38;5;39m INFO[0m][30-Jun-24 22:10:39] Total gradient norm stats for 262 steps: 0.3397 <= 0.3535 + 0.068z <= 1.448 (clipped to 1) |
|
[[38;5;39m INFO[0m][30-Jun-24 22:10:39] Trained chunk 420 in 478.3s at 4487noun/s: lr=3.87e-06, loss=1.27e+00, top1=71.50%/71.651% |
|
[[38;5;39m INFO[0m][30-Jun-24 22:10:39] Chunk 421 = Batch 1760641 = Sample 901447681 |
|
[[38;5;39m INFO[0m][30-Jun-24 22:18:37] Total gradient norm stats for 262 steps: 0.3407 <= 0.349 + 0.00387z <= 0.3648 |
|
[[38;5;39m INFO[0m][30-Jun-24 22:18:37] Trained chunk 421 in 478.1s at 4489noun/s: lr=3.33e-06, loss=1.27e+00, top1=70.64%/71.655% |
|
[[38;5;39m INFO[0m][30-Jun-24 22:18:37] Chunk 422 = Batch 1764833 = Sample 903593985 |
|
[[38;5;39m INFO[0m][30-Jun-24 22:26:34] Total gradient norm stats for 262 steps: 0.3407 <= 0.3494 + 0.004609z <= 0.3882 |
|
[[38;5;39m INFO[0m][30-Jun-24 22:26:34] Trained chunk 422 in 477.1s at 4498noun/s: lr=2.84e-06, loss=1.27e+00, top1=73.77%/71.655% |
|
[[38;5;39m INFO[0m][30-Jun-24 22:26:34] Chunk 423 = Batch 1769025 = Sample 905740289 |
|
[[38;5;39m INFO[0m][30-Jun-24 22:34:33] Total gradient norm stats for 262 steps: 0.341 <= 0.3499 + 0.01119z <= 0.5177 |
|
[[38;5;39m INFO[0m][30-Jun-24 22:34:33] Trained chunk 423 in 478.6s at 4485noun/s: lr=2.39e-06, loss=1.27e+00, top1=71.88%/71.657% |
|
[[38;5;39m INFO[0m][30-Jun-24 22:34:33] Chunk 424 = Batch 1773217 = Sample 907886593 |
|
[[38;5;39m INFO[0m][30-Jun-24 22:42:31] Total gradient norm stats for 262 steps: 0.3409 <= 0.3493 + 0.004708z <= 0.3862 |
|
[[38;5;39m INFO[0m][30-Jun-24 22:42:31] Trained chunk 424 in 478.1s at 4490noun/s: lr=1.97e-06, loss=1.27e+00, top1=72.51%/71.657% |
|
[[38;5;39m INFO[0m][30-Jun-24 22:42:31] Chunk 425 = Batch 1777409 = Sample 910032897 |
|
[[38;5;39m INFO[0m][30-Jun-24 22:50:29] Total gradient norm stats for 262 steps: 0.3399 <= 0.3489 + 0.003436z <= 0.3593 |
|
[[38;5;39m INFO[0m][30-Jun-24 22:50:29] Trained chunk 425 in 478.6s at 4484noun/s: lr=1.60e-06, loss=1.27e+00, top1=70.98%/71.659% |
|
[[38;5;39m INFO[0m][30-Jun-24 22:50:29] Chunk 426 = Batch 1781601 = Sample 912179201 |
|
[[38;5;39m INFO[0m][30-Jun-24 22:58:27] Total gradient norm stats for 262 steps: 0.3403 <= 0.3491 + 0.00582z <= 0.4237 |
|
[[38;5;39m INFO[0m][30-Jun-24 22:58:27] Trained chunk 426 in 477.4s at 4496noun/s: lr=1.26e-06, loss=1.27e+00, top1=72.27%/71.661% |
|
[[38;5;39m INFO[0m][30-Jun-24 22:58:27] Chunk 427 = Batch 1785793 = Sample 914325505 |
|
[[38;5;39m INFO[0m][30-Jun-24 23:06:25] Total gradient norm stats for 262 steps: 0.3404 <= 0.3493 + 0.003309z <= 0.3604 |
|
[[38;5;39m INFO[0m][30-Jun-24 23:06:25] Trained chunk 427 in 478.2s at 4489noun/s: lr=9.67e-07, loss=1.27e+00, top1=71.14%/71.663% |
|
[[38;5;39m INFO[0m][30-Jun-24 23:06:25] Chunk 428 = Batch 1789985 = Sample 916471809 |
|
[[38;5;39m INFO[0m][30-Jun-24 23:14:23] Total gradient norm stats for 262 steps: 0.34 <= 0.3492 + 0.004893z <= 0.3989 |
|
[[38;5;39m INFO[0m][30-Jun-24 23:14:23] Trained chunk 428 in 478.3s at 4488noun/s: lr=7.11e-07, loss=1.27e+00, top1=72.49%/71.666% |
|
[[38;5;39m INFO[0m][30-Jun-24 23:14:23] Chunk 429 = Batch 1794177 = Sample 918618113 |
|
[[38;5;39m INFO[0m][30-Jun-24 23:22:22] Total gradient norm stats for 262 steps: 0.341 <= 0.3491 + 0.003458z <= 0.3652 |
|
[[38;5;39m INFO[0m][30-Jun-24 23:22:22] Trained chunk 429 in 478.6s at 4485noun/s: lr=4.93e-07, loss=1.27e+00, top1=70.68%/71.669% |
|
[[38;5;39m INFO[0m][30-Jun-24 23:22:22] Chunk 430 = Batch 1798369 = Sample 920764417 |
|
[[38;5;39m INFO[0m][30-Jun-24 23:30:20] Total gradient norm stats for 262 steps: 0.341 <= 0.3492 + 0.003341z <= 0.3591 |
|
[[38;5;39m INFO[0m][30-Jun-24 23:30:20] Trained chunk 430 in 478.5s at 4486noun/s: lr=3.16e-07, loss=1.27e+00, top1=71.44%/71.668% |
|
[[38;5;39m INFO[0m][30-Jun-24 23:30:20] Chunk 431 = Batch 1802561 = Sample 922910721 |
|
[[38;5;39m INFO[0m][30-Jun-24 23:38:18] Total gradient norm stats for 262 steps: 0.341 <= 0.3505 + 0.01715z <= 0.621 |
|
[[38;5;39m INFO[0m][30-Jun-24 23:38:18] Trained chunk 431 in 478.2s at 4489noun/s: lr=1.78e-07, loss=1.27e+00, top1=72.18%/71.666% |
|
[[38;5;39m INFO[0m][30-Jun-24 23:38:18] Chunk 432 = Batch 1806753 = Sample 925057025 |
|
[[38;5;39m INFO[0m][30-Jun-24 23:46:17] Total gradient norm stats for 262 steps: 0.3403 <= 0.349 + 0.003484z <= 0.3614 |
|
[[38;5;39m INFO[0m][30-Jun-24 23:46:17] Trained chunk 432 in 478.0s at 4490noun/s: lr=7.90e-08, loss=1.27e+00, top1=72.95%/71.665% |
|
[[38;5;39m INFO[0m][30-Jun-24 23:46:17] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240628_142131/ovod_chunk0432_20240630_234617.train |
|
[[38;5;39m INFO[0m][30-Jun-24 23:46:17] Chunk 433 = Batch 1810945 = Sample 927203329 |
|
[[38;5;39m INFO[0m][30-Jun-24 23:54:15] Total gradient norm stats for 262 steps: 0.3403 <= 0.3497 + 0.005913z <= 0.4232 |
|
[[38;5;39m INFO[0m][30-Jun-24 23:54:15] Trained chunk 433 in 478.4s at 4486noun/s: lr=1.97e-08, loss=1.27e+00, top1=71.90%/71.668% |
|
[[38;5;39m INFO[0m][30-Jun-24 23:54:16] Saved checkpoint: /data/strahl/Code/ovod/outputs/ovod_20240628_142131/ovod_chunk0433_20240630_235415.train |
|
[[38;5;39m INFO[0m][30-Jun-24 23:54:17] -------------------------------------------------------------------------------- |
|
[[38;5;39m INFO[0m][30-Jun-24 23:54:17] Trained for 433 chunks (up to 18 epochs) in 207162.0s |
|
[[38;5;39m INFO[0m][30-Jun-24 23:54:17] Trained 1815136 batches = 929349632 samples |
|
[[38;5;39m INFO[0m][30-Jun-24 23:54:17] Unloaded and un-memory-mapped cache |
|
|