{
  "architectures": [
    "VFMMultiFrameTransformer"
  ],
  "chosen_layers": [
    4,
    11,
    17,
    23
  ],
  "ffn_layer": "mlp",
  "geometry_aggregator": false,
  "geometry_aggregator_layer": 6,
  "grounding_ratio": 0.5,
  "hidden_act": "gelu",
  "hidden_size": 1024,
  "image_aggregator": false,
  "image_aggregator_layer": 6,
  "image_size": 224,
  "image_ssl": {
    "compute_precision": {
      "sharding_strategy": "SHARD_GRAD_OP"
    },
    "crops": {
      "local_crops_number": 2
    },
    "dino": {
      "force_weight_norm": false,
      "global_ignore_diagonal": true,
      "head_bottleneck_dim": 256,
      "head_hidden_dim": 2048,
      "head_n_prototypes": 65536,
      "head_nlayers": 3,
      "head_norm_last_layer": false,
      "koleo_distributed_replicas": 0,
      "koleo_loss_distributed": false,
      "koleo_loss_weight": 0.1,
      "koleo_topk": 1,
      "local_loss_weight_schedule": {
        "end": 0.5,
        "peak": 0.5,
        "start": 0.5,
        "warmup_epochs": 0
      },
      "loss_weight": 1.0,
      "reweight_dino_local_loss": false
    },
    "distillation": {
      "checkpoint_path": "",
      "enabled": false,
      "full_cfg_path": ""
    },
    "gram": {
      "ckpt": null,
      "compute_stats": false,
      "ema_teacher": false,
      "global_teacher_resize_antialias": false,
      "global_teacher_resize_method": "bicubic",
      "img_level": true,
      "it_first_update": 0,
      "it_load_ema_teacher": -1,
      "loss_weight": 1.0,
      "loss_weight_schedule": null,
      "max_updates": null,
      "normalized": true,
      "remove_neg": false,
      "remove_only_teacher_neg": false,
      "rep_update": true,
      "tokens_used": "all",
      "update_frequency": 50000,
      "use_loss": true
    },
    "ibot": {
      "force_masking_even_with_zero_weight": false,
      "head_bottleneck_dim": 256,
      "head_hidden_dim": 2048,
      "head_n_prototypes": 65536,
      "head_nlayers": 3,
      "head_norm_last_layer": false,
      "loss_weight": 1.0,
      "mask_random_circular_shift": false,
      "mask_ratio_min_max": [
        0.1,
        0.5
      ],
      "mask_sample_probability": 0.5,
      "separate_head": true
    },
    "multidistillation": {
      "enabled": false
    },
    "train": {
      "centering": "sinkhorn_knopp"
    }
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-06,
  "mlp_ratio": 4.0,
  "mm_projector_type": "mlp2x_gelu",
  "model_type": "vfm",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_experts": 8,
  "num_frames": 16,
  "patch_embed_name": "dinov3_vitl16_torch",
  "patch_size": 16,
  "top_k": 2,
  "torch_dtype": "float32",
  "transformers_version": "4.52.3",
  "upcycle_to_moe": false,
  "video_aggregator": true,
  "video_aggregator_layer": 24
}
|