davidfan97 tsbpp meta-bot committed on
Commit
d3bf033
·
verified ·
0 Parent(s):

Initial commit

Browse files

Co-authored-by: tsbpp <tsbpp@users.noreply.huggingface.co>
Co-authored-by: meta-bot <meta-bot@users.noreply.huggingface.co>

Files changed (6) hide show
  1. .gitattributes +36 -0
  2. README.md +51 -0
  3. config.json +77 -0
  4. model.safetensors +3 -0
  5. preprocessor_config.json +27 -0
  6. webssl_teaser.png +3 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ webssl_teaser.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: cc-by-nc-4.0
4
+ inference: false
5
+ ---
6
+ # Web-SSL DINO ViT-1B: 2B MetaCLIP data, 224 Resolution
7
+ A 1 billion parameter Vision Transformer (ViT) trained with DINOv2 self-supervised learning on web-scale image data without language supervision. Introduced in ["Scaling Language-Free Visual Representation Learning"](https://arxiv.org/abs/2504.01017) (Fan et al., 2025).
8
+
9
+ ## Model Details
10
+ - **Architecture**: ViT (1536 width, 40 depth, 24 heads)
11
+ - **Parameters**: 1B
12
+ - **Resolution**: 224×224 pixels
13
+ - **Training**: Self-supervised Web-DINO on 2B image samples from MetaCLIP web data
14
+
15
+ ## Model Description
16
+ Web-SSL DINO 1B is a 1 billion parameter Vision Transformer model trained using self-supervised learning on 2 billion web images without language supervision. This model demonstrates that pure visual learning, when scaled appropriately, can match or exceed the performance of language-supervised models like CLIP across various vision tasks.
17
+
18
+ <img src="webssl_teaser.png" alt="WebSSL Model Overview" width="600">
19
+
20
+ ## Usage
21
+ ```python
22
+ from transformers import AutoImageProcessor, Dinov2Model
23
+ import torch
24
+ from PIL import Image
25
+
26
+ processor = AutoImageProcessor.from_pretrained('facebook/webssl-dino1b-full2b-224')
27
+ # 'eager' and 'sdpa' attn_implementation supported
28
+ model = Dinov2Model.from_pretrained('facebook/webssl-dino1b-full2b-224')
29
+
30
+ # Process an image
31
+ image = Image.open('path/to/image.jpg')
32
+ inputs = processor(images=image, return_tensors="pt")
33
+ with torch.no_grad():
34
+     outputs = model(**inputs)
35
+
36
+ cls_features = outputs.last_hidden_state[:, 0] # CLS token features
37
+ patch_features = outputs.last_hidden_state[:, 1:] # patch-wise token features
38
+ ```
39
+
40
+ ## Citation
41
+
42
+ ```bibtex
43
+ @article{fan2025scaling,
44
+ title={Scaling Language-Free Visual Representation Learning},
45
+ author={David Fan and Shengbang Tong and Jiachen Zhu and Koustuv Sinha and Zhuang Liu and Xinlei Chen and Michael Rabbat and Nicolas Ballas and Yann LeCun and Amir Bar and Saining Xie},
46
+ year={2025},
47
+ eprint={2504.01017},
48
+ archivePrefix={arXiv},
49
+ primaryClass={cs.CV}
50
+ }
51
+ ```
config.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "apply_layernorm": true,
3
+ "architectures": [
4
+ "Dinov2Model"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "block_chunks": 4,
8
+ "drop_path_rate": 0.0,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.0,
11
+ "hidden_size": 1536,
12
+ "image_size": 224,
13
+ "initializer_range": 0.02,
14
+ "layer_norm_eps": 1e-06,
15
+ "layerscale_value": 1.0,
16
+ "mlp_ratio": 4,
17
+ "model_type": "dinov2",
18
+ "num_attention_heads": 24,
19
+ "num_channels": 3,
20
+ "num_hidden_layers": 40,
21
+ "num_layers_per_block": 10,
22
+ "out_features": [
23
+ "stage40"
24
+ ],
25
+ "out_indices": [
26
+ 40
27
+ ],
28
+ "patch_size": 14,
29
+ "qkv_bias": true,
30
+ "reshape_hidden_states": true,
31
+ "stage_names": [
32
+ "stem",
33
+ "stage1",
34
+ "stage2",
35
+ "stage3",
36
+ "stage4",
37
+ "stage5",
38
+ "stage6",
39
+ "stage7",
40
+ "stage8",
41
+ "stage9",
42
+ "stage10",
43
+ "stage11",
44
+ "stage12",
45
+ "stage13",
46
+ "stage14",
47
+ "stage15",
48
+ "stage16",
49
+ "stage17",
50
+ "stage18",
51
+ "stage19",
52
+ "stage20",
53
+ "stage21",
54
+ "stage22",
55
+ "stage23",
56
+ "stage24",
57
+ "stage25",
58
+ "stage26",
59
+ "stage27",
60
+ "stage28",
61
+ "stage29",
62
+ "stage30",
63
+ "stage31",
64
+ "stage32",
65
+ "stage33",
66
+ "stage34",
67
+ "stage35",
68
+ "stage36",
69
+ "stage37",
70
+ "stage38",
71
+ "stage39",
72
+ "stage40"
73
+ ],
74
+ "torch_dtype": "float32",
75
+ "transformers_version": "4.46.2",
76
+ "use_swiglu_ffn": true
77
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df2b9b861736e630d0eb223c6d8809a94fa6ca54206c05634b56f3dbd9bf607d
3
+ size 4539167152
preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.485,
13
+ 0.456,
14
+ 0.406
15
+ ],
16
+ "image_processor_type": "BitImageProcessor",
17
+ "image_std": [
18
+ 0.229,
19
+ 0.224,
20
+ 0.225
21
+ ],
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "shortest_edge": 224
26
+ }
27
+ }
webssl_teaser.png ADDED

Git LFS Details

  • SHA256: 06f6b5568bd4bdf00a3d249329ebab11023e475eb30c9249da61d486fc039fe5
  • Pointer size: 131 Bytes
  • Size of remote file: 371 kB