multimodalart (HF Staff) committed
Commit 15e3a3b · verified · Parent: 0d5ce11

Upload 719 files

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete list.

Files changed (50):
  1. .gitattributes +450 -0
  2. taming-transformers/License.txt +19 -0
  3. taming-transformers/README.md +410 -0
  4. taming-transformers/assets/birddrawnbyachild.png +3 -0
  5. taming-transformers/assets/coco_scene_images_training.svg +2574 -0
  6. taming-transformers/assets/drin.jpg +3 -0
  7. taming-transformers/assets/faceshq.jpg +3 -0
  8. taming-transformers/assets/first_stage_mushrooms.png +3 -0
  9. taming-transformers/assets/first_stage_squirrels.png +3 -0
  10. taming-transformers/assets/imagenet.png +3 -0
  11. taming-transformers/assets/lake_in_the_mountains.png +3 -0
  12. taming-transformers/assets/mountain.jpeg +3 -0
  13. taming-transformers/assets/scene_images_samples.svg +0 -0
  14. taming-transformers/assets/stormy.jpeg +3 -0
  15. taming-transformers/assets/sunset_and_ocean.jpg +3 -0
  16. taming-transformers/assets/teaser.png +3 -0
  17. taming-transformers/configs/coco_cond_stage.yaml +49 -0
  18. taming-transformers/configs/coco_scene_images_transformer.yaml +80 -0
  19. taming-transformers/configs/custom_vqgan.yaml +43 -0
  20. taming-transformers/configs/drin_transformer.yaml +77 -0
  21. taming-transformers/configs/faceshq_transformer.yaml +61 -0
  22. taming-transformers/configs/faceshq_vqgan.yaml +42 -0
  23. taming-transformers/configs/imagenet_vqgan.yaml +42 -0
  24. taming-transformers/configs/imagenetdepth_vqgan.yaml +41 -0
  25. taming-transformers/configs/open_images_scene_images_transformer.yaml +86 -0
  26. taming-transformers/configs/sflckr_cond_stage.yaml +43 -0
  27. taming-transformers/data/ade20k_examples.txt +30 -0
  28. taming-transformers/data/ade20k_images/ADE_val_00000123.jpg +0 -0
  29. taming-transformers/data/ade20k_images/ADE_val_00000125.jpg +0 -0
  30. taming-transformers/data/ade20k_images/ADE_val_00000126.jpg +0 -0
  31. taming-transformers/data/ade20k_images/ADE_val_00000203.jpg +0 -0
  32. taming-transformers/data/ade20k_images/ADE_val_00000262.jpg +0 -0
  33. taming-transformers/data/ade20k_images/ADE_val_00000287.jpg +0 -0
  34. taming-transformers/data/ade20k_images/ADE_val_00000289.jpg +0 -0
  35. taming-transformers/data/ade20k_images/ADE_val_00000303.jpg +0 -0
  36. taming-transformers/data/ade20k_images/ADE_val_00000509.jpg +0 -0
  37. taming-transformers/data/ade20k_images/ADE_val_00000532.jpg +0 -0
  38. taming-transformers/data/ade20k_images/ADE_val_00000573.jpg +0 -0
  39. taming-transformers/data/ade20k_images/ADE_val_00000603.jpg +0 -0
  40. taming-transformers/data/ade20k_images/ADE_val_00000636.jpg +0 -0
  41. taming-transformers/data/ade20k_images/ADE_val_00000734.jpg +0 -0
  42. taming-transformers/data/ade20k_images/ADE_val_00000875.jpg +0 -0
  43. taming-transformers/data/ade20k_images/ADE_val_00000880.jpg +0 -0
  44. taming-transformers/data/ade20k_images/ADE_val_00001177.jpg +0 -0
  45. taming-transformers/data/ade20k_images/ADE_val_00001200.jpg +0 -0
  46. taming-transformers/data/ade20k_images/ADE_val_00001209.jpg +0 -0
  47. taming-transformers/data/ade20k_images/ADE_val_00001388.jpg +0 -0
  48. taming-transformers/data/ade20k_images/ADE_val_00001412.jpg +0 -0
  49. taming-transformers/data/ade20k_images/ADE_val_00001498.jpg +0 -0
  50. taming-transformers/data/ade20k_images/ADE_val_00001578.jpg +0 -0
.gitattributes ADDED
@@ -0,0 +1,450 @@
+ taming-transformers/assets/birddrawnbyachild.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/assets/drin.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/assets/faceshq.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/assets/first_stage_mushrooms.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/assets/first_stage_squirrels.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/assets/imagenet.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/assets/lake_in_the_mountains.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/assets/mountain.jpeg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/assets/stormy.jpeg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/assets/sunset_and_ocean.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/assets/teaser.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010005.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010014.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010015.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010023.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010024.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010037.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010039.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010040.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010041.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010046.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010056.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010058.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010069.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010073.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010077.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010082.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010083.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010084.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010094.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010097.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010104.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010114.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010115.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010123.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010125.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010130.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010136.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010138.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010142.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010145.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010149.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010161.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010166.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010175.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010176.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010179.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010192.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010196.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010211.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010216.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010217.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010219.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010229.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010230.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010232.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010239.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010241.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010244.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010245.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010248.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010249.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010256.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010263.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010275.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010276.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010281.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010290.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010303.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010318.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010319.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010321.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010324.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010327.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010337.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010342.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010343.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010346.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010358.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010369.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010386.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010393.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010395.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010400.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010403.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010405.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010407.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010414.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010420.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010421.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010428.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010430.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010432.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010434.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010442.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010445.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010449.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/train2017/000000010463.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000010092.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000010583.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000010707.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000010764.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000011122.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000011149.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000011197.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000011511.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000011615.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000011699.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000011760.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000012062.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000012120.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000012280.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000012576.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000012639.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000012670.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000012748.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000013004.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000013177.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000013201.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000013291.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000013348.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000013546.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000013659.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000013729.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000013774.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000013923.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000014007.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000014038.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000014226.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000014380.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000014439.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000014473.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000014831.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000014888.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000015079.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000015254.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000015272.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000015278.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000015335.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000015338.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000015440.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000015517.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000015597.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000015660.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000015746.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000015751.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000015956.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000016010.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000016228.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000016249.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000016439.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000016451.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000016598.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000016958.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000017029.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000017031.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000017115.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000017178.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000017182.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000017207.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000017379.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000017436.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000017627.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000017714.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000017899.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000017905.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000017959.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000018150.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000018193.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000018380.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000018491.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000018519.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000018575.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000018737.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000018837.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000019042.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000019109.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000019221.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000019402.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000019432.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000019924.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000020059.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000020107.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000020247.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_annotations_100/val2017/000000020333.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000018380.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000052507.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000057672.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000064898.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000110638.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000119445.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000128658.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000154358.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000166259.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000166563.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000185599.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000205834.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000231169.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000237928.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000255824.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000256775.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000303653.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000323895.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000335529.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000348045.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000348481.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000356347.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000361180.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000406997.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000491464.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000517069.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000522393.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/coco_images/000000569273.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n01795545/ILSVRC2012_val_00023344.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n01819313/ILSVRC2012_val_00003068.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n01820546/ILSVRC2012_val_00034784.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n01820546/ILSVRC2012_val_00047491.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n01828970/ILSVRC2012_val_00001336.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n01828970/ILSVRC2012_val_00008236.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n01828970/ILSVRC2012_val_00046802.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n01843065/ILSVRC2012_val_00022439.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n01847000/ILSVRC2012_val_00022364.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02085782/ILSVRC2012_val_00012298.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02086646/ILSVRC2012_val_00011473.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02088466/ILSVRC2012_val_00013651.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02089973/ILSVRC2012_val_00000028.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02093256/ILSVRC2012_val_00046547.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02096294/ILSVRC2012_val_00042133.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02099601/ILSVRC2012_val_00005697.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02099712/ILSVRC2012_val_00023471.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02100877/ILSVRC2012_val_00039863.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02101006/ILSVRC2012_val_00032333.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02101006/ILSVRC2012_val_00047325.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02101556/ILSVRC2012_val_00030540.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02102318/ILSVRC2012_val_00024691.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02105505/ILSVRC2012_val_00031252.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02110627/ILSVRC2012_val_00008310.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_depth/n02111889/ILSVRC2012_val_00042625.png filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n01795545/ILSVRC2012_val_00023344.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n01819313/ILSVRC2012_val_00003068.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n01820546/ILSVRC2012_val_00034784.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n01828970/ILSVRC2012_val_00001336.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n01828970/ILSVRC2012_val_00008236.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n01828970/ILSVRC2012_val_00046802.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n01843065/ILSVRC2012_val_00022439.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n01847000/ILSVRC2012_val_00022364.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n02086646/ILSVRC2012_val_00011473.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n02089973/ILSVRC2012_val_00000028.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n02096294/ILSVRC2012_val_00042133.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n02099601/ILSVRC2012_val_00005697.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n02100877/ILSVRC2012_val_00039863.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n02101006/ILSVRC2012_val_00032333.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n02101006/ILSVRC2012_val_00047325.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n02101556/ILSVRC2012_val_00030540.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n02102318/ILSVRC2012_val_00024691.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/drin_images/n02110627/ILSVRC2012_val_00008310.JPEG filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ab31e6be35fed.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ab7bec71cc50a.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ab8c20b3e5b58.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000abc075d659122.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000abe5eddc5b303.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ac34008b0ba4c.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ac8c676b6077a.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ac95750ac7399.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000acf666d991c39.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ad0ecfb21ee63.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ad20b5e452b24.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ad3d42653f5f6.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ad6c520be9ec5.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ad6fa67b5ad96.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000adcdd7244ce4a.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000adef7197e3118.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000adfe5b817011c.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ae235808cc1e8.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ae28755d2d20e.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000aecd78b230135.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000aee0af66d4237.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000af631fb329557.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b06c0eed42a4c.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b093da01e5bfe.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b09d5d3fc821f.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b0f5159f54105.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b168e791f591d.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b1971d8daaeef.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b1b3b85edd850.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b1b92f0800e94.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b260e1f08a32a.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b29496f75c8e5.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b299b5f5ed902.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b2b00065e564a.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b2d1789d5f80d.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b38d9f2f664fe.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b393437134262.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b3940e7d25c03.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b397382b2464a.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b42cae15622e0.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b432ae644b679.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b485cedacbf97.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b4935979bf4b5.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b4fcdf1af3361.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b50bdd1933a36.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b55559b0244d7.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b55e339f0b131.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b567c26dd4e5d.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b59a7822679e6.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b5bc07c0c5df7.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b606e130bdf5e.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b63a1445f53c8.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b65a36ad46f9e.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b70a84aab664b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b72e1446f8849.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b76a9b80ba43a.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b7dfaa1810a83.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b81b5757963e0.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b825dea3016eb.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b87119cc301cf.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b8d80f7386698.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b9007a01f7405.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b93644609911f.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b9814a07fd974.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b9a97776b3634.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b9b00d7aef8f5.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b9b61afea2cd4.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b9c365c9e307a.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b9d6c0f7d794d.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000b9f3ba4891c11.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ba221f70676c6.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ba28d70b1a999.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ba3ca8a2ca955.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000ba40bf7a2b458.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000baa6f7dae9b79.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000bab5b1a67844e.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000bb0ae453283b0.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000bb81adefe7332.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000bb8bd9b1bca65.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000bbdf0dc8099d8.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000bc1eb7f74adae.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000bc33717a6371f.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000bc387c731dd97.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000bc5006eb7fd98.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000bc5ad4cc3ae73.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000bc75d38907c78.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000bc7b0a1889bcb.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000bcd3bcd95cbb3.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/train/000bcee5bed5446b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09c67960e389e4df.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09c6ddd2c210450e.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09c7f89055cf399b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09c863d76bcf6b00.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09c993afacd01547.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09d2112596d9155b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09d354dbd3dcc857.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09d45c49c4adbae4.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09d64f43c7111879.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09d8aa2d19ff724d.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09dcb9b52055d40f.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09dd0671cd633432.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09df63bd01367ca3.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09e094375efab7fe.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09e617d9d3120b32.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09ebcee57699eb98.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09f8b77a88f224d9.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09f8e760f60df0da.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/09fa093bcd300c1a.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a02c648d24f39fb.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a08a4711c728078.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a13dcaaab9a35e0.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a1b11867383b13e.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a23d3f0e7d850f4.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a278d979b63fc72.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a2c6ef66896fb92.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a34d80ee1db201e.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a37aa0734ac8016.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a3873442ad329c2.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a39325e5ad7f5a0.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a3c01759e77a02d.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a3f577a327ca7cc.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a3f9b3d57ef354a.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a41cda5f44baaf6.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a47e7d602855f93.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a4abf0a8071b917.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a4db5693da70448.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a556c8163b58fae.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a563d05ebab4fe3.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a599940d33b6b2b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a600f1148d1023c.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a6a03c8f23ee744.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a6bc386b28f2aac.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a7074a2a5515531.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a72fef43a51c479.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a73064c82730ff5.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a78374f2d3949ae.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a7be0b883a12966.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a7c597abf1e90d4.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a7f13330a5d0023.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a7f4d9a0ccb9afe.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a7fbc1d68e4e5ae.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a82f0443c940816.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a8657e8b5c9d7bb.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a877314ca2039d9.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a917bbca24cf75d.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a94296ff543a1dc.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a9f73b3c2557150.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0a9ff75a7897e757.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0aa206fa7ea80036.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0aa3a6c33fca122b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0aaad833ac61ac9d.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0aacbdb54e853a0a.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0aad9fc79a35bd53.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0aae34863935e33a.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0ab050b51e78acdb.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0ab10a6417ef2301.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0ab2b64f27f8baca.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0ab5c690eebfad95.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0ac166d12e401a98.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0ac2f91a7995aa8b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0ac3c1db1b3645f2.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0ac51477636a6933.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0ac52440f73b5c80.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0ad7884032419621.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0ad7bad30cd432df.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0ad99d610a9092e6.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0ada35baba28134b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0adc1330287b2e66.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0adc373e996aadc2.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0add91a2efb3f33d.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/open_images_annotations_100/validation/0ade7aef439e2102.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/alaska_lakes/43259216952_59352d7204_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/australia/12822389285_a7723081b5_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/australia/8720651218_ca82a6608e_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/black_forest/8364557382_c6c9ee2fd6_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/canada/256743165_9f130ba95b_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/canada/2883773_881c197107_c.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/carribean/14351041152_ef77484a1f_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/carribean/18176301_c9d27557cf_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/cliff_ocean/36142796444_45d452f567_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/desert/4534149722_3cc4f92891_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/geysir/14996762478_a9bdbf959a_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/geysir/26320755536_7c769b6218_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/geysir/4748115806_7219c2b3be_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/ireland/15570753471_74db396d14_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/lakes/39933489595_f0e5d85b6d_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/meadow/18864473291_844325caab_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/mongolia/6076373946_e9ea2aee32_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/newzealand_np/7942812194_9348729b93_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/norway/20099378793_cc2df820af_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/norway/25735082181_999927fe5a_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/swiss_mountains/33509672006_bf4c416afd_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/data/sflckr_images/volcano/50254383883_27ed6ea93a_b.jpg filter=lfs diff=lfs merge=lfs -text
+ taming-transformers/scripts/reconstruction_usage.ipynb filter=lfs diff=lfs merge=lfs -text
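Every entry above uses the standard Git LFS tracking attributes: `filter=lfs diff=lfs merge=lfs` routes a file's content through the LFS clean/smudge filters so Git stores only a small pointer, and `-text` disables text normalization so line endings in the binary payload are never rewritten. Lines of this form are exactly what `git lfs track <pattern>` appends to `.gitattributes`; tracking individual files (as this commit does) and tracking by glob pattern behave the same. A directory-wide rule, shown here as a hypothetical example that is not part of this commit, would look like:

```
# Track every JPEG under data/ with Git LFS instead of listing each file
taming-transformers/data/**/*.jpg filter=lfs diff=lfs merge=lfs -text
```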
taming-transformers/License.txt ADDED
@@ -0,0 +1,19 @@
+ Copyright (c) 2020 Patrick Esser and Robin Rombach and Björn Ommer
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+ OR OTHER DEALINGS IN THE SOFTWARE.
taming-transformers/README.md ADDED
@@ -0,0 +1,410 @@
+ # Taming Transformers for High-Resolution Image Synthesis
+ ##### CVPR 2021 (Oral)
+ ![teaser](assets/mountain.jpeg)
+
+ [**Taming Transformers for High-Resolution Image Synthesis**](https://compvis.github.io/taming-transformers/)<br/>
+ [Patrick Esser](https://github.com/pesser)\*,
+ [Robin Rombach](https://github.com/rromb)\*,
+ [Björn Ommer](https://hci.iwr.uni-heidelberg.de/Staff/bommer)<br/>
+ \* equal contribution
+
+ **tl;dr** We combine the efficiency of convolutional approaches with the expressivity of transformers by introducing a convolutional VQGAN, which learns a codebook of context-rich visual parts, whose composition is modeled with an autoregressive transformer.
+
+ ![teaser](assets/teaser.png)
+ [arXiv](https://arxiv.org/abs/2012.09841) | [BibTeX](#bibtex) | [Project Page](https://compvis.github.io/taming-transformers/)
+
+
+ ### News
+ #### 2022
+ - More pretrained VQGANs (e.g. an f8-model with only 256 codebook entries) are available in our new work on [Latent Diffusion Models](https://github.com/CompVis/latent-diffusion).
+ - Added scene synthesis models as proposed in the paper [High-Resolution Complex Scene Synthesis with Transformers](https://arxiv.org/abs/2105.06458), see [this section](#scene-image-synthesis).
+ #### 2021
+ - Thanks to [rom1504](https://github.com/rom1504) it is now easy to [train a VQGAN on your own datasets](#training-on-custom-data).
+ - Included a bugfix for the quantizer. For backward compatibility it is
+   disabled by default (which corresponds to always training with `beta=1.0`).
+   Use `legacy=False` in the quantizer config to enable it.
+   Thanks [richcmwang](https://github.com/richcmwang) and [wcshin-git](https://github.com/wcshin-git)!
+ - Our paper received an update: see https://arxiv.org/abs/2012.09841v3 and the corresponding changelog.
+ - Added a pretrained, [1.4B transformer model](https://k00.fr/s511rwcv) trained for class-conditional ImageNet synthesis, which obtains state-of-the-art FID scores among autoregressive approaches and outperforms BigGAN.
+ - Added pretrained, unconditional models on [FFHQ](https://k00.fr/yndvfu95) and [CelebA-HQ](https://k00.fr/2xkmielf).
+ - Added accelerated sampling via caching of keys/values in the self-attention operation, used in `scripts/sample_fast.py`.
+ - Added a checkpoint of a [VQGAN](https://heibox.uni-heidelberg.de/d/2e5662443a6b4307b470/) trained with f8 compression and Gumbel-Quantization.
+   See also our updated [reconstruction notebook](https://colab.research.google.com/github/CompVis/taming-transformers/blob/master/scripts/reconstruction_usage.ipynb).
+ - We added a [colab notebook](https://colab.research.google.com/github/CompVis/taming-transformers/blob/master/scripts/reconstruction_usage.ipynb) which compares two VQGANs and OpenAI's [DALL-E](https://github.com/openai/DALL-E). See also [this section](#more-resources).
+ - We now include an overview of pretrained models in [Tab.1](#overview-of-pretrained-models). We added models for [COCO](#coco) and [ADE20k](#ade20k).
+ - The streamlit demo now supports image completions.
+ - We now include a couple of examples from the D-RIN dataset so you can run the
+   [D-RIN demo](#d-rin) without preparing the dataset first.
+ - You can now jump right into sampling with our [Colab quickstart notebook](https://colab.research.google.com/github/CompVis/taming-transformers/blob/master/scripts/taming-transformers.ipynb).
+
+ ## Requirements
+ A suitable [conda](https://conda.io/) environment named `taming` can be created
+ and activated with:
+
+ ```
+ conda env create -f environment.yaml
+ conda activate taming
+ ```
+ ## Overview of pretrained models
+ The following table provides an overview of all models that are currently available.
+ FID scores were evaluated using [torch-fidelity](https://github.com/toshas/torch-fidelity).
+ For reference, we also include a link to the recently released autoencoder of the [DALL-E](https://github.com/openai/DALL-E) model.
+ See the corresponding [colab
+ notebook](https://colab.research.google.com/github/CompVis/taming-transformers/blob/master/scripts/reconstruction_usage.ipynb)
+ for a comparison and discussion of reconstruction capabilities.
+
+ | Dataset | FID vs train | FID vs val | Link | Samples (256x256) | Comments |
+ | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |
+ | FFHQ (f=16) | 9.6 | -- | [ffhq_transformer](https://k00.fr/yndvfu95) | [ffhq_samples](https://k00.fr/j626x093) | |
+ | CelebA-HQ (f=16) | 10.2 | -- | [celebahq_transformer](https://k00.fr/2xkmielf) | [celebahq_samples](https://k00.fr/j626x093) | |
+ | ADE20K (f=16) | -- | 35.5 | [ade20k_transformer](https://k00.fr/ot46cksa) | [ade20k_samples.zip](https://heibox.uni-heidelberg.de/f/70bb78cbaf844501b8fb/) [2k] | evaluated on val split (2k images) |
+ | COCO-Stuff (f=16) | -- | 20.4 | [coco_transformer](https://k00.fr/2zz6i2ce) | [coco_samples.zip](https://heibox.uni-heidelberg.de/f/a395a9be612f4a7a8054/) [5k] | evaluated on val split (5k images) |
+ | ImageNet (cIN) (f=16) | 15.98/15.78/6.59/5.88/5.20 | -- | [cin_transformer](https://k00.fr/s511rwcv) | [cin_samples](https://k00.fr/j626x093) | different decoding hyperparameters |
+ | | | | | | |
+ | FacesHQ (f=16) | -- | -- | [faceshq_transformer](https://k00.fr/qqfl2do8) | | |
+ | S-FLCKR (f=16) | -- | -- | [sflckr](https://heibox.uni-heidelberg.de/d/73487ab6e5314cb5adba/) | | |
+ | D-RIN (f=16) | -- | -- | [drin_transformer](https://k00.fr/39jcugc5) | | |
+ | | | | | | |
+ | VQGAN ImageNet (f=16), 1024 | 10.54 | 7.94 | [vqgan_imagenet_f16_1024](https://heibox.uni-heidelberg.de/d/8088892a516d4e3baf92/) | [reconstructions](https://k00.fr/j626x093) | Reconstruction-FIDs. |
+ | VQGAN ImageNet (f=16), 16384 | 7.41 | 4.98 | [vqgan_imagenet_f16_16384](https://heibox.uni-heidelberg.de/d/a7530b09fed84f80a887/) | [reconstructions](https://k00.fr/j626x093) | Reconstruction-FIDs. |
+ | VQGAN OpenImages (f=8), 256 | -- | 1.49 | https://ommer-lab.com/files/latent-diffusion/vq-f8-n256.zip | --- | Reconstruction-FIDs. Available via [latent diffusion](https://github.com/CompVis/latent-diffusion). |
+ | VQGAN OpenImages (f=8), 16384 | -- | 1.14 | https://ommer-lab.com/files/latent-diffusion/vq-f8.zip | --- | Reconstruction-FIDs. Available via [latent diffusion](https://github.com/CompVis/latent-diffusion). |
+ | VQGAN OpenImages (f=8), 8192, GumbelQuantization | 3.24 | 1.49 | [vqgan_gumbel_f8](https://heibox.uni-heidelberg.de/d/2e5662443a6b4307b470/) | --- | Reconstruction-FIDs. |
+ | | | | | | |
+ | DALL-E dVAE (f=8), 8192, GumbelQuantization | 33.88 | 32.01 | https://github.com/openai/DALL-E | [reconstructions](https://k00.fr/j626x093) | Reconstruction-FIDs. |
+
+
+ ## Running pretrained models
+
+ The commands below will start a streamlit demo which supports sampling at
+ different resolutions and image completions. To run a non-interactive version
+ of the sampling process, replace `streamlit run scripts/sample_conditional.py --`
+ with `python scripts/make_samples.py --outdir <path_to_write_samples_to>` and
+ keep the remaining command line arguments.
+
+ To sample from unconditional or class-conditional models,
+ run `python scripts/sample_fast.py -r <path/to/config_and_checkpoint>`.
+ We describe below how to use this script to sample from the ImageNet, FFHQ, and CelebA-HQ models,
+ respectively.
+
+ ### S-FLCKR
+ ![teaser](assets/sunset_and_ocean.jpg)
+
+ You can also [run this model in a Colab
+ notebook](https://colab.research.google.com/github/CompVis/taming-transformers/blob/master/scripts/taming-transformers.ipynb),
+ which includes all necessary steps to start sampling.
+
+ Download the
+ [2020-11-09T13-31-51_sflckr](https://heibox.uni-heidelberg.de/d/73487ab6e5314cb5adba/)
+ folder and place it into `logs`. Then, run
+ ```
+ streamlit run scripts/sample_conditional.py -- -r logs/2020-11-09T13-31-51_sflckr/
+ ```
+
+ ### ImageNet
+ ![teaser](assets/imagenet.png)
+
+ Download the [2021-04-03T19-39-50_cin_transformer](https://k00.fr/s511rwcv)
+ folder and place it into `logs`. Sampling from the class-conditional ImageNet
+ model does not require any data preparation. To produce 50 samples for each of
+ the 1000 classes of ImageNet, with k=600 for top-k sampling, p=0.92 for nucleus
+ sampling and temperature t=1.0, run
+
+ ```
+ python scripts/sample_fast.py -r logs/2021-04-03T19-39-50_cin_transformer/ -n 50 -k 600 -t 1.0 -p 0.92 --batch_size 25
+ ```
+
+ To restrict the model to certain classes, provide them via the `--classes` argument, separated by
+ commas. For example, to sample 50 *ostriches*, *border collies* and *whiskey jugs*, run
+
+ ```
+ python scripts/sample_fast.py -r logs/2021-04-03T19-39-50_cin_transformer/ -n 50 -k 600 -t 1.0 -p 0.92 --batch_size 25 --classes 9,232,901
+ ```
+ We recommend experimenting with the autoregressive decoding parameters (top-k, top-p and temperature) for best results.
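+
+ For intuition, here is a minimal, self-contained sketch of what these three knobs do to the next-token logits before sampling (illustrative only; `scripts/sample_fast.py` ships its own implementation). The vocabulary size of 16384 matches the codebook of the ImageNet VQGAN used above:
+
+ ```
+ import torch
+ import torch.nn.functional as F
+
+ def filter_logits(logits, temperature=1.0, top_k=None, top_p=None):
+     """Apply temperature, top-k and nucleus (top-p) filtering to raw logits."""
+     logits = logits / temperature
+     if top_k is not None:
+         # mask everything below the k-th largest logit
+         kth = torch.topk(logits, top_k).values[..., -1, None]
+         logits = logits.masked_fill(logits < kth, float("-inf"))
+     if top_p is not None:
+         # keep the smallest prefix of sorted tokens whose cumulative mass exceeds p
+         sorted_logits, sorted_idx = torch.sort(logits, descending=True)
+         cum = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+         remove = cum > top_p
+         remove[..., 1:] = remove[..., :-1].clone()  # shift so the first token over the threshold is kept
+         remove[..., 0] = False
+         logits = logits.masked_fill(remove.scatter(-1, sorted_idx, remove), float("-inf"))
+     return logits
+
+ # draw one codebook index with the settings used above (k=600, p=0.92, t=1.0)
+ logits = torch.randn(1, 16384)  # stand-in for the transformer's output
+ probs = F.softmax(filter_logits(logits, 1.0, 600, 0.92), dim=-1)
+ index = torch.multinomial(probs, num_samples=1)
+ ```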
+
+ ### FFHQ/CelebA-HQ
+
+ Download the [2021-04-23T18-19-01_ffhq_transformer](https://k00.fr/yndvfu95) and
+ [2021-04-23T18-11-19_celebahq_transformer](https://k00.fr/2xkmielf)
+ folders and place them into `logs`.
+ Again, sampling from these unconditional models does not require any data preparation.
+ To produce 50000 samples, with k=250 for top-k sampling,
+ p=1.0 for nucleus sampling and temperature t=1.0, run
+
+ ```
+ python scripts/sample_fast.py -r logs/2021-04-23T18-19-01_ffhq_transformer/
+ ```
+ for FFHQ and
+
+ ```
+ python scripts/sample_fast.py -r logs/2021-04-23T18-11-19_celebahq_transformer/
+ ```
+ to sample from the CelebA-HQ model.
+ For both models it can be advantageous to vary the top-k/top-p parameters for sampling.
+
+ ### FacesHQ
+ ![teaser](assets/faceshq.jpg)
+
+ Download [2020-11-13T21-41-45_faceshq_transformer](https://k00.fr/qqfl2do8) and
+ place it into `logs`. Follow the data preparation steps for
+ [CelebA-HQ](#celeba-hq) and [FFHQ](#ffhq). Run
+ ```
+ streamlit run scripts/sample_conditional.py -- -r logs/2020-11-13T21-41-45_faceshq_transformer/
+ ```
+
+ ### D-RIN
+ ![teaser](assets/drin.jpg)
+
+ Download [2020-11-20T12-54-32_drin_transformer](https://k00.fr/39jcugc5) and
+ place it into `logs`. To run the demo on a couple of example depth maps
+ included in the repository, run
+
+ ```
+ streamlit run scripts/sample_conditional.py -- -r logs/2020-11-20T12-54-32_drin_transformer/ --ignore_base_data data="{target: main.DataModuleFromConfig, params: {batch_size: 1, validation: {target: taming.data.imagenet.DRINExamples}}}"
+ ```
+
+ To run the demo on the complete validation set, first follow the data preparation steps for
+ [ImageNet](#imagenet) and then run
+ ```
+ streamlit run scripts/sample_conditional.py -- -r logs/2020-11-20T12-54-32_drin_transformer/
+ ```
+
+ ### COCO
+ Download [2021-01-20T16-04-20_coco_transformer](https://k00.fr/2zz6i2ce) and
+ place it into `logs`. To run the demo on a couple of example segmentation maps
+ included in the repository, run
+
+ ```
+ streamlit run scripts/sample_conditional.py -- -r logs/2021-01-20T16-04-20_coco_transformer/ --ignore_base_data data="{target: main.DataModuleFromConfig, params: {batch_size: 1, validation: {target: taming.data.coco.Examples}}}"
+ ```
+
+ ### ADE20k
+ Download [2020-11-20T21-45-44_ade20k_transformer](https://k00.fr/ot46cksa) and
+ place it into `logs`. To run the demo on a couple of example segmentation maps
+ included in the repository, run
+
+ ```
+ streamlit run scripts/sample_conditional.py -- -r logs/2020-11-20T21-45-44_ade20k_transformer/ --ignore_base_data data="{target: main.DataModuleFromConfig, params: {batch_size: 1, validation: {target: taming.data.ade20k.Examples}}}"
+ ```
+
+ ## Scene Image Synthesis
+ ![teaser](assets/scene_images_samples.svg)
+ Scene image generation based on bounding-box conditionals, as done in our CVPR 2021 AI4CC workshop paper [High-Resolution Complex Scene Synthesis with Transformers](https://arxiv.org/abs/2105.06458) (see talk on the [workshop page](https://visual.cs.brown.edu/workshops/aicc2021/#awards)). Supports the COCO and Open Images datasets.
+
+ ### Training
+ Download the first-stage models [COCO-8k-VQGAN](https://heibox.uni-heidelberg.de/f/78dea9589974474c97c1/) for COCO or [COCO/Open-Images-8k-VQGAN](https://heibox.uni-heidelberg.de/f/461d9a9f4fcf48ab84f4/) for Open Images.
+ Change `ckpt_path` in `configs/coco_scene_images_transformer.yaml` and `configs/open_images_scene_images_transformer.yaml` to point to the downloaded first-stage models.
+ Download the full COCO/OI datasets and adapt `data_path` in the same files, unless the 100 files provided for training and validation already suit your needs.
+
+ Code can be run with
+ `python main.py --base configs/coco_scene_images_transformer.yaml -t True --gpus 0,`
+ or
+ `python main.py --base configs/open_images_scene_images_transformer.yaml -t True --gpus 0,`
+
+ ### Sampling
+ Train a model as described above or download a pre-trained model:
+ - An [Open Images model with 1 billion parameters](https://drive.google.com/file/d/1FEK-Z7hyWJBvFWQF50pzSK9y1W_CJEig/view?usp=sharing), trained for 100 epochs. On 256x256 pixels: FID 41.48±0.21, SceneFID 14.60±0.15, Inception Score 18.47±0.27. The model was trained with 2D crops of images and is thus well-prepared for the task of generating high-resolution images, e.g. 512x512.
+ - An [Open Images model distilled from the above, with 125 million parameters](https://drive.google.com/file/d/1xf89g0mc78J3d8Bx5YhbK4tNRNlOoYaO), which allows sampling on smaller GPUs (4 GB is enough for sampling 256x256 px images). It was trained for 60 epochs with 10% soft loss and 90% hard loss. On 256x256 pixels: FID 43.07±0.40, SceneFID 15.93±0.19, Inception Score 17.23±0.11.
+ - [COCO, 30 epochs](https://heibox.uni-heidelberg.de/f/0d0b2594e9074c7e9a33/)
+ - [COCO, 60 epochs](https://drive.google.com/file/d/1bInd49g2YulTJBjU32Awyt5qnzxxG5U9/) (find model statistics for both COCO versions in `assets/coco_scene_images_training.svg`)
+
+ When downloading a pre-trained model, remember to change `ckpt_path` in `configs/*project.yaml` to point to your downloaded first-stage model (see the Training section above).
+
+ Scene image generation can be run with
+ `python scripts/make_scene_samples.py --outdir=/some/outdir -r /path/to/pretrained/model --resolution=512,512`
+
+
+ ## Training on custom data
+
+ Training on your own dataset can be beneficial to get better tokens and hence better images for your domain.
+ These are the steps to follow to make this work:
+ 1. install the repo with `conda env create -f environment.yaml`, `conda activate taming` and `pip install -e .`
+ 2. put your .jpg files in a folder `your_folder`
+ 3. create two text files, `xx_train.txt` and `xx_test.txt`, that point to the files in your training and test set respectively (for example `find $(pwd)/your_folder -name "*.jpg" > train.txt`, or see the Python sketch below this list)
+ 4. adapt `configs/custom_vqgan.yaml` to point to these two files
+ 5. run `python main.py --base configs/custom_vqgan.yaml -t True --gpus 0,1` to
+    train on two GPUs. Use `--gpus 0,` (with a trailing comma) to train on a single GPU.
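+
+ For step 3, a minimal Python sketch with a random 90/10 split (the file names and the split ratio are illustrative; any two lists of image paths will do):
+
+ ```
+ import random
+ from pathlib import Path
+
+ # collect absolute paths to all .jpg files; each list file holds one path per line
+ files = sorted(str(p.resolve()) for p in Path("your_folder").rglob("*.jpg"))
+ random.seed(0)
+ random.shuffle(files)
+ split = int(0.9 * len(files))  # e.g. 90% train / 10% test
+ Path("xx_train.txt").write_text("\n".join(files[:split]) + "\n")
+ Path("xx_test.txt").write_text("\n".join(files[split:]) + "\n")
+ ```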
+
+ ## Data Preparation
+
+ ### ImageNet
+ The code will try to download (through [Academic
+ Torrents](http://academictorrents.com/)) and prepare ImageNet the first time it
+ is used. However, since ImageNet is quite large, this requires a lot of disk
+ space and time. If you already have ImageNet on your disk, you can speed things
+ up by putting the data into
+ `${XDG_CACHE}/autoencoders/data/ILSVRC2012_{split}/data/` (which defaults to
+ `~/.cache/autoencoders/data/ILSVRC2012_{split}/data/`), where `{split}` is one
+ of `train`/`validation`. It should have the following structure:
+
+ ```
+ ${XDG_CACHE}/autoencoders/data/ILSVRC2012_{split}/data/
+ ├── n01440764
+ │   ├── n01440764_10026.JPEG
+ │   ├── n01440764_10027.JPEG
+ │   ├── ...
+ ├── n01443537
+ │   ├── n01443537_10007.JPEG
+ │   ├── n01443537_10014.JPEG
+ │   ├── ...
+ ├── ...
+ ```
+
+ If you haven't extracted the data, you can also place
+ `ILSVRC2012_img_train.tar`/`ILSVRC2012_img_val.tar` (or symlinks to them) into
+ `${XDG_CACHE}/autoencoders/data/ILSVRC2012_train/` /
+ `${XDG_CACHE}/autoencoders/data/ILSVRC2012_validation/`, which will then be
+ extracted into the structure above without downloading it again. Note that this
+ will only happen if neither a folder
+ `${XDG_CACHE}/autoencoders/data/ILSVRC2012_{split}/data/` nor a file
+ `${XDG_CACHE}/autoencoders/data/ILSVRC2012_{split}/.ready` exists. Remove them
+ if you want to force running the dataset preparation again.
+
+ You will then need to prepare the depth data using
+ [MiDaS](https://github.com/intel-isl/MiDaS). Create a symlink
+ `data/imagenet_depth` pointing to a folder with two subfolders `train` and
+ `val`, each mirroring the structure of the corresponding ImageNet folder
+ described above and containing a `png` file for each of ImageNet's `JPEG`
+ files. The `png` encodes `float32` depth values obtained from MiDaS as RGBA
+ images. We provide the script `scripts/extract_depth.py` to generate this data.
+ **Please note** that this script uses [MiDaS via PyTorch
+ Hub](https://pytorch.org/hub/intelisl_midas_v2/). When we prepared the data,
+ the hub provided the [MiDaS
+ v2.0](https://github.com/intel-isl/MiDaS/releases/tag/v2) version, but it now
+ provides v2.1. We haven't tested our models with depth maps obtained
+ via v2.1, so if you want to make sure that things work as expected, you must
+ adjust the script to make sure it explicitly uses
+ [v2.0](https://github.com/intel-isl/MiDaS/releases/tag/v2)!
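+
+ A minimal sketch of such an adjustment, combining the version pin with the RGBA encoding described above (the `v2` hub tag and the byte-level encoding are assumptions for illustration; `scripts/extract_depth.py` remains the authoritative version, and input preprocessing is omitted):
+
+ ```
+ import numpy as np
+ import torch
+ from PIL import Image
+
+ # pin the hub ref to the v2 tag instead of the default branch
+ midas = torch.hub.load("intel-isl/MiDaS:v2", "MiDaS")
+ midas.eval()
+
+ def save_depth_rgba(depth, path):
+     """Store float32 depth losslessly by viewing each value's 4 bytes as RGBA."""
+     d = np.ascontiguousarray(depth.astype(np.float32))
+     rgba = d.view(np.uint8).reshape(*d.shape, 4)
+     Image.fromarray(rgba, mode="RGBA").save(path)
+
+ with torch.no_grad():
+     x = torch.randn(1, 3, 384, 384)  # stand-in for a preprocessed image
+     depth = midas(x)[0].cpu().numpy()
+     save_depth_rgba(depth, "example_depth.png")
+ ```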
+
+ ### CelebA-HQ
+ Create a symlink `data/celebahq` pointing to a folder containing the `.npy`
+ files of CelebA-HQ (instructions to obtain them can be found in the [PGGAN
+ repository](https://github.com/tkarras/progressive_growing_of_gans)).
+
+ ### FFHQ
+ Create a symlink `data/ffhq` pointing to the `images1024x1024` folder obtained
+ from the [FFHQ repository](https://github.com/NVlabs/ffhq-dataset).
+
+ ### S-FLCKR
+ Unfortunately, we are not allowed to distribute the images we collected for the
+ S-FLCKR dataset and can therefore only give a description of how it was produced.
+ There are many resources on [collecting images from the
+ web](https://github.com/adrianmrit/flickrdatasets) to get started.
+ We collected sufficiently large images from [flickr](https://www.flickr.com)
+ (see `data/flickr_tags.txt` for a full list of tags used to find images)
+ and various [subreddits](https://www.reddit.com/r/sfwpornnetwork/wiki/network)
+ (see `data/subreddits.txt` for all subreddits that were used).
+ Overall, we collected 107625 images, and split them randomly into 96861
+ training images and 10764 validation images. We then obtained segmentation
+ masks for each image using [DeepLab v2](https://arxiv.org/abs/1606.00915)
+ trained on [COCO-Stuff](https://arxiv.org/abs/1612.03716). We used a [PyTorch
+ reimplementation](https://github.com/kazuto1011/deeplab-pytorch) and include an
+ example script for this process in `scripts/extract_segmentation.py`.
+
+ ### COCO
+ Create a symlink `data/coco` containing the images from the 2017 split in
+ `train2017` and `val2017`, and their annotations in `annotations`. Files can be
+ obtained from the [COCO webpage](https://cocodataset.org/). In addition, we use
+ the [Stuff+thing PNG-style annotations on COCO 2017
+ trainval](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip)
+ annotations from [COCO-Stuff](https://github.com/nightrome/cocostuff), which
+ should be placed under `data/cocostuffthings`.
+
+ ### ADE20k
+ Create a symlink `data/ade20k_root` containing the contents of
+ [ADEChallengeData2016.zip](http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip)
+ from the [MIT Scene Parsing Benchmark](http://sceneparsing.csail.mit.edu/).
+
+ ## Training models
+
+ ### FacesHQ
+
+ Train a VQGAN with
+ ```
+ python main.py --base configs/faceshq_vqgan.yaml -t True --gpus 0,
+ ```
+
+ Then, adjust the checkpoint path of the config key
+ `model.params.first_stage_config.params.ckpt_path` in
+ `configs/faceshq_transformer.yaml` (or download
+ [2020-11-09T13-33-36_faceshq_vqgan](https://k00.fr/uxy5usa9) and place it into `logs`, which
+ corresponds to the preconfigured checkpoint path), then run
+ ```
+ python main.py --base configs/faceshq_transformer.yaml -t True --gpus 0,
+ ```
+
+ ### D-RIN
+
+ Train a VQGAN on ImageNet with
+ ```
+ python main.py --base configs/imagenet_vqgan.yaml -t True --gpus 0,
+ ```
+
+ or download a pretrained one from [2020-09-23T17-56-33_imagenet_vqgan](https://k00.fr/u0j2dtac)
+ and place it under `logs`. If you trained your own, adjust the path in the config
+ key `model.params.first_stage_config.params.ckpt_path` of
+ `configs/drin_transformer.yaml`.
+
+ Train a VQGAN on depth maps of ImageNet with
+ ```
+ python main.py --base configs/imagenetdepth_vqgan.yaml -t True --gpus 0,
+ ```
+
+ or download a pretrained one from [2020-11-03T15-34-24_imagenetdepth_vqgan](https://k00.fr/55rlxs6i)
+ and place it under `logs`. If you trained your own, adjust the path in the config
+ key `model.params.cond_stage_config.params.ckpt_path` of
+ `configs/drin_transformer.yaml`.
+
+ To train the transformer, run
+ ```
+ python main.py --base configs/drin_transformer.yaml -t True --gpus 0,
+ ```
+
+ ## More Resources
+ ### Comparing Different First Stage Models
+ The reconstruction and compression capabilities of different first stage models can be analyzed in this [colab notebook](https://colab.research.google.com/github/CompVis/taming-transformers/blob/master/scripts/reconstruction_usage.ipynb).
+ In particular, the notebook compares two VQGANs, each with a downsampling factor of f=16 and codebook sizes of 1024 and 16384,
+ a VQGAN with f=8 and 8192 codebook entries, and the discrete autoencoder of OpenAI's [DALL-E](https://github.com/openai/DALL-E) (which has f=8 and 8192
+ codebook entries).
+ ![firststages1](assets/first_stage_squirrels.png)
+ ![firststages2](assets/first_stage_mushrooms.png)
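+
+ At its core, such a comparison is an encode/decode round trip through each first stage. A minimal sketch for one of the VQGANs above (the checkpoint and config paths are placeholders for wherever you extracted the download):
+
+ ```
+ import torch
+ from omegaconf import OmegaConf
+ from taming.models.vqgan import VQModel
+
+ config = OmegaConf.load("logs/vqgan_imagenet_f16_1024/configs/model.yaml")
+ model = VQModel(**config.model.params)
+ ckpt = torch.load("logs/vqgan_imagenet_f16_1024/checkpoints/last.ckpt", map_location="cpu")
+ model.load_state_dict(ckpt["state_dict"], strict=False)
+ model.eval()
+
+ x = torch.rand(1, 3, 256, 256) * 2 - 1  # image batch scaled to [-1, 1]
+ with torch.no_grad():
+     quant, _, (_, _, indices) = model.encode(x)  # 16x16 grid of codebook indices
+     x_rec = model.decode(quant)                  # reconstruction from the codes
+ ```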
+
+ ### Other
+ - A [video summary](https://www.youtube.com/watch?v=o7dqGcLDf0A&feature=emb_imp_woyt) by [Two Minute Papers](https://www.youtube.com/channel/UCbfYPyITQ-7l4upoX8nvctg).
+ - A [video summary](https://www.youtube.com/watch?v=-wDSDtIAyWQ) by [Gradient Dude](https://www.youtube.com/c/GradientDude/about).
+ - A [Weights and Biases report summarizing the paper](https://wandb.ai/ayush-thakur/taming-transformer/reports/-Overview-Taming-Transformers-for-High-Resolution-Image-Synthesis---Vmlldzo0NjEyMTY)
+   by [ayulockin](https://github.com/ayulockin).
+ - A [video summary](https://www.youtube.com/watch?v=JfUTd8fjtX8&feature=emb_imp_woyt) by [What's AI](https://www.youtube.com/channel/UCUzGQrN-lyyc0BWTYoJM_Sg).
+ - Take a look at [ak9250's notebook](https://github.com/ak9250/taming-transformers/blob/master/tamingtransformerscolab.ipynb) if you want to run the streamlit demos on Colab.
+
+ ### Text-to-Image Optimization via CLIP
+ VQGAN has been successfully used as an image generator guided by the [CLIP](https://github.com/openai/CLIP) model, both for pure image generation
+ from scratch and for image-to-image translation. We recommend the following notebooks/videos/resources:
+
+ - [Advadnoun's](https://twitter.com/advadnoun/status/1389316507134357506) Patreon and corresponding LatentVision notebooks: https://www.patreon.com/patronizeme
+ - The [notebook](https://colab.research.google.com/drive/1L8oL-vLJXVcRzCFbPwOoMkPKJ8-aYdPN) of [Rivers Have Wings](https://twitter.com/RiversHaveWings).
+ - A [video](https://www.youtube.com/watch?v=90QDe6DQXF4&t=12s) explanation by [Dot CSV](https://www.youtube.com/channel/UCy5znSnfMsDwaLlROnZ7Qbg) (in Spanish, but English subtitles are available)
+
+ ![txt2img](assets/birddrawnbyachild.png)
+
+ Text prompt: *'A bird drawn by a child'*
+
+ ## Shout-outs
+ Thanks to everyone who makes their code and models available. In particular,
+
+ - The architecture of our VQGAN is inspired by [Denoising Diffusion Probabilistic Models](https://github.com/hojonathanho/diffusion)
+ - The very hackable transformer implementation [minGPT](https://github.com/karpathy/minGPT)
+ - The good ol' [PatchGAN](https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix) and [Learned Perceptual Similarity (LPIPS)](https://github.com/richzhang/PerceptualSimilarity)
+
+ ## BibTeX
+
+ ```
+ @misc{esser2020taming,
+       title={Taming Transformers for High-Resolution Image Synthesis},
+       author={Patrick Esser and Robin Rombach and Björn Ommer},
+       year={2020},
+       eprint={2012.09841},
+       archivePrefix={arXiv},
+       primaryClass={cs.CV}
+ }
+ ```
taming-transformers/assets/birddrawnbyachild.png ADDED

Git LFS Details

  • SHA256: 165778bb85e86f8aaaed38eee4d33f62ab1ef237d890229cfa2e0685f5064127
  • Pointer size: 132 Bytes
  • Size of remote file: 1.61 MB
taming-transformers/assets/coco_scene_images_training.svg ADDED
taming-transformers/assets/drin.jpg ADDED

Git LFS Details

  • SHA256: 83652380049c45af8c1b75216ded141b3d064cca8154eb2875337b4d5182152b
  • Pointer size: 131 Bytes
  • Size of remote file: 286 kB
taming-transformers/assets/faceshq.jpg ADDED

Git LFS Details

  • SHA256: 6f20c66b935086464db0bad4b5dd90fadb3fb1d20373cb02c415ec4a9cfb989c
  • Pointer size: 131 Bytes
  • Size of remote file: 307 kB
taming-transformers/assets/first_stage_mushrooms.png ADDED

Git LFS Details

  • SHA256: 425218621d5e01ea30c9e51fa0969ad36c22063a405dc6f6ccb6dd8db64000a0
  • Pointer size: 132 Bytes
  • Size of remote file: 1.35 MB
taming-transformers/assets/first_stage_squirrels.png ADDED

Git LFS Details

  • SHA256: b5f234ee1566d6c537339a7110a1a1df088d527812097c19ac61f01b335cd6ae
  • Pointer size: 132 Bytes
  • Size of remote file: 1.42 MB
taming-transformers/assets/imagenet.png ADDED

Git LFS Details

  • SHA256: 2057d65399435ba17f265ad7ff421a9aabfb6051dec00bec5a37383dfccb2e54
  • Pointer size: 132 Bytes
  • Size of remote file: 1.03 MB
taming-transformers/assets/lake_in_the_mountains.png ADDED

Git LFS Details

  • SHA256: 9d0fa79e39e09c1eb398b1643cf3c5ee2cc94cc6f394771d20cb907838b36852
  • Pointer size: 131 Bytes
  • Size of remote file: 565 kB
taming-transformers/assets/mountain.jpeg ADDED

Git LFS Details

  • SHA256: 22859310b39f5011abc78e36970fdb0f3d62a33817d9301bde3d1252a11bc0bc
  • Pointer size: 131 Bytes
  • Size of remote file: 436 kB
taming-transformers/assets/scene_images_samples.svg ADDED
taming-transformers/assets/stormy.jpeg ADDED

Git LFS Details

  • SHA256: 13b9cde8e62c3fb145c4dd3d13c0d450e023f2405824f0a74b4e3f06411ce884
  • Pointer size: 131 Bytes
  • Size of remote file: 718 kB
taming-transformers/assets/sunset_and_ocean.jpg ADDED

Git LFS Details

  • SHA256: 0c967b3073a56221eda2cc5418efb8535a85d87f4b40cd487d42abae8135b341
  • Pointer size: 131 Bytes
  • Size of remote file: 322 kB
taming-transformers/assets/teaser.png ADDED

Git LFS Details

  • SHA256: 988481993d7911b41b38a86341e016a47729807552ce667f5713bca1118a7b11
  • Pointer size: 131 Bytes
  • Size of remote file: 359 kB
taming-transformers/configs/coco_cond_stage.yaml ADDED
@@ -0,0 +1,49 @@
+ model:
+   base_learning_rate: 4.5e-06
+   target: taming.models.vqgan.VQSegmentationModel
+   params:
+     embed_dim: 256
+     n_embed: 1024
+     image_key: "segmentation"
+     n_labels: 183
+     ddconfig:
+       double_z: false
+       z_channels: 256
+       resolution: 256
+       in_channels: 183
+       out_ch: 183
+       ch: 128
+       ch_mult:
+       - 1
+       - 1
+       - 2
+       - 2
+       - 4
+       num_res_blocks: 2
+       attn_resolutions:
+       - 16
+       dropout: 0.0
+
+     lossconfig:
+       target: taming.modules.losses.segmentation.BCELossWithQuant
+       params:
+         codebook_weight: 1.0
+
+ data:
+   target: main.DataModuleFromConfig
+   params:
+     batch_size: 12
+     train:
+       target: taming.data.coco.CocoImagesAndCaptionsTrain
+       params:
+         size: 296
+         crop_size: 256
+         onehot_segmentation: true
+         use_stuffthing: true
+     validation:
+       target: taming.data.coco.CocoImagesAndCaptionsValidation
+       params:
+         size: 256
+         crop_size: 256
+         onehot_segmentation: true
+         use_stuffthing: true
taming-transformers/configs/coco_scene_images_transformer.yaml ADDED
@@ -0,0 +1,80 @@
+ model:
+   base_learning_rate: 4.5e-06
+   target: taming.models.cond_transformer.Net2NetTransformer
+   params:
+     cond_stage_key: objects_bbox
+     transformer_config:
+       target: taming.modules.transformer.mingpt.GPT
+       params:
+         vocab_size: 8192
+         block_size: 348 # = 256 + 92 = dim(vqgan_latent_space,16x16) + dim(conditional_builder.embedding_dim)
+         n_layer: 40
+         n_head: 16
+         n_embd: 1408
+         embd_pdrop: 0.1
+         resid_pdrop: 0.1
+         attn_pdrop: 0.1
+     first_stage_config:
+       target: taming.models.vqgan.VQModel
+       params:
+         ckpt_path: /path/to/coco_epoch117.ckpt # https://heibox.uni-heidelberg.de/f/78dea9589974474c97c1/
+         embed_dim: 256
+         n_embed: 8192
+         ddconfig:
+           double_z: false
+           z_channels: 256
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 1
+           - 2
+           - 2
+           - 4
+           num_res_blocks: 2
+           attn_resolutions:
+           - 16
+           dropout: 0.0
+         lossconfig:
+           target: taming.modules.losses.DummyLoss
+     cond_stage_config:
+       target: taming.models.dummy_cond_stage.DummyCondStage
+       params:
+         conditional_key: objects_bbox
+
+ data:
+   target: main.DataModuleFromConfig
+   params:
+     batch_size: 6
+     train:
+       target: taming.data.annotated_objects_coco.AnnotatedObjectsCoco
+       params:
+         data_path: data/coco_annotations_100 # substitute with path to full dataset
+         split: train
+         keys: [image, objects_bbox, file_name, annotations]
+         no_tokens: 8192
+         target_image_size: 256
+         min_object_area: 0.00001
+         min_objects_per_image: 2
+         max_objects_per_image: 30
+         crop_method: random-1d
+         random_flip: true
+         use_group_parameter: true
+         encode_crop: true
+     validation:
+       target: taming.data.annotated_objects_coco.AnnotatedObjectsCoco
+       params:
+         data_path: data/coco_annotations_100 # substitute with path to full dataset
+         split: validation
+         keys: [image, objects_bbox, file_name, annotations]
+         no_tokens: 8192
+         target_image_size: 256
+         min_object_area: 0.00001
+         min_objects_per_image: 2
+         max_objects_per_image: 30
+         crop_method: center
+         random_flip: false
+         use_group_parameter: true
+         encode_crop: true
taming-transformers/configs/custom_vqgan.yaml ADDED
@@ -0,0 +1,43 @@
+ model:
+   base_learning_rate: 4.5e-6
+   target: taming.models.vqgan.VQModel
+   params:
+     embed_dim: 256
+     n_embed: 1024
+     ddconfig:
+       double_z: False
+       z_channels: 256
+       resolution: 256
+       in_channels: 3
+       out_ch: 3
+       ch: 128
+       ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1
+       num_res_blocks: 2
+       attn_resolutions: [16]
+       dropout: 0.0
+
+     lossconfig:
+       target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
+       params:
+         disc_conditional: False
+         disc_in_channels: 3
+         disc_start: 10000
+         disc_weight: 0.8
+         codebook_weight: 1.0
+
+ data:
+   target: main.DataModuleFromConfig
+   params:
+     batch_size: 5
+     num_workers: 8
+     train:
+       target: taming.data.custom.CustomTrain
+       params:
+         training_images_list_file: some/training.txt
+         size: 256
+     validation:
+       target: taming.data.custom.CustomTest
+       params:
+         test_images_list_file: some/test.txt
+         size: 256
+
taming-transformers/configs/drin_transformer.yaml ADDED
@@ -0,0 +1,77 @@
+ model:
+   base_learning_rate: 4.5e-06
+   target: taming.models.cond_transformer.Net2NetTransformer
+   params:
+     cond_stage_key: depth
+     transformer_config:
+       target: taming.modules.transformer.mingpt.GPT
+       params:
+         vocab_size: 1024
+         block_size: 512
+         n_layer: 24
+         n_head: 16
+         n_embd: 1024
+     first_stage_config:
+       target: taming.models.vqgan.VQModel
+       params:
+         ckpt_path: logs/2020-09-23T17-56-33_imagenet_vqgan/checkpoints/last.ckpt
+         embed_dim: 256
+         n_embed: 1024
+         ddconfig:
+           double_z: false
+           z_channels: 256
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 1
+           - 2
+           - 2
+           - 4
+           num_res_blocks: 2
+           attn_resolutions:
+           - 16
+           dropout: 0.0
+         lossconfig:
+           target: taming.modules.losses.DummyLoss
+     cond_stage_config:
+       target: taming.models.vqgan.VQModel
+       params:
+         ckpt_path: logs/2020-11-03T15-34-24_imagenetdepth_vqgan/checkpoints/last.ckpt
+         embed_dim: 256
+         n_embed: 1024
+         ddconfig:
+           double_z: false
+           z_channels: 256
+           resolution: 256
+           in_channels: 1
+           out_ch: 1
+           ch: 128
+           ch_mult:
+           - 1
+           - 1
+           - 2
+           - 2
+           - 4
+           num_res_blocks: 2
+           attn_resolutions:
+           - 16
+           dropout: 0.0
+         lossconfig:
+           target: taming.modules.losses.DummyLoss
+
+ data:
+   target: main.DataModuleFromConfig
+   params:
+     batch_size: 2
+     num_workers: 8
+     train:
+       target: taming.data.imagenet.RINTrainWithDepth
+       params:
+         size: 256
+     validation:
+       target: taming.data.imagenet.RINValidationWithDepth
+       params:
+         size: 256
taming-transformers/configs/faceshq_transformer.yaml ADDED
@@ -0,0 +1,61 @@
+ model:
+   base_learning_rate: 4.5e-06
+   target: taming.models.cond_transformer.Net2NetTransformer
+   params:
+     cond_stage_key: coord
+     transformer_config:
+       target: taming.modules.transformer.mingpt.GPT
+       params:
+         vocab_size: 1024
+         block_size: 512
+         n_layer: 24
+         n_head: 16
+         n_embd: 1024
+     first_stage_config:
+       target: taming.models.vqgan.VQModel
+       params:
+         ckpt_path: logs/2020-11-09T13-33-36_faceshq_vqgan/checkpoints/last.ckpt
+         embed_dim: 256
+         n_embed: 1024
+         ddconfig:
+           double_z: false
+           z_channels: 256
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 1
+           - 2
+           - 2
+           - 4
+           num_res_blocks: 2
+           attn_resolutions:
+           - 16
+           dropout: 0.0
+         lossconfig:
+           target: taming.modules.losses.DummyLoss
+     cond_stage_config:
+       target: taming.modules.misc.coord.CoordStage
+       params:
+         n_embed: 1024
+         down_factor: 16
+
+ data:
+   target: main.DataModuleFromConfig
+   params:
+     batch_size: 2
+     num_workers: 8
+     train:
+       target: taming.data.faceshq.FacesHQTrain
+       params:
+         size: 256
+         crop_size: 256
+         coord: True
+     validation:
+       target: taming.data.faceshq.FacesHQValidation
+       params:
+         size: 256
+         crop_size: 256
+         coord: True
taming-transformers/configs/faceshq_vqgan.yaml ADDED
@@ -0,0 +1,42 @@
+ model:
+   base_learning_rate: 4.5e-6
+   target: taming.models.vqgan.VQModel
+   params:
+     embed_dim: 256
+     n_embed: 1024
+     ddconfig:
+       double_z: False
+       z_channels: 256
+       resolution: 256
+       in_channels: 3
+       out_ch: 3
+       ch: 128
+       ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1
+       num_res_blocks: 2
+       attn_resolutions: [16]
+       dropout: 0.0
+
+     lossconfig:
+       target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
+       params:
+         disc_conditional: False
+         disc_in_channels: 3
+         disc_start: 30001
+         disc_weight: 0.8
+         codebook_weight: 1.0
+
+ data:
+   target: main.DataModuleFromConfig
+   params:
+     batch_size: 3
+     num_workers: 8
+     train:
+       target: taming.data.faceshq.FacesHQTrain
+       params:
+         size: 256
+         crop_size: 256
+     validation:
+       target: taming.data.faceshq.FacesHQValidation
+       params:
+         size: 256
+         crop_size: 256
taming-transformers/configs/imagenet_vqgan.yaml ADDED
@@ -0,0 +1,42 @@
+ model:
+   base_learning_rate: 4.5e-6
+   target: taming.models.vqgan.VQModel
+   params:
+     embed_dim: 256
+     n_embed: 1024
+     ddconfig:
+       double_z: False
+       z_channels: 256
+       resolution: 256
+       in_channels: 3
+       out_ch: 3
+       ch: 128
+       ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1
+       num_res_blocks: 2
+       attn_resolutions: [16]
+       dropout: 0.0
+
+     lossconfig:
+       target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
+       params:
+         disc_conditional: False
+         disc_in_channels: 3
+         disc_start: 250001
+         disc_weight: 0.8
+         codebook_weight: 1.0
+
+ data:
+   target: main.DataModuleFromConfig
+   params:
+     batch_size: 12
+     num_workers: 24
+     train:
+       target: taming.data.imagenet.ImageNetTrain
+       params:
+         config:
+           size: 256
+     validation:
+       target: taming.data.imagenet.ImageNetValidation
+       params:
+         config:
+           size: 256
taming-transformers/configs/imagenetdepth_vqgan.yaml ADDED
@@ -0,0 +1,41 @@
+ model:
+   base_learning_rate: 4.5e-6
+   target: taming.models.vqgan.VQModel
+   params:
+     embed_dim: 256
+     n_embed: 1024
+     image_key: depth
+     ddconfig:
+       double_z: False
+       z_channels: 256
+       resolution: 256
+       in_channels: 1
+       out_ch: 1
+       ch: 128
+       ch_mult: [ 1,1,2,2,4] # num_down = len(ch_mult)-1
+       num_res_blocks: 2
+       attn_resolutions: [16]
+       dropout: 0.0
+
+     lossconfig:
+       target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
+       params:
+         disc_conditional: False
+         disc_in_channels: 1
+         disc_start: 50001
+         disc_weight: 0.75
+         codebook_weight: 1.0
+
+ data:
+   target: main.DataModuleFromConfig
+   params:
+     batch_size: 3
+     num_workers: 8
+     train:
+       target: taming.data.imagenet.ImageNetTrainWithDepth
+       params:
+         size: 256
+     validation:
+       target: taming.data.imagenet.ImageNetValidationWithDepth
+       params:
+         size: 256
taming-transformers/configs/open_images_scene_images_transformer.yaml ADDED
@@ -0,0 +1,86 @@
+ model:
+   base_learning_rate: 4.5e-06
+   target: taming.models.cond_transformer.Net2NetTransformer
+   params:
+     cond_stage_key: objects_bbox
+     transformer_config:
+       target: taming.modules.transformer.mingpt.GPT
+       params:
+         vocab_size: 8192
+         block_size: 348 # = 256 + 92 = dim(vqgan_latent_space,16x16) + dim(conditional_builder.embedding_dim)
+         n_layer: 36
+         n_head: 16
+         n_embd: 1536
+         embd_pdrop: 0.1
+         resid_pdrop: 0.1
+         attn_pdrop: 0.1
+     first_stage_config:
+       target: taming.models.vqgan.VQModel
+       params:
+         ckpt_path: /path/to/coco_oi_epoch12.ckpt # https://heibox.uni-heidelberg.de/f/461d9a9f4fcf48ab84f4/
+         embed_dim: 256
+         n_embed: 8192
+         ddconfig:
+           double_z: false
+           z_channels: 256
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 1
+           - 2
+           - 2
+           - 4
+           num_res_blocks: 2
+           attn_resolutions:
+           - 16
+           dropout: 0.0
+         lossconfig:
+           target: taming.modules.losses.DummyLoss
+     cond_stage_config:
+       target: taming.models.dummy_cond_stage.DummyCondStage
+       params:
+         conditional_key: objects_bbox
+
+ data:
+   target: main.DataModuleFromConfig
+   params:
+     batch_size: 6
+     train:
+       target: taming.data.annotated_objects_open_images.AnnotatedObjectsOpenImages
+       params:
+         data_path: data/open_images_annotations_100 # substitute with path to full dataset
+         split: train
+         keys: [image, objects_bbox, file_name, annotations]
+         no_tokens: 8192
+         target_image_size: 256
+         category_allow_list_target: taming.data.open_images_helper.top_300_classes_plus_coco_compatibility
+         category_mapping_target: taming.data.open_images_helper.open_images_unify_categories_for_coco
+         min_object_area: 0.0001
+         min_objects_per_image: 2
+         max_objects_per_image: 30
+         crop_method: random-2d
+         random_flip: true
+         use_group_parameter: true
+         use_additional_parameters: true
+         encode_crop: true
+     validation:
+       target: taming.data.annotated_objects_open_images.AnnotatedObjectsOpenImages
+       params:
+         data_path: data/open_images_annotations_100 # substitute with path to full dataset
+         split: validation
+         keys: [image, objects_bbox, file_name, annotations]
+         no_tokens: 8192
+         target_image_size: 256
+         category_allow_list_target: taming.data.open_images_helper.top_300_classes_plus_coco_compatibility
+         category_mapping_target: taming.data.open_images_helper.open_images_unify_categories_for_coco
+         min_object_area: 0.0001
+         min_objects_per_image: 2
+         max_objects_per_image: 30
+         crop_method: center
+         random_flip: false
+         use_group_parameter: true
+         use_additional_parameters: true
+         encode_crop: true
taming-transformers/configs/sflckr_cond_stage.yaml ADDED
@@ -0,0 +1,43 @@
+ model:
+   base_learning_rate: 4.5e-06
+   target: taming.models.vqgan.VQSegmentationModel
+   params:
+     embed_dim: 256
+     n_embed: 1024
+     image_key: "segmentation"
+     n_labels: 182
+     ddconfig:
+       double_z: false
+       z_channels: 256
+       resolution: 256
+       in_channels: 182
+       out_ch: 182
+       ch: 128
+       ch_mult:
+       - 1
+       - 1
+       - 2
+       - 2
+       - 4
+       num_res_blocks: 2
+       attn_resolutions:
+       - 16
+       dropout: 0.0
+
+     lossconfig:
+       target: taming.modules.losses.segmentation.BCELossWithQuant
+       params:
+         codebook_weight: 1.0
+
+ data:
+   target: cutlit.DataModuleFromConfig
+   params:
+     batch_size: 12
+     train:
+       target: taming.data.sflckr.Examples # adjust
+       params:
+         size: 256
+     validation:
+       target: taming.data.sflckr.Examples # adjust
+       params:
+         size: 256
taming-transformers/data/ade20k_examples.txt ADDED
@@ -0,0 +1,30 @@
+ ADE_val_00000636.jpg
+ ADE_val_00000126.jpg
+ ADE_val_00001412.jpg
+ ADE_val_00001845.jpg
+ ADE_val_00001200.jpg
+ ADE_val_00001578.jpg
+ ADE_val_00000880.jpg
+ ADE_val_00000875.jpg
+ ADE_val_00000123.jpg
+ ADE_val_00001209.jpg
+ ADE_val_00000203.jpg
+ ADE_val_00001851.jpg
+ ADE_val_00001583.jpg
+ ADE_val_00000287.jpg
+ ADE_val_00001947.jpg
+ ADE_val_00000262.jpg
+ ADE_val_00000603.jpg
+ ADE_val_00000125.jpg
+ ADE_val_00001698.jpg
+ ADE_val_00001966.jpg
+ ADE_val_00000532.jpg
+ ADE_val_00001177.jpg
+ ADE_val_00000734.jpg
+ ADE_val_00001498.jpg
+ ADE_val_00001766.jpg
+ ADE_val_00000303.jpg
+ ADE_val_00000509.jpg
+ ADE_val_00000573.jpg
+ ADE_val_00000289.jpg
+ ADE_val_00001388.jpg
taming-transformers/data/ade20k_images/ADE_val_00000123.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00000125.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00000126.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00000203.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00000262.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00000287.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00000289.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00000303.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00000509.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00000532.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00000573.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00000603.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00000636.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00000734.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00000875.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00000880.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00001177.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00001200.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00001209.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00001388.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00001412.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00001498.jpg ADDED
taming-transformers/data/ade20k_images/ADE_val_00001578.jpg ADDED