mali90 committed on
Commit
5853b04
·
verified ·
1 Parent(s): 42c2c6f

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +6 -52
index.html CHANGED
@@ -36,52 +36,6 @@
36
  </p>
37
  </div>
38
  </section>
39
-
40
- <section class="section">
41
- <div class="container">
42
- <h2 class="title is-3">📊 Results</h2>
43
- <div class="highlight-box">
44
- <p><strong>✔️ Accuracy</strong></p>
45
- <ul>
46
- <li>Spearman’s ρ > 0.87 with human ground truth</li>
47
- </ul>
48
- </div>
49
- <div class="highlight-box">
50
- <p><strong>📈 Downstream LLM Training Impact</strong></p>
51
- <ul>
52
- <li>+7.2% benchmark performance improvement</li>
53
- <li>+4.8% token retention compared to FineWeb2 heuristic filter</li>
54
- <li>Reliable thresholding with 0.6 and 0.7 quantiles</li>
55
- </ul>
56
- </div>
57
- <div class="highlight-box">
58
- <p><strong>⚡ Annotation Speed</strong></p>
59
- <ul>
60
- <li>~11,000 docs/min (on A100 GPU, avg. 690 tokens per doc)</li>
61
- </ul>
62
- </div>
63
- </div>
64
- </section>
65
-
66
- <section class="section">
67
- <div class="container">
68
- <h2 class="title is-3">📁 Available Artifacts</h2>
69
- <div class="highlight-box">
70
- <ul>
71
- <li>📄 Ground truth annotations in <strong>35 languages</strong></li>
72
- <li>🧠 Synthetic LLM-annotated dataset (<strong>14M+ documents</strong>)</li>
73
- <li>🪶 Lightweight annotation models:
74
- <ul>
75
- <li>JQL-Gemma</li>
76
- <li>JQL-Mistral</li>
77
- <li>JQL-Llama</li>
78
- </ul>
79
- </li>
80
- <li>🛠️ Training & inference scripts <em>(coming soon)</em></li>
81
- </ul>
82
- </div>
83
- </div>
84
- </section>
85
 
86
  <section class="section">
87
  <div class="container content">
@@ -104,15 +58,15 @@
104
  <div class="container content">
105
  <h2 class="title is-3">📊 Results</h2>
106
  <ul>
107
- <li><strong>Accuracy:</strong> Spearman’s ρ > 0.87 with human ground truth</li>
108
- <li><strong>Downstream LLM Training:</strong>
109
  <ul>
110
  <li>+7.2% benchmark performance improvement</li>
111
  <li>+4.8% token retention vs. FineWeb2 heuristic filter</li>
112
  <li>Effective threshold strategies: 0.6 and 0.7 quantile</li>
113
  </ul>
114
  </li>
115
- <li><strong>Annotation Speed:</strong> ~11,000 docs/min (A100 GPU, avg. 690 tokens)</li>
116
  </ul>
117
  </div>
118
  </section>
@@ -121,9 +75,9 @@
121
  <div class="container content">
122
  <h2 class="title is-3">📁 Available Artifacts</h2>
123
  <ul>
124
- <li>✅ Ground truth annotations in 35 languages</li>
125
- <li>✅ Synthetic LLM-annotated dataset (14M+ documents)</li>
126
- <li>✅ Lightweight annotation models:
127
  <ul>
128
  <li>JQL-Gemma</li>
129
  <li>JQL-Mistral</li>
 
36
  </p>
37
  </div>
38
  </section>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  <section class="section">
41
  <div class="container content">
 
58
  <div class="container content">
59
  <h2 class="title is-3">📊 Results</h2>
60
  <ul>
61
+ <li><strong>✔️ Accuracy:</strong> Spearman’s ρ > 0.87 with human ground truth</li>
62
+ <li><strong>📈 Downstream LLM Training:</strong>
63
  <ul>
64
  <li>+7.2% benchmark performance improvement</li>
65
  <li>+4.8% token retention vs. FineWeb2 heuristic filter</li>
66
  <li>Effective threshold strategies: 0.6 and 0.7 quantile</li>
67
  </ul>
68
  </li>
69
+ <li><strong>⚡ Annotation Speed:</strong> ~11,000 docs/min (A100 GPU, avg. 690 tokens)</li>
70
  </ul>
71
  </div>
72
  </section>
 
75
  <div class="container content">
76
  <h2 class="title is-3">📁 Available Artifacts</h2>
77
  <ul>
78
+ <li>📄 Ground truth annotations in 35 languages</li>
79
+ <li>🧠 Synthetic LLM-annotated dataset (14M+ documents)</li>
80
+ <li>🪶 Lightweight annotation models:
81
  <ul>
82
  <li>JQL-Gemma</li>
83
  <li>JQL-Mistral</li>