Update index.html
Browse files- index.html +6 -52
index.html
CHANGED
@@ -36,52 +36,6 @@
|
|
36 |
</p>
|
37 |
</div>
|
38 |
</section>
|
39 |
-
|
40 |
-
<section class="section">
|
41 |
-
<div class="container">
|
42 |
-
<h2 class="title is-3">📊 Results</h2>
|
43 |
-
<div class="highlight-box">
|
44 |
-
<p><strong>✔️ Accuracy</strong></p>
|
45 |
-
<ul>
|
46 |
-
<li>Spearman’s ρ > 0.87 with human ground truth</li>
|
47 |
-
</ul>
|
48 |
-
</div>
|
49 |
-
<div class="highlight-box">
|
50 |
-
<p><strong>📈 Downstream LLM Training Impact</strong></p>
|
51 |
-
<ul>
|
52 |
-
<li>+7.2% benchmark performance improvement</li>
|
53 |
-
<li>+4.8% token retention compared to FineWeb2 heuristic filter</li>
|
54 |
-
<li>Reliable thresholding with 0.6 and 0.7 quantiles</li>
|
55 |
-
</ul>
|
56 |
-
</div>
|
57 |
-
<div class="highlight-box">
|
58 |
-
<p><strong>⚡ Annotation Speed</strong></p>
|
59 |
-
<ul>
|
60 |
-
<li>~11,000 docs/min (on A100 GPU, avg. 690 tokens per doc)</li>
|
61 |
-
</ul>
|
62 |
-
</div>
|
63 |
-
</div>
|
64 |
-
</section>
|
65 |
-
|
66 |
-
<section class="section">
|
67 |
-
<div class="container">
|
68 |
-
<h2 class="title is-3">📁 Available Artifacts</h2>
|
69 |
-
<div class="highlight-box">
|
70 |
-
<ul>
|
71 |
-
<li>📄 Ground truth annotations in <strong>35 languages</strong></li>
|
72 |
-
<li>🧠 Synthetic LLM-annotated dataset (<strong>14M+ documents</strong>)</li>
|
73 |
-
<li>🪶 Lightweight annotation models:
|
74 |
-
<ul>
|
75 |
-
<li>JQL-Gemma</li>
|
76 |
-
<li>JQL-Mistral</li>
|
77 |
-
<li>JQL-Llama</li>
|
78 |
-
</ul>
|
79 |
-
</li>
|
80 |
-
<li>🛠️ Training & inference scripts <em>(coming soon)</em></li>
|
81 |
-
</ul>
|
82 |
-
</div>
|
83 |
-
</div>
|
84 |
-
</section>
|
85 |
|
86 |
<section class="section">
|
87 |
<div class="container content">
|
@@ -104,15 +58,15 @@
|
|
104 |
<div class="container content">
|
105 |
<h2 class="title is-3">📊 Results</h2>
|
106 |
<ul>
|
107 |
-
<li><strong
|
108 |
-
<li><strong
|
109 |
<ul>
|
110 |
<li>+7.2% benchmark performance improvement</li>
|
111 |
<li>+4.8% token retention vs. FineWeb2 heuristic filter</li>
|
112 |
<li>Effective threshold strategies: 0.6 and 0.7 quantile</li>
|
113 |
</ul>
|
114 |
</li>
|
115 |
-
<li><strong
|
116 |
</ul>
|
117 |
</div>
|
118 |
</section>
|
@@ -121,9 +75,9 @@
|
|
121 |
<div class="container content">
|
122 |
<h2 class="title is-3">📁 Available Artifacts</h2>
|
123 |
<ul>
|
124 |
-
<li
|
125 |
-
<li
|
126 |
-
<li
|
127 |
<ul>
|
128 |
<li>JQL-Gemma</li>
|
129 |
<li>JQL-Mistral</li>
|
|
|
36 |
</p>
|
37 |
</div>
|
38 |
</section>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
<section class="section">
|
41 |
<div class="container content">
|
|
|
58 |
<div class="container content">
|
59 |
<h2 class="title is-3">📊 Results</h2>
|
60 |
<ul>
|
61 |
+
<li><strong>✔️ Accuracy:</strong> Spearman’s ρ > 0.87 with human ground truth</li>
|
62 |
+
<li><strong>📈 Downstream LLM Training:</strong>
|
63 |
<ul>
|
64 |
<li>+7.2% benchmark performance improvement</li>
|
65 |
<li>+4.8% token retention vs. FineWeb2 heuristic filter</li>
|
66 |
<li>Effective threshold strategies: 0.6 and 0.7 quantile</li>
|
67 |
</ul>
|
68 |
</li>
|
69 |
+
<li><strong>⚡ Annotation Speed:</strong> ~11,000 docs/min (A100 GPU, avg. 690 tokens)</li>
|
70 |
</ul>
|
71 |
</div>
|
72 |
</section>
|
|
|
75 |
<div class="container content">
|
76 |
<h2 class="title is-3">📁 Available Artifacts</h2>
|
77 |
<ul>
|
78 |
+
<li>📄 Ground truth annotations in 35 languages</li>
|
79 |
+
<li>🧠 Synthetic LLM-annotated dataset (14M+ documents)</li>
|
80 |
+
<li>🪶 Lightweight annotation models:
|
81 |
<ul>
|
82 |
<li>JQL-Gemma</li>
|
83 |
<li>JQL-Mistral</li>
|