Update app.py
app.py CHANGED
Old version (removed lines marked with -):

@@ -2,23 +2,23 @@
 import os
 import gradio as gr
 import requests
 from io import BytesIO
 import matplotlib.pyplot as plt
-import tempfile
 from datasets import load_dataset
 from train_tokenizer import train_tokenizer
 from tokenizers import Tokenizer
 
 # Checkpointing settings
-CHECKPOINT_FILE = "checkpoint.txt"
-
 
 def fetch_splits(dataset_name):
     try:
-        response = requests.get(
-            f"https://datasets-server.huggingface.co/splits?dataset={dataset_name}",
-            timeout=10
-        )
         response.raise_for_status()
         data = response.json()
 
@@ -37,145 +37,67 @@ def fetch_splits(dataset_name):
     except Exception as e:
         raise gr.Error(f"Error while fetching the splits: {str(e)}")
 
-def update_components(dataset_name):
-    if not dataset_name:
-        return [gr.Textbox.update(value=""), gr.Dropdown.update(choices=[], value=None), gr.HTML.update(value="")]
-    try:
-        splits_data = fetch_splits(dataset_name)
-        config_choices = list(splits_data['splits'].keys())
-        first_config = config_choices[0] if config_choices else None
-        iframe_html = f"""
-        <iframe
-            src="{splits_data['viewer_template'].format(config=first_config, split='train')}"
-            frameborder="0"
-            width="100%"
-            height="560px"
-        ></iframe>
-        """ if first_config else "No available data found"
-        # Suggest Greek and English as the default for multiple configs
-        default_configs = "20231101.el,20231101.en" if first_config and "el" in first_config else first_config
-        return [
-            gr.Textbox.update(value=default_configs),
-            gr.Dropdown.update(choices=splits_data['splits'].get(first_config, [])),
-            gr.HTML.update(value=iframe_html)
-        ]
-    except Exception as e:
-        raise gr.Error(f"Error: {str(e)}")
-
-def update_split_choices(dataset_name, configs):
-    if not dataset_name or not configs:
-        return gr.Dropdown.update(choices=[])
-    try:
-        splits_data = fetch_splits(dataset_name)
-        first_config = configs.split(",")[0].strip()
-        return gr.Dropdown.update(choices=splits_data['splits'].get(first_config, []))
-    except:
-        return gr.Dropdown.update(choices=[])
-
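A side note on the two helpers removed above: they use the Gradio 3 component-update API (gr.Textbox.update, gr.Dropdown.update), which no longer exists in Gradio 4. A minimal sketch of one of them in the newer style, assuming a Gradio 4 runtime and the fetch_splits defined above:

import gradio as gr

def update_split_choices(dataset_name, configs):
    # Gradio 4 style: return gr.update(...) instead of gr.Dropdown.update(...)
    if not dataset_name or not configs:
        return gr.update(choices=[])
    try:
        splits_data = fetch_splits(dataset_name)  # helper defined in app.py above
        first_config = configs.split(",")[0].strip()
        return gr.update(choices=splits_data['splits'].get(first_config, []))
    except Exception:
        return gr.update(choices=[])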
 def create_iterator(dataset_name, configs, split):
-    """
-    For each config (comma-separated), load the corresponding streaming dataset and yield its texts.
-    """
     configs_list = [c.strip() for c in configs.split(",") if c.strip()]
     for config in configs_list:
         try:
-            dataset = load_dataset(
-                dataset_name,
-                name=config,
-                split=split,
-                streaming=True
-            )
             for example in dataset:
                 text = example.get('text', '')
                 if text:
                     yield text
         except Exception as e:
-            print(f"Dataset loading error for config {config}: {e}")
 
-def append_to_checkpoint(texts, checkpoint_file):
-    """
-    Append the texts to the checkpoint file.
-    """
-    with open(checkpoint_file, "a", encoding="utf-8") as f:
         for t in texts:
             f.write(t + "\n")
 
-def load_checkpoint(checkpoint_file):
-    """
-    Load the already-processed texts from the checkpoint file, if it exists.
-    """
-    if os.path.exists(checkpoint_file):
-        with open(checkpoint_file, "r", encoding="utf-8") as f:
             return f.read().splitlines()
     return []
 
-def train_and_test(dataset_name, configs, split, vocab_size, min_freq, test_text, custom_files):
-    """
-
-    Returns the final progress, the results and the plot at the end.
-    (Note: in this version there is no streaming progress because of Gradio limitations on Spaces.)
-    """
-    progress_messages = []
-    # Load already-processed data from the checkpoint (if any)
-    all_texts = load_checkpoint(CHECKPOINT_FILE)
-    total_processed = len(all_texts)
-    progress_messages.append(f"You have {total_processed} samples already stored in the checkpoint.")
 
-    chunk_count = 0
     dataset_iterator = create_iterator(dataset_name, configs, split)
-
     new_texts = []
-
-    # Process new data in chunks
     for text in dataset_iterator:
         new_texts.append(text)
         total_processed += 1
         if len(new_texts) >= CHUNK_SIZE:
-            append_to_checkpoint(new_texts, CHECKPOINT_FILE)
-            chunk_count += 1
-            progress_messages.append(f"Processed {total_processed} samples (chunk {chunk_count}).")
             new_texts = []
-    # Save the remaining data
-    if new_texts:
-        append_to_checkpoint(new_texts, CHECKPOINT_FILE)
-        total_processed += len(new_texts)
-        chunk_count += 1
-        progress_messages.append(f"Final chunk: {total_processed} samples saved in total.")
-
-    # Process the custom files, if any
-    if custom_files:
-        custom_texts = []
-        for file_path in custom_files:
-            try:
-                with open(file_path, 'r', encoding='utf-8') as f:
-                    content = f.read()
-                    if content:
-                        custom_texts.append(content)
-            except Exception as file_error:
-                progress_messages.append(f"Error reading file {file_path}: {file_error}")
-        if custom_texts:
-            append_to_checkpoint(custom_texts, CHECKPOINT_FILE)
-            total_processed += len(custom_texts)
-            progress_messages.append(f"Added {len(custom_texts)} samples from custom files.")
-
-    # Load all data for training
-    all_texts = load_checkpoint(CHECKPOINT_FILE)
-    progress_messages.append(f"Starting tokenizer training on {len(all_texts)} samples...")
 
     # Train the tokenizer
-
-
-
-
-
-    trained_tokenizer = Tokenizer.from_file(f.name)
-    os.unlink(f.name)
 
-    #
     encoded = trained_tokenizer.encode(test_text)
     decoded = trained_tokenizer.decode(encoded.ids)
 
-    #
     token_lengths = [len(t) for t in encoded.tokens]
     fig = plt.figure()
     plt.hist(token_lengths, bins=20)
@@ -185,73 +107,23 @@ def train_and_test(dataset_name, configs, split, vocab_size, min_freq, test_text, custom_files):
     plt.savefig(img_buffer, format='png')
     plt.close()
 
-    results = {
-        "Original Text": test_text,
-        "Decoded": decoded,
-        "Number of Tokens": len(encoded.tokens),
-        "Unknown Tokens": sum(1 for t in encoded.tokens if t == "<unk>")
-    }
-    progress_messages.append("Training complete!")
-
-    # Return the progress messages together with the final results and the plot
-    final_progress = "\n".join(progress_messages)
-    return final_progress, results, img_buffer.getvalue()
 
 # Gradio Interface
-with gr.Blocks() as demo:
     gr.Markdown("## Wikipedia Tokenizer Trainer with Checkpointing")
 
-
-
-
-
-
-
-
-
-
-
-
-
-        split = gr.Dropdown(
-            label="Split",
-            choices=["train"],
-            value="train",
-            allow_custom_value=True
-        )
-        vocab_size = gr.Slider(20000, 100000, value=50000, label="Vocabulary Size")
-        min_freq = gr.Slider(1, 100, value=3, label="Minimum Frequency")
-        test_text = gr.Textbox(
-            value="Η Ακρόπολη είναι σύμβολο της αρχαίας ελληνικής πολιτισμικής κληρονομιάς.",
-            label="Test Text"
-        )
-        custom_files = gr.File(
-            label="Custom Greek Texts",
-            file_count="multiple",
-            type="filepath"
-        )
-        train_btn = gr.Button("Train", variant="primary")
-        with gr.Column():
-            progress_box = gr.Textbox(label="Progress", interactive=False, lines=10)
-            results_json = gr.JSON(label="Results")
-            results_plot = gr.Image(label="Token Length Distribution")
-
-    # Event handlers
-    dataset_name.change(
-        fn=update_components,
-        inputs=dataset_name,
-        outputs=[configs, split, gr.HTML(label="Dataset Preview")]
-    )
-    split.change(
-        fn=update_split_choices,
-        inputs=[dataset_name, configs],
-        outputs=split
-    )
-    train_btn.click(
-        fn=train_and_test,
-        inputs=[dataset_name, configs, split, vocab_size, min_freq, test_text, custom_files],
-        outputs=[progress_box, results_json, results_plot]
-    )
 
-
-demo.launch()
New version (added lines marked with +):

@@ -2,23 +2,23 @@
 import os
 import gradio as gr
 import requests
+import tempfile
 from io import BytesIO
 import matplotlib.pyplot as plt
 from datasets import load_dataset
 from train_tokenizer import train_tokenizer
 from tokenizers import Tokenizer
 
 # Checkpointing settings
+CHECKPOINT_FILE = "checkpoint.txt"
+TOKENIZER_DIR = "tokenizer_model"
+TOKENIZER_FILE = os.path.join(TOKENIZER_DIR, "tokenizer.json")
+CHUNK_SIZE = 1000  # Batch size for checkpointing
 
 def fetch_splits(dataset_name):
+    """Fetch the dataset's splits from Hugging Face."""
     try:
+        response = requests.get(f"https://datasets-server.huggingface.co/splits?dataset={dataset_name}", timeout=10)
         response.raise_for_status()
         data = response.json()
 
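For reference, the datasets-server endpoint that fetch_splits queries returns JSON of roughly this shape (an abridged example based on the public API; the exact field set is an assumption):

# Approximate response of GET https://datasets-server.huggingface.co/splits?dataset=wikimedia/wikipedia
{
    "splits": [
        {"dataset": "wikimedia/wikipedia", "config": "20231101.el", "split": "train"},
        {"dataset": "wikimedia/wikipedia", "config": "20231101.en", "split": "train"}
    ],
    "pending": [],
    "failed": []
}

The elided body of fetch_splits (lines 24-36) evidently regroups this flat list into the per-config mapping that the callers index as splits_data['splits'].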
@@ -37,145 +37,67 @@ def fetch_splits(dataset_name):
     except Exception as e:
         raise gr.Error(f"Error while fetching the splits: {str(e)}")
 
 def create_iterator(dataset_name, configs, split):
+    """Load each dataset config and yield its texts as an iterator."""
     configs_list = [c.strip() for c in configs.split(",") if c.strip()]
     for config in configs_list:
         try:
+            dataset = load_dataset(dataset_name, name=config, split=split, streaming=True)
             for example in dataset:
                 text = example.get('text', '')
                 if text:
                     yield text
         except Exception as e:
+            print(f"⚠️ Dataset loading error for config {config}: {e}")
 
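A quick way to sanity-check the streaming iterator without downloading a full dump (a sketch, not part of the commit; it assumes network access to the Hub and the create_iterator defined above):

from itertools import islice

# Print the beginnings of the first three Greek articles.
for text in islice(create_iterator("wikimedia/wikipedia", "20231101.el", "train"), 3):
    print(text[:80])

Note that the configs are consumed sequentially: the first config's split is exhausted before the second one starts streaming.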
+def append_to_checkpoint(texts):
+    """Save data to the checkpoint file."""
+    with open(CHECKPOINT_FILE, "a", encoding="utf-8") as f:
         for t in texts:
             f.write(t + "\n")
 
+def load_checkpoint():
+    """Load data from the checkpoint, if it exists."""
+    if os.path.exists(CHECKPOINT_FILE):
+        with open(CHECKPOINT_FILE, "r", encoding="utf-8") as f:
             return f.read().splitlines()
     return []
 
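One subtlety of this checkpoint format: append_to_checkpoint writes one sample per line, while load_checkpoint reads the file back with splitlines(), so a sample that itself contains newlines (as Wikipedia articles do) comes back as several shorter samples. If an exact round-trip matters, a normalization step along these lines could be applied before appending (a hypothetical helper, not in app.py):

def normalize_for_checkpoint(text):
    # Collapse all internal whitespace, including newlines, so each sample
    # occupies exactly one line of the checkpoint file.
    return " ".join(text.split())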
+def train_and_test(dataset_name, configs, split, vocab_size, min_freq, test_text):
+    """Train the tokenizer and test it."""
+    print("🚀 Starting the training process...")
 
+    all_texts = load_checkpoint()
+    total_processed = len(all_texts)
+    print(f"📌 There are already {total_processed} samples in the checkpoint.")
+
     dataset_iterator = create_iterator(dataset_name, configs, split)
     new_texts = []
+
     for text in dataset_iterator:
         new_texts.append(text)
         total_processed += 1
         if len(new_texts) >= CHUNK_SIZE:
+            append_to_checkpoint(new_texts)
+            print(f"✅ Saved {total_processed} samples to the checkpoint.")
             new_texts = []
 
+    if new_texts:
+        append_to_checkpoint(new_texts)
+        print(f"✅ Final batch saved ({total_processed} samples).")
+
     # Train the tokenizer
+    all_texts = load_checkpoint()
+    tokenizer = train_tokenizer(all_texts, vocab_size, min_freq, TOKENIZER_DIR)
+
+    # Load the trained tokenizer
+    trained_tokenizer = Tokenizer.from_file(TOKENIZER_FILE)
 
+    # Test
     encoded = trained_tokenizer.encode(test_text)
     decoded = trained_tokenizer.decode(encoded.ids)
 
+    # Token length distribution plot
     token_lengths = [len(t) for t in encoded.tokens]
     fig = plt.figure()
     plt.hist(token_lengths, bins=20)
 
@@ -185,73 +107,23 @@ def train_and_test(dataset_name, configs, split, vocab_size, min_freq, test_text):
     plt.savefig(img_buffer, format='png')
     plt.close()
 
+    return f"✅ Training complete!\nSaved to folder: {TOKENIZER_DIR}", decoded, img_buffer.getvalue()
 
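The train_tokenizer helper imported at the top is not part of this diff. Judging from the call train_tokenizer(all_texts, vocab_size, min_freq, TOKENIZER_DIR) and the tokenizer.json loaded afterwards, it plausibly looks something like the following sketch built on the tokenizers library (an assumption, not the actual module):

# train_tokenizer.py - hypothetical sketch; the real module is not shown in this diff.
import os
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

def train_tokenizer(texts, vocab_size, min_freq, output_dir):
    """Train a BPE tokenizer on an iterable of strings and save it as tokenizer.json."""
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(
        vocab_size=int(vocab_size),
        min_frequency=int(min_freq),
        special_tokens=["<unk>", "<pad>"],
    )
    tokenizer.train_from_iterator(texts, trainer=trainer)
    os.makedirs(output_dir, exist_ok=True)
    tokenizer.save(os.path.join(output_dir, "tokenizer.json"))
    return tokenizer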
 # Gradio Interface
+with gr.Blocks() as demo:
     gr.Markdown("## Wikipedia Tokenizer Trainer with Checkpointing")
 
+    dataset_name = gr.Textbox(value="wikimedia/wikipedia", label="Dataset Name")
+    configs = gr.Textbox(value="20231101.el,20231101.en", label="Configs")
+    split = gr.Dropdown(choices=["train"], value="train", label="Split")
+    vocab_size = gr.Slider(20000, 100000, value=50000, label="Vocabulary Size")
+    min_freq = gr.Slider(1, 100, value=3, label="Minimum Frequency")
+    test_text = gr.Textbox(value="Η Ακρόπολη είναι σύμβολο της αρχαίας Ελλάδας.", label="Test Text")
+    train_btn = gr.Button("Train")
+    progress = gr.Textbox(label="Progress", interactive=False)
+    results_plot = gr.Image(label="Token Length Distribution")
+    download_button = gr.File(label="Download Tokenizer", value=TOKENIZER_FILE)
+
+    train_btn.click(train_and_test, [dataset_name, configs, split, vocab_size, min_freq, test_text], [progress, test_text, results_plot])
 
+demo.launch()
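Once training has written tokenizer_model/tokenizer.json, the file served by the Download Tokenizer component can be used anywhere the tokenizers library is installed, for example:

from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer_model/tokenizer.json")
enc = tok.encode("Η Ακρόπολη είναι σύμβολο της αρχαίας Ελλάδας.")
print(enc.tokens)
print(enc.ids)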