Commit bad8eb8
Parent(s): 9f824d8

model improved

Files changed:
- config.json +0 -0
- maker.py +5 -8
- pytorch_model.bin +2 -2
- tokenizer_config.json +1 -1
- ud.py +1 -1
config.json
CHANGED
The diff for this file is too large to render. See raw diff.
maker.py
CHANGED
@@ -1,9 +1,9 @@
 #! /usr/bin/python3
-import os
+import os
 src="99eren99/ModernBERT-base-Turkish-uncased-mlm"
 tgt="KoichiYasuoka/modernbert-base-turkish-ud-embeds"
 url="https://github.com/UniversalDependencies/UD_Turkish-"
-for e in ["Kenet","Penn"
+for e in ["Kenet","Penn"]:
   u=url+e
   d=os.path.basename(u)
   os.system("test -d "+d+" || git clone --depth=1 "+u)
@@ -41,14 +41,11 @@ class UDEmbedsDataset(object):
   __len__=lambda self:(len(self.seeks)-1)*2
   def __getitem__(self,i):
     self.conllu.seek(self.seeks[int(i/2)])
-    z,c,t
+    z,c,t=i%2,[],[""]
     while t[0]!="\n":
       t=self.conllu.readline().split("\t")
       if len(t)==10 and t[0].isdecimal():
-        if s:
-          t[1]=" "+t[1]
         c.append(t)
-        s=t[9].find("SpaceAfter=No")<0
     x=[True if t[6]=="0" or int(t[6])>j or sum([1 if int(c[i][6])==j+1 else 0 for i in range(j+1,len(c))])>0 else False for j,t in enumerate(c)]
     v=self.tokenizer([t[1] for t in c],add_special_tokens=False)["input_ids"]
     if z==0:
@@ -107,8 +104,8 @@ trainDS=UDEmbedsDataset("train.conllu",tkz)
 devDS=UDEmbedsDataset("dev.conllu",tkz)
 testDS=UDEmbedsDataset("test.conllu",tkz)
 lid=trainDS(devDS,testDS)
-cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()}
-mdl=AutoModelForTokenClassification.from_pretrained(src,config=cfg
+cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()})
+mdl=AutoModelForTokenClassification.from_pretrained(src,config=cfg)
 trainDS.embeddings=mdl.get_input_embeddings().weight
 arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=1,dataloader_pin_memory=False,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
 trn=Trainer(args=arg,data_collator=DefaultDataCollator(),model=mdl,train_dataset=trainDS)
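For context on the hunks above: maker.py clones the UD_Turkish-Kenet and UD_Turkish-Penn treebanks, builds a label index (lid) from them, and fine-tunes the source model for token classification with output_dir=tgt. A minimal sketch of loading that checkpoint afterwards — not part of this commit, and it assumes the trained weights were uploaded to the tgt repository; only standard transformers calls are used:

# Sketch: load the fine-tuned checkpoint that maker.py trains under `tgt`.
from transformers import AutoTokenizer, AutoModelForTokenClassification

repo = "KoichiYasuoka/modernbert-base-turkish-ud-embeds"  # the `tgt` id in maker.py
tkz = AutoTokenizer.from_pretrained(repo)
mdl = AutoModelForTokenClassification.from_pretrained(repo)

# id2label was populated from the label index built over the UD treebanks,
# so each class id maps back to a UD label string.
print(mdl.config.id2label)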
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:f4b0b45b340abd523b7ddc8bd994aa6dbc93ac6bebe7c5a0eda8606ebba46500
+size 555858738
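The Git LFS pointer now records the object id and size of the retrained weights. A quick way to check a locally downloaded pytorch_model.bin against this pointer — a sketch, assuming the file has already been fetched into the current directory:

# Sketch: verify a downloaded pytorch_model.bin against the LFS pointer above.
import hashlib, os

expected_oid = "f4b0b45b340abd523b7ddc8bd994aa6dbc93ac6bebe7c5a0eda8606ebba46500"
expected_size = 555858738

path = "pytorch_model.bin"  # assumed local path
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)

assert os.path.getsize(path) == expected_size, "size does not match the pointer"
assert h.hexdigest() == expected_oid, "sha256 does not match the pointer"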
tokenizer_config.json
CHANGED
@@ -48,11 +48,11 @@
   "extra_special_tokens": {},
   "mask_token": "[MASK]",
   "max_len": 999999999,
-  "model_max_length": 999999999,
   "model_input_names": [
     "input_ids",
     "attention_mask"
   ],
+  "model_max_length": 999999999,
   "never_split": null,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
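The only change here moves "model_max_length" after the "model_input_names" list; its value (999999999) is unchanged. A small sketch to confirm the loaded value, assuming the repository id from maker.py:

# Sketch: the reordered key should not change what the tokenizer reports.
from transformers import AutoTokenizer

tkz = AutoTokenizer.from_pretrained("KoichiYasuoka/modernbert-base-turkish-ud-embeds")
print(tkz.model_max_length)  # expected: 999999999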
ud.py
CHANGED
@@ -77,7 +77,7 @@ class UniversalDependenciesPipeline(BellmanFordTokenClassificationPipeline):
       if d.strip()=="":
         off.pop(i)
         w.pop(i)
-    v=self.tokenizer([t["text"] for t in w],add_special_tokens=False)
+    v=self.tokenizer([t["text"].strip() for t in w],add_special_tokens=False)
     x=[not t["entity_group"].endswith(".") for t in w]
     if len(x)<127:
       x=[True]*len(x)
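This change strips surrounding whitespace from each word text before it is re-tokenized inside the pipeline. A small check of why that can matter — a sketch, assuming nothing beyond the tokenizer published with the model:

# Sketch: compare ids for a word with and without surrounding whitespace.
from transformers import AutoTokenizer

tkz = AutoTokenizer.from_pretrained("KoichiYasuoka/modernbert-base-turkish-ud-embeds")
for s in ["merhaba", " merhaba "]:
    print(repr(s), tkz(s, add_special_tokens=False)["input_ids"])
# If the two id sequences differ, word texts taken from pipeline output (which
# can carry leading or trailing spaces) would be encoded inconsistently;
# the .strip() added above normalizes them before tokenization.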