KoichiYasuoka committed
Commit bad8eb8 · 1 Parent(s): 9f824d8

model improved

Files changed (5)
  1. config.json +0 -0
  2. maker.py +5 -8
  3. pytorch_model.bin +2 -2
  4. tokenizer_config.json +1 -1
  5. ud.py +1 -1
config.json CHANGED
The diff for this file is too large to render. See raw diff
 
maker.py CHANGED
@@ -1,9 +1,9 @@
 #! /usr/bin/python3
-import os,json
+import os
 src="99eren99/ModernBERT-base-Turkish-uncased-mlm"
 tgt="KoichiYasuoka/modernbert-base-turkish-ud-embeds"
 url="https://github.com/UniversalDependencies/UD_Turkish-"
-for e in ["Kenet","Penn","BOUN","Tourism","IMST","Atis","FrameNet"]:
+for e in ["Kenet","Penn"]:
   u=url+e
   d=os.path.basename(u)
   os.system("test -d "+d+" || git clone --depth=1 "+u)
@@ -41,14 +41,11 @@ class UDEmbedsDataset(object):
   __len__=lambda self:(len(self.seeks)-1)*2
   def __getitem__(self,i):
     self.conllu.seek(self.seeks[int(i/2)])
-    z,c,t,s=i%2,[],[""],False
+    z,c,t=i%2,[],[""]
     while t[0]!="\n":
       t=self.conllu.readline().split("\t")
       if len(t)==10 and t[0].isdecimal():
-        if s:
-          t[1]=" "+t[1]
         c.append(t)
-        s=t[9].find("SpaceAfter=No")<0
     x=[True if t[6]=="0" or int(t[6])>j or sum([1 if int(c[i][6])==j+1 else 0 for i in range(j+1,len(c))])>0 else False for j,t in enumerate(c)]
     v=self.tokenizer([t[1] for t in c],add_special_tokens=False)["input_ids"]
     if z==0:
@@ -107,8 +104,8 @@ trainDS=UDEmbedsDataset("train.conllu",tkz)
 devDS=UDEmbedsDataset("dev.conllu",tkz)
 testDS=UDEmbedsDataset("test.conllu",tkz)
 lid=trainDS(devDS,testDS)
-cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True,trust_remote_code=True)
-mdl=AutoModelForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True,trust_remote_code=True)
+cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()})
+mdl=AutoModelForTokenClassification.from_pretrained(src,config=cfg)
 trainDS.embeddings=mdl.get_input_embeddings().weight
 arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=1,dataloader_pin_memory=False,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
 trn=Trainer(args=arg,data_collator=DefaultDataCollator(),model=mdl,train_dataset=trainDS)
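
With ignore_mismatched_sizes and trust_remote_code dropped from the from_pretrained calls, the source model is loaded through the stock ModernBERT support in recent transformers releases. A minimal sanity-check sketch for the retrained checkpoint, assuming it is published under the tgt repo id above; the example sentence and the raw per-subword printout are illustrative assumptions, not part of maker.py:

import torch
from transformers import AutoTokenizer,AutoModelForTokenClassification
mdl_id="KoichiYasuoka/modernbert-base-turkish-ud-embeds"  # tgt from maker.py
tkz=AutoTokenizer.from_pretrained(mdl_id)
mdl=AutoModelForTokenClassification.from_pretrained(mdl_id)  # assumes built-in ModernBERT support in transformers
enc=tkz("Ankara'ya gidiyorum",return_tensors="pt")  # hypothetical example sentence
with torch.no_grad():
  logits=mdl(**enc).logits[0]
for tok,i in zip(tkz.convert_ids_to_tokens(enc["input_ids"][0].tolist()),logits.argmax(-1).tolist()):
  print(tok,mdl.config.id2label[i])  # raw per-subword labels, before any pipeline post-processing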
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1870ef3ec1d236b23d48f21a0cdecc78dc6cae298e961bf10ee12e1dcbfabd48
-size 592177074
+oid sha256:f4b0b45b340abd523b7ddc8bd994aa6dbc93ac6bebe7c5a0eda8606ebba46500
+size 555858738
tokenizer_config.json CHANGED
@@ -48,11 +48,11 @@
   "extra_special_tokens": {},
   "mask_token": "[MASK]",
   "max_len": 999999999,
-  "model_max_length": 999999999,
   "model_input_names": [
     "input_ids",
     "attention_mask"
   ],
+  "model_max_length": 999999999,
   "never_split": null,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
ud.py CHANGED
@@ -77,7 +77,7 @@ class UniversalDependenciesPipeline(BellmanFordTokenClassificationPipeline):
       if d.strip()=="":
         off.pop(i)
         w.pop(i)
-    v=self.tokenizer([t["text"] for t in w],add_special_tokens=False)
+    v=self.tokenizer([t["text"].strip() for t in w],add_special_tokens=False)
     x=[not t["entity_group"].endswith(".") for t in w]
     if len(x)<127:
       x=[True]*len(x)
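
The added .strip() presumably complements the SpaceAfter=No handling removed from maker.py above: with many subword tokenizers a leading space changes the resulting ids, so stripping the surface form keeps inference tokenization in line with the retrained model. A small illustrative comparison, using the src tokenizer from maker.py (the exact ids are tokenizer-dependent and not asserted by this commit):

from transformers import AutoTokenizer
tkz=AutoTokenizer.from_pretrained("99eren99/ModernBERT-base-Turkish-uncased-mlm")  # src from maker.py
print(tkz("kitap",add_special_tokens=False)["input_ids"])
print(tkz(" kitap",add_special_tokens=False)["input_ids"])  # may differ because of the leading space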