# SATEv1.5 / test_eval_segmentation.py
# Shuwei Hou
# initial_for_hf
# 5806e12
# raw
# history blame contribute delete
# 848 Bytes
import re

from segmentation import segment_batchalign
# Sample rows for the segmentation check. Each string is treated as one
# ground-truth segment: the last word of every row is labelled 1 below.
chunk = [
    "once a horse met elephant and then they saw a ball",
    "in a pool and then the horse tried to swim",
    "and get the ball they might be the same",
]
def clean_text(text: str) -> str:
    """Return *text* lowercased, with punctuation removed and ends trimmed.

    Every character that is neither a word character (``\\w``: letters,
    digits, underscore) nor whitespace is deleted. Leading and trailing
    whitespace is then stripped; whitespace inside the string is left as is.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string (possibly empty).
    """
    lowered = text.lower()
    # Delete punctuation/symbols only; whitespace is kept so that a later
    # ``split()`` still sees the original word boundaries.
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    return without_punctuation.strip()
# Flatten the rows into a single word stream and build the matching
# ground-truth boundary labels: 1 on the final word of each row, 0 elsewhere.
word_sequence = []
gt_label_sequence = []
for row in chunk:
    cleaned = clean_text(row)
    words = cleaned.split()
    if not words:
        # A row that is empty after cleaning contributes no words, so it must
        # not contribute a boundary label either; otherwise the labels would
        # run one entry ahead of the words.
        continue
    word_sequence.extend(words)
    gt_label_sequence.extend([0] * (len(words) - 1) + [1])

input_text = " ".join(word_sequence)
# NOTE(review): segment_batchalign is presumably expected to return one label
# per word of input_text; the "Length match" line below reports whether the
# counts agree.
predicted_labels = segment_batchalign(input_text)

print("Word sequence:", input_text)
print("GT:", " ".join(map(str, gt_label_sequence)))
print("Pred:", " ".join(map(str, predicted_labels)))
print("Length match:", len(gt_label_sequence) == len(predicted_labels))