|
import re

from segmentation import segment_batchalign
|
|
|
|
# Sample input: a list of raw text rows. The labelling loop further down
# treats the last word of each row as a segment boundary.
chunk = [
    "once a horse met elephant and then they saw a ball",
    "in a pool and then the horse tried to swim",
    "and get the ball they might be the same",
]
|
|
|
|
# Matches any single character that is neither a word character (letters,
# digits, underscore) nor whitespace. Compiled once at import time instead of
# being looked up on every call.
_NON_WORD_RE = re.compile(r"[^\w\s]")


def clean_text(text: str) -> str:
    """Return *text* lower-cased, with punctuation removed and ends trimmed.

    Every character that is not a word character or whitespace is deleted
    (not replaced by a space), so contractions collapse: "don't" -> "dont".
    Underscores count as word characters and are kept. Interior whitespace
    is left untouched; only leading and trailing whitespace is stripped.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string; empty if *text* held only punctuation
        and/or whitespace.
    """
    return _NON_WORD_RE.sub("", text.lower()).strip()
|
|
|
|
# Flatten the rows of `chunk` into a single word stream plus a parallel
# ground-truth label stream: 0 for a word inside a row, 1 for the last word
# of a row.
word_sequence = []
gt_label_sequence = []

for row in chunk:
    cleaned = clean_text(row)
    words = cleaned.split()
    if not words:
        # A row that is empty or pure punctuation contributes no words.
        # Without this guard `[0] * (0 - 1) + [1]` would still append a lone
        # 1, making the label list longer than the word list and misaligning
        # every label that follows.
        continue
    word_sequence.extend(words)
    gt_label_sequence.extend([0] * (len(words) - 1) + [1])
|
|
|
|
# Feed the whole word stream to the segmenter as one space-joined string.
input_text = " ".join(word_sequence)
predicted_labels = segment_batchalign(input_text)

# Print the words, the reference labels and the predictions on separate lines
# so they can be compared by eye.
print("Word sequence:", input_text)
print("GT:", " ".join(str(label) for label in gt_label_sequence))
print("Pred:", " ".join(str(label) for label in predicted_labels))

# Sanity check: presumably the segmenter returns one label per input word;
# this reports whether the two label sequences have the same length.
print("Length match:", len(predicted_labels) == len(gt_label_sequence))