from segmentation import segment_batchalign

# Sample rows of text used as input for the script below. Each row is cleaned
# and split into words, and the final word of each row is labelled 1
# (every other word 0) in the ground-truth label sequence.
chunk = [
    "once a horse met elephant and then they saw a ball",
    "in a pool and then the horse tried to swim",
    "and get the ball they might be the same",
]

def clean_text(text):
    """Return *text* lowercased, stripped of punctuation and outer whitespace.

    Every character that is neither a regex word character (letters, digits,
    underscore) nor whitespace is deleted. Interior whitespace is left as-is,
    so removing a symbol that sat between two spaces leaves both spaces.
    """
    import re

    lowered = text.lower()
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    return without_punctuation.strip()

# Build two parallel sequences from the rows in `chunk`:
#   word_sequence      - every cleaned word, in order, across all rows
#   gt_label_sequence  - one label per word: 1 for the last word of a row,
#                        0 for every other word
word_sequence = []
gt_label_sequence = []

for row in chunk:
    cleaned = clean_text(row)
    words = cleaned.split()
    if not words:
        # A row that cleans down to nothing (empty or punctuation-only)
        # contributes no words, so it must not contribute a label either.
        # Without this guard `[0] * -1 + [1]` would append a stray 1 and
        # leave the labels out of step with the words.
        continue
    word_sequence.extend(words)
    gt_label_sequence.extend([0] * (len(words) - 1) + [1])

# The segmenter is given the whole chunk as a single space-joined string.
input_text = " ".join(word_sequence)
# NOTE(review): presumably returns one label per input word - confirm against
# segment_batchalign; the length check printed below relies on that.
predicted_labels = segment_batchalign(input_text)

print("Word sequence:", input_text)
print("GT:", " ".join(map(str, gt_label_sequence)))
print("Pred:", " ".join(map(str, predicted_labels)))


# Report whether the prediction has the same number of labels as the ground truth.
print("Length match:", len(gt_label_sequence) == len(predicted_labels))