# Smoke check for segment_batchalign: joins the rows of `chunk` into one
# unpunctuated word stream, builds gold boundary labels from the row ends,
# runs the segmenter on the stream and prints both label sequences.
import re

from segmentation import segment_batchalign

# Each row is treated as one gold segment; its last word gets label 1.
chunk = [
    "once a horse met elephant and then they saw a ball",
    "in a pool and then the horse tried to swim",
    "and get the ball they might be the same",
]

# Matches every character that is neither a word character nor whitespace.
# Compiled once at import time instead of on every clean_text() call.
_NON_WORD_CHARS = re.compile(r"[^\w\s]")


def clean_text(text):
    """Return *text* lowercased, with punctuation removed and ends stripped.

    Args:
        text: The raw string to normalise.

    Returns:
        The lowercased string with every character that is neither a word
        character nor whitespace deleted, and leading/trailing whitespace
        stripped.
    """
    return _NON_WORD_CHARS.sub("", text.lower()).strip()


word_sequence = []
gt_label_sequence = []
for row in chunk:
    cleaned = clean_text(row)
    words = cleaned.split()
    if not words:
        # A row with no words left after cleaning would otherwise contribute
        # a lone `1` label ([0] * -1 is empty) with no matching word, which
        # would misalign every following gold label.
        continue
    word_sequence.extend(words)
    # 0 for every word inside the row, 1 for the row's final word.
    gt_label_sequence.extend([0] * (len(words) - 1) + [1])

input_text = " ".join(word_sequence)
predicted_labels = segment_batchalign(input_text)

print("Word sequence:", input_text)
print("GT:", " ".join(map(str, gt_label_sequence)))
print("Pred:", " ".join(map(str, predicted_labels)))
# Reports whether the segmenter produced as many labels as there are gold labels.
print("Length match:", len(gt_label_sequence) == len(predicted_labels))