|
import json
import os
import re
|
|
""" |
|
"repetitions": [ |
|
{ |
|
"content": "now the now the", |
|
"words": [ |
|
1, |
|
2, |
|
3, |
|
4 |
|
], |
|
"mark_location": 4 |
|
} |
|
], |
|
""" |
|
def annotate_repetition_for_mazewhisper(session_id): |
|
    file_path = f"session_data/{session_id}/transcription_cunit.json"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for segment in data["segments"]:
        text_token = segment.get("text_token", "")
        tokens = text_token.split()

        repetitions_list = []
        inside = False
        rep_words = []
        rep_word_indices = []
        word_count = 0

        for tok in tokens:
            if tok == "<REPSTART>":
                inside = True
                rep_words, rep_word_indices = [], []
                continue
            elif tok == "<REPEND>":
                inside = False
                if rep_words:
                    # Defensive: drop any stray <...> tags before joining.
                    clean_rep_words = [w for w in rep_words
                                       if not (w.startswith("<") and w.endswith(">"))]
                    repetitions_list.append(
                        {
                            "content": " ".join(clean_rep_words),
                            "words": rep_word_indices.copy(),
                            "mark_location": rep_word_indices[-1],
                        }
                    )
                continue

            # Other <...> tags are not words: skip them without advancing
            # the word counter.
            if tok.startswith("<") and tok.endswith(">"):
                continue

            # Plain word: record it if we are inside a repetition span.
            if inside:
                rep_words.append(tok)
                rep_word_indices.append(word_count)
            word_count += 1

        if repetitions_list:
            segment["repetitions"] = repetitions_list

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
|
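# Worked example of the scan above on a hypothetical token string (not from
# real session data):
#
#   text_token = "he <REPSTART> now the now the <REPEND> goes"
#
# "he" is word 0; tags are skipped without advancing the counter, so the
# marked span yields exactly the annotation shown in the docstring:
#
#   {"content": "now the now the", "words": [1, 2, 3, 4], "mark_location": 4}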
|
|
""" |
|
"repetitions": [ |
|
{ |
|
"content": "now the now the", |
|
"words": [ |
|
1, |
|
2, |
|
3, |
|
4 |
|
], |
|
"mark_location": 4 |
|
} |
|
], |
|
""" |
|
def annotate_revision_for_mazewhisper(session_id): |
|
    file_path = f"session_data/{session_id}/transcription_cunit.json"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for segment in data["segments"]:
        text_token = segment.get("text_token", "")
        tokens = text_token.split()

        revisions_list = []
        inside = False
        rev_words = []
        rev_word_indices = []
        word_count = 0

        for tok in tokens:
            if tok == "<REVSTART>":
                inside = True
                rev_words, rev_word_indices = [], []
                continue
            elif tok == "<REVEND>":
                inside = False
                if rev_words:
                    # Defensive: drop any stray <...> tags before joining.
                    clean_rev_words = [w for w in rev_words
                                       if not (w.startswith("<") and w.endswith(">"))]
                    revisions_list.append(
                        {
                            "content": " ".join(clean_rev_words),
                            "words": rev_word_indices.copy(),
                            "mark_location": rev_word_indices[-1],
                        }
                    )
                continue

            # Other <...> tags are not words: skip them without advancing
            # the word counter.
            if tok.startswith("<") and tok.endswith(">"):
                continue

            # Plain word: record it if we are inside a revision span.
            if inside:
                rev_words.append(tok)
                rev_word_indices.append(word_count)
            word_count += 1

        if revisions_list:
            segment["revisions"] = revisions_list

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
|
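# The scan is identical to the repetition pass; only the markers and the
# output key differ. A hypothetical token string such as
#
#   text_token = "we <REVSTART> he went <REVEND> she went home"
#
# would yield {"content": "he went", "words": [1, 2], "mark_location": 2}
# under the "revisions" key.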
|
|
""" |
|
"pauses": [ |
|
{ |
|
"start": 364.08, |
|
"end": 369.1, |
|
"duration": 5.02 |
|
}, |
|
{ |
|
"start": 369.18, |
|
"end": 369.56, |
|
"duration": 0.38 |
|
} |
|
], |
|
|
|
""" |
|
def annotate_pause_for_mazewhisper(session_id): |
|
|
|
    file_path = f"session_data/{session_id}/transcription_cunit.json"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for i, segment in enumerate(data["segments"]):
        text_token = segment.get("text_token", "")
        words = segment.get("words", [])
        pauses_list = []

        if "<PAUSE>" in text_token:
            tokens = text_token.split()

            for j, token in enumerate(tokens):
                if token != "<PAUSE>":
                    continue

                # Count the plain words before this <PAUSE>; the pause
                # starts where the preceding word ends, or at the end of
                # the previous segment if the pause is segment-initial.
                word_idx = 0
                for k in range(j):
                    if not (tokens[k].startswith("<") and tokens[k].endswith(">")):
                        word_idx += 1

                start_time = None
                if 0 < word_idx <= len(words):
                    start_time = words[word_idx - 1].get("end")
                elif i > 0:
                    start_time = data["segments"][i - 1].get("end")

                # The pause ends where the next word starts, or at the
                # start of the next segment (falling back to this
                # segment's own end time) if the pause is segment-final.
                next_word_idx = 0
                for k in range(j + 1, len(tokens)):
                    if not (tokens[k].startswith("<") and tokens[k].endswith(">")):
                        next_word_idx = word_idx + 1
                        break

                end_time = None
                if 0 < next_word_idx <= len(words):
                    end_time = words[next_word_idx - 1].get("start")
                elif i < len(data["segments"]) - 1:
                    end_time = data["segments"][i + 1].get("start")
                else:
                    end_time = segment.get("end")

                if start_time is not None and end_time is not None:
                    pauses_list.append(
                        {
                            "start": start_time,
                            "end": end_time,
                            "duration": round(end_time - start_time, 2),
                        }
                    )

        if pauses_list:
            segment["pauses"] = pauses_list

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
|
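# Worked example of the timing lookup above (hypothetical segment):
#
#   text_token = "so <PAUSE> then"
#   words      = [{"word": "so", "end": 364.08},
#                 {"word": "then", "start": 369.1}]
#
# The pause starts at the end of "so" (364.08) and ends at the start of
# "then" (369.1), giving {"start": 364.08, "end": 369.1, "duration": 5.02}.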
|
|
""" |
|
"fillerwords": [ |
|
{ |
|
"start": , |
|
"end": , |
|
"content": "", |
|
"duration": |
|
} |
|
], |
|
""" |
|
|
|
|
|
|
|
def annotate_fillerword_for_mazewhisper(session_id): |
|
|
|
    file_path = f"session_data/{session_id}/transcription_cunit.json"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for segment in data["segments"]:
        text_token = segment.get("text_token", "")
        words = segment.get("words", [])
        text = segment.get("text", "")
        fillerwords_list = []

        if "<FILLER>" in text_token:
            # One placeholder entry per <FILLER>; timing and content stay
            # null/empty because the tag itself carries no timestamps.
            filler_count = text_token.count("<FILLER>")
            for _ in range(filler_count):
                fillerwords_list.append(
                    {
                        "start": None,
                        "end": None,
                        "content": "",
                        "duration": None,
                    }
                )

            # Make each filler visible as a word in the token string.
            segment["text_token"] = text_token.replace("<FILLER>", "um <FILLER>")

            # Locate each <FILLER> by the number of plain words before it,
            # scanning the original (unmodified) token string.
            tokens = text_token.split()
            text_words = text.split()
            filler_positions = []
            word_count = 0
            for token in tokens:
                if token == "<FILLER>":
                    filler_positions.append(word_count)
                elif not (token.startswith("<") and token.endswith(">")):
                    word_count += 1

            # Insert from right to left so earlier positions stay valid.
            for pos in reversed(filler_positions):
                text_words.insert(pos, "um")
                um_word = {
                    "word": "um",
                    "start": None,
                    "end": None,
                }
                if pos <= len(words):
                    words.insert(pos, um_word)

            segment["text"] = " ".join(text_words)
            segment["words"] = words
            segment["fillerwords"] = fillerwords_list

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
|
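# Worked example of the splice above (hypothetical segment):
#
#   text_token = "well <FILLER> anyway"
#   text       = "well anyway"
#
# The filler sits after one plain word, so "um" is inserted at position 1:
# text becomes "well um anyway", text_token becomes "well um <FILLER> anyway",
# and a matching {"word": "um", "start": None, "end": None} entry is spliced
# into the segment's "words" list.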
|
|
def annotate_maze_for_mazewhisper(session_id):
    """Run all maze annotation passes on one session's transcription."""
    annotate_fillerword_for_mazewhisper(session_id)
    annotate_repetition_for_mazewhisper(session_id)
    annotate_revision_for_mazewhisper(session_id)
    annotate_pause_for_mazewhisper(session_id)

    print("Maze annotation completed!")
|
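if __name__ == "__main__":
    # Minimal usage sketch. "demo_session" is a hypothetical session id;
    # it assumes session_data/demo_session/transcription_cunit.json exists.
    annotate_maze_for_mazewhisper("demo_session")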