Spaces:
Running
on
Zero
Running
on
Zero
def regroup_words(
    words: list[dict],
    max_len: float = 15.0,
    gap: float = 0.50,
) -> list[dict]:
    """
    Split a sequence of timed words into consecutive segments.

    A new segment is opened before a word when either
      * adding the word would make the segment span (word end minus segment
        start) exceed ``max_len``, or
      * the pause between the previous word's end and this word's start
        exceeds ``gap``.

    Args:
        words: Word dictionaries with keys 'word', 'start', 'end'
        max_len: Largest allowed segment span, in the unit of the timestamps
        gap: Largest allowed pause between two words of the same segment

    Returns:
        A list of segments with keys:
          'start'   - start of the first word in the segment
          'end'     - end of the last word in the segment
          'segment' - the segment's words joined by single spaces
    """
    if not words:
        return []

    # Collect the words of each segment first; the dicts are built at the end.
    groups: list[list[dict]] = [[]]
    for entry in words:
        current = groups[-1]
        if current:
            too_long = (entry["end"] - current[0]["start"]) > max_len
            paused = (entry["start"] - current[-1]["end"]) > gap
            if too_long or paused:
                current = []
                groups.append(current)
        current.append(entry)

    return [
        {
            "start": group[0]["start"],
            "end": group[-1]["end"],
            "segment": " ".join(item["word"] for item in group),
        }
        for group in groups
    ]
def text_to_words(text: str) -> list[dict]:
    """
    Convert text format like "word[start:end] word[start:end]..." to word list.

    A timestamp written as the literal 'xxx' is read as 0.0. Tokens whose
    timestamps are not valid numbers are skipped.

    Args:
        text: String in format "It's[4.96:5.52] a[5.52:5.84] long[5.84:6.16]..."

    Returns:
        List of word dictionaries with keys: 'word', 'start', 'end'
    """
    import re

    if not text.strip():
        return []

    # word[start:end]. The timestamp groups exclude whitespace, ':' and
    # brackets so that a malformed token (e.g. "foo[bar]") cannot stretch
    # across the following token and swallow it. Padding inside the brackets
    # is still tolerated.
    token_pattern = re.compile(
        r'(\S+?)\[\s*([^\s:\[\]]+)\s*:\s*([^\s:\[\]]+)\s*\]'
    )

    def parse_stamp(raw: str) -> float:
        # 'xxx' is the placeholder for an unknown timestamp.
        return 0.0 if raw == 'xxx' else float(raw)

    words = []
    for word, start_str, end_str in token_pattern.findall(text):
        try:
            start = parse_stamp(start_str)
            end = parse_stamp(end_str)
        except ValueError:
            # Skip invalid entries
            continue
        words.append({
            'word': word,
            'start': start,
            'end': end,
        })
    return words
def words_to_text(words: list[dict]) -> str:
    """
    Render a word list as "word[start:end] word[start:end]...".

    Timestamps are printed with at most two decimals; trailing zeros and a
    dangling decimal point are removed (5.50 -> "5.5", 6.00 -> "6").
    Missing keys fall back to '' for 'word' and 0.0 for 'start'/'end'.

    Args:
        words: List of word dictionaries with keys: 'word', 'start', 'end'

    Returns:
        String in format "It's[4.96:5.52] a[5.52:5.84] long[5.84:6.16]..."
    """
    if not words:
        return ""

    def compact(value: float) -> str:
        # Two decimals at most, then drop the padding zeros / lone '.'.
        return f"{value:.2f}".rstrip('0').rstrip('.')

    rendered = []
    for entry in words:
        label = entry.get('word', '')
        begin = compact(entry.get('start', 0.0))
        finish = compact(entry.get('end', 0.0))
        rendered.append(f"{label}[{begin}:{finish}]")
    return " ".join(rendered)
def json_to_text(json_data: dict) -> str:
    """
    Convert JSON lyrics data to text format for display.

    Only uses the 'word' layer from the JSON structure.
    Groups words into lines via regroup_words (max_len=5, gap=0.50, in the
    unit of the word timestamps) for better readability. Every word appears
    in exactly one line.

    Args:
        json_data: Dictionary with 'word' key containing list of word objects

    Returns:
        String with words grouped into lines separated by a blank line:
        "word[start:end] word[start:end]...\n\nword[start:end]..."
        Returns "" when json_data is not a dict, has no 'word' key, or the
        word list is empty.
    """
    if not isinstance(json_data, dict) or 'word' not in json_data:
        return ""
    words = json_data['word']
    if not words:
        return ""

    # Segment index-labelled stand-ins instead of the real words: each
    # segment's 'segment' string then lists exactly the indices of its
    # members. Re-deriving membership from the time range (as done before)
    # dropped zero-duration words and duplicated words that overlap a
    # neighbouring segment.
    stand_ins = [
        {"word": str(index), "start": item["start"], "end": item["end"]}
        for index, item in enumerate(words)
    ]
    segments = regroup_words(stand_ins, max_len=5, gap=0.50)

    # Convert each segment to text format
    segment_lines = []
    for seg in segments:
        seg_words = [words[int(index)] for index in seg["segment"].split()]
        if seg_words:
            segment_lines.append(words_to_text(seg_words))
    return '\n\n'.join(segment_lines)
def round_to_quarter_beats(beat_position: float) -> float:
    """Snap a beat position to the nearest multiple of 0.25 (for sample display)."""
    quarter_steps = round(beat_position * 4)
    return quarter_steps / 4
def beats_to_seconds(beat_position: float, bpm: float) -> float:
    """
    Convert a beat position to time in seconds at the given tempo.

    Raises:
        ZeroDivisionError: If bpm is 0.
    """
    scaled = beat_position * 60.0
    return scaled / bpm
def seconds_to_beats(time_seconds: float, bpm: float) -> float:
    """Convert a time in seconds to a beat position at the given tempo."""
    scaled = time_seconds * bpm
    return scaled / 60.0
def convert_text_time_to_beats(text: str, bpm: float, round_to_quarters: bool = False) -> str:
    """
    Rewrite "word[start_sec:end_sec] ..." text with beat positions.

    Args:
        text: String in format "word[start_sec:end_sec] ..."
        bpm: Beats per minute for conversion
        round_to_quarters: If True, round beats to quarter notes (for sample display)

    Returns:
        String in format "word[start_beat:end_beat] ..." on a single line,
        or "" for blank input.
    """
    if not text.strip():
        return ""

    def as_beat(seconds: float) -> float:
        beat = seconds_to_beats(seconds, bpm)
        # Round to quarter notes for sample display
        return round_to_quarter_beats(beat) if round_to_quarters else beat

    converted = [
        {
            'word': entry['word'],
            'start': as_beat(entry['start']),
            'end': as_beat(entry['end']),
        }
        for entry in text_to_words(text)
    ]
    # words_to_text applies the same two-decimal, zero-trimmed formatting.
    return words_to_text(converted)
def beats_to_text_with_regrouping(text: str, bpm: float, round_to_quarters: bool = False) -> str:
    """
    Convert time-based text to beats format with regrouping (like time mode).

    Words are grouped into lines via regroup_words using beat positions
    (max 20 beats per line, new line after a gap of more than 2 beats).
    Every parsed word appears in exactly one line, including words whose
    start and end coincide after rounding.

    Args:
        text: String in format "word[start_sec:end_sec] ..."
        bpm: Beats per minute for conversion
        round_to_quarters: If True, round beats to quarter notes (for sample display)

    Returns:
        String with beats format grouped into lines separated by a blank
        line, or "" when the input is blank or contains no parseable words.
    """
    if not text.strip():
        return ""

    # First convert to beats format
    beat_words = []
    for word in text_to_words(text):
        start_beat = seconds_to_beats(word['start'], bpm)
        end_beat = seconds_to_beats(word['end'], bpm)
        # Round to quarter notes for sample display
        if round_to_quarters:
            start_beat = round_to_quarter_beats(start_beat)
            end_beat = round_to_quarter_beats(end_beat)
        beat_words.append({
            'word': word['word'],
            'start': start_beat,
            'end': end_beat,
        })
    if not beat_words:
        return ""

    # Segment index-labelled stand-ins so each segment's 'segment' string
    # lists exactly the indices of its members. Matching words back by beat
    # range (as done before) lost zero-length words - common after quarter
    # rounding - and duplicated words overlapping a neighbouring segment.
    stand_ins = [
        {'word': str(index), 'start': item['start'], 'end': item['end']}
        for index, item in enumerate(beat_words)
    ]
    segments = regroup_words(stand_ins, max_len=20, gap=2.0)  # 20 beats max, 2 beat gap

    # Convert each segment to text format
    segment_lines = []
    for seg in segments:
        seg_words = [beat_words[int(index)] for index in seg['segment'].split()]
        if seg_words:
            # words_to_text formats these as word[beat:beat]
            segment_lines.append(words_to_text(seg_words))
    return '\n\n'.join(segment_lines)
def convert_text_beats_to_time(text: str, bpm: float) -> str:
    """
    Rewrite "word[start_beat:end_beat] ..." text with timestamps in seconds.

    Args:
        text: String in format "word[start_beat:end_beat] ..."
        bpm: Beats per minute for conversion

    Returns:
        String in format "word[start_sec:end_sec] ..." on a single line,
        or "" for blank input.
    """
    if not text.strip():
        return ""

    # The beats format uses the same word[a:b] syntax as the time format.
    timed = [
        {
            'word': entry['word'],
            'start': beats_to_seconds(entry['start'], bpm),
            'end': beats_to_seconds(entry['end'], bpm),
        }
        for entry in text_to_words(text)
    ]
    # words_to_text applies the same two-decimal, zero-trimmed formatting.
    return words_to_text(timed)
def convert_text_beats_to_time_with_regrouping(text: str, bpm: float) -> str:
    """
    Convert beats-based text to time-based text one line at a time.

    Blank lines are kept as empty lines. A non-blank line that contains no
    parseable word[start:end] token is left out of the result.

    Args:
        text: String in format "word[start_beat:end_beat] ..." (can be multi-line)
        bpm: Beats per minute for conversion

    Returns:
        String in format "word[start_sec:end_sec] ..." with preserved line breaks
    """
    if not text.strip():
        return ""

    output_lines = []
    for raw_line in text.split('\n'):
        content = raw_line.strip()
        if content:
            # Same per-word conversion and formatting as the single-line variant.
            converted = convert_text_beats_to_time(content, bpm)
            if converted:
                output_lines.append(converted)
        else:
            # Preserve empty lines
            output_lines.append("")
    return "\n".join(output_lines)
def text_to_json(text: str) -> dict:
    """
    Build the JSON structure expected by the model from text format.

    Only the 'word' layer is produced. Multi-line input is flattened first:
    lines are stripped, blank lines dropped, and the rest joined by spaces.

    Args:
        text: String in format "word[start:end] word[start:end]..." (can be multi-line)

    Returns:
        Dictionary with 'word' key containing list of word objects
    """
    stripped_lines = (raw.strip() for raw in text.split('\n'))
    flattened = ' '.join(piece for piece in stripped_lines if piece)
    return {"word": text_to_words(flattened)}