Spaces:
Sleeping
Sleeping
File size: 4,858 Bytes
91394e0 7a23964 91394e0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
import json
import re
import warnings
from pathlib import Path
from kanjiconv import KanjiConv
from pypinyin import lazy_pinyin
from .resources.pinyin_dict import PINYIN_DICT
# Module-wide KanjiConv instance; to_kana() calls its to_hiragana() method.
kanji_to_kana = KanjiConv()
# Small hiragana -> full-size counterpart.
# to_kana() tests membership in this table to decide whether a character is
# glued onto the one before it; replace_chouonpu() uses the values to find the
# vowel that a following long-vowel mark prolongs.
# NOTE(review): these literals were restored from a mis-decoded copy of the
# file (UTF-8 bytes rendered as TIS-620) - confirm against the upstream file.
yoon_map = {
    "ぁ": "あ",
    "ぃ": "い",
    "ぅ": "う",
    "ぇ": "え",
    "ぉ": "お",
    "ゃ": "や",
    "ゅ": "ゆ",
    "ょ": "よ",
    "ゎ": "わ",
}
# ACE_phonemes
# Load every phoneme plan from the bundled resources/all_plans.json and keep
# the first one whose "language" is "zh"; pinyin_to_phonemes_opencpop() reads
# its "dict" and "syllable_alias" tables.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open(
    Path(__file__).parent / "resources" / "all_plans.json", "r", encoding="utf-8"
) as f:
    ace_phonemes_all_plans = json.load(f)
for plan in ace_phonemes_all_plans["plans"]:
    if plan["language"] == "zh":
        ace_phonemes_zh_plan = plan
        break
def preprocess_text(text: str, language: str) -> list[str]:
    """Strip spaces from *text* and split it into per-language units.

    Args:
        text: Raw input text; every ASCII space is removed first.
        language: ``"mandarin"`` (handled by ``to_pinyin``) or ``"japanese"``
            (handled by ``to_kana``).

    Returns:
        The list produced by the language-specific converter.

    Raises:
        ValueError: If *language* is neither of the two supported values.
    """
    compact = text.replace(" ", "")
    if language == "mandarin":
        return to_pinyin(compact)
    if language == "japanese":
        return to_kana(compact)
    raise ValueError(f"Other languages are not supported")
def to_pinyin(text: str) -> list[str]:
    """Convert *text* to a flat list of pinyin syllables and special markers.

    The text is run through ``lazy_pinyin``. Any resulting item whose first
    character is ``"S"``, ``"A"`` or ``"-"`` is treated as a run of markers:
    only its ``"-"``, ``"AP"`` and ``"SP"`` occurrences are kept, each as a
    separate list entry. Every other item is kept unchanged.
    """
    tokens = []
    for syllable in lazy_pinyin(text):
        if syllable[0] in ("S", "A", "-"):
            # Split a fused marker run such as "APSP-" into its pieces.
            tokens.extend(re.findall(r"-|AP|SP", syllable))
        else:
            tokens.append(syllable)
    return tokens
# Hiragana grouped by the vowel they end in: (vowel, kana ending in it).
# Each string lists the plain kana, then the voiced/semi-voiced kana, then the
# small kana, so a long-vowel mark after any of them resolves to the vowel.
_CHOUONPU_VOWEL_GROUPS = (
    ("あ", "あかさたなはまやらわがざだばぱぁゃゎ"),
    ("い", "いきしちにひみりぎじぢびぴぃ"),
    ("う", "うくすつぬふむゆるぐずづぶぷゔぅゅ"),
    ("え", "えけせてねへめれげぜでべぺぇ"),
    ("お", "おこそとのほもよろをごぞどぼぽぉょ"),
)

# Flat lookup built once at import time: kana -> vowel to repeat.
_CHOUONPU_VOWELS = {
    kana: vowel for vowel, group in _CHOUONPU_VOWEL_GROUPS for kana in group
}


def replace_chouonpu(hiragana_text: str) -> str:
    """Replace each long-vowel mark "ー" with the vowel it prolongs.

    The mark is replaced by the vowel of the character emitted just before it
    (for example "らーめん" becomes "らあめん" and "きゃー" becomes "きゃあ").
    Consecutive marks keep repeating the same vowel.

    Args:
        hiragana_text: Text in hiragana, possibly containing "ー".

    Returns:
        The text with every resolvable "ー" substituted. A mark at the very
        start of the text is left unchanged. A mark after a character with no
        known vowel (such as "ん", "っ" or a non-kana character) repeats that
        character.
    """
    out: list[str] = []
    for char in hiragana_text:
        # `out` is empty only for the first character, where there is nothing
        # to prolong.
        if char == "ー" and out:
            prev_char = out[-1]
            out.append(_CHOUONPU_VOWELS.get(prev_char, prev_char))
        else:
            out.append(char)
    return "".join(out)
def to_kana(text: str) -> list[str]:
    """Convert Japanese *text* into a list of hiragana units.

    Spaces are removed, the text is converted with the module-level
    ``kanji_to_kana.to_hiragana``, long-vowel marks are expanded by
    ``replace_chouonpu``, and the result is split on spaces. Each piece is
    then scanned left to right: a character followed by a key of ``yoon_map``
    is emitted together with it as one two-character unit, and any other
    character is emitted on its own.

    Args:
        text: Input text; ASCII spaces are dropped before conversion.

    Returns:
        The units of all pieces, in order. Empty pieces contribute nothing.
    """
    hiragana_text = kanji_to_kana.to_hiragana(text.replace(" ", ""))
    units: list[str] = []
    for subword in replace_chouonpu(hiragana_text).split(" "):
        pos = 0
        length = len(subword)
        # Bounding the scan by `length` keeps it from indexing past the end
        # when a piece is empty or ends in a two-character unit.
        while pos < length:
            nxt = pos + 1
            if nxt < length and subword[nxt] in yoon_map:
                units.append(subword[pos : nxt + 1])
                pos += 2
            else:
                units.append(subword[pos])
                pos += 1
    return units
def kana_to_phonemes_openjtalk(kana: str) -> list[str]:
    """Convert a kana string to phonemes with ``pyopenjtalk.g2p``.

    A space is inserted between every character of *kana* before the call.
    Warnings emitted during the call are recorded instead of shown.

    Returns:
        The space-separated string returned by ``g2p``, split on spaces.

    Raises:
        ValueError: If any recorded warning message contains "No phoneme".
    """
    import pyopenjtalk

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # add space between each character
        kana = " ".join(kana)
        # phones is a str object separated by space
        phones = pyopenjtalk.g2p(kana, kana=False)
    for entry in caught:
        if "No phoneme" in str(entry.message):
            raise ValueError(f"No phoneme found for {kana}. {entry.message}")
    return phones.split(" ")
def pinyin_to_phonemes_opencpop(pinyin: str) -> list[str]:
    """Look up the phonemes of *pinyin* in the loaded "zh" plan.

    The lower-cased syllable is looked up in the plan's ``"dict"`` table
    first. If absent, it is resolved through ``"syllable_alias"`` and the
    aliased syllable is looked up in ``"dict"``.

    Raises:
        ValueError: If the syllable is in neither table.
    """
    syllable = pinyin.lower()
    phoneme_table = ace_phonemes_zh_plan["dict"]
    if syllable in phoneme_table:
        return phoneme_table[syllable]
    aliases = ace_phonemes_zh_plan["syllable_alias"]
    if syllable in aliases:
        return phoneme_table[aliases[syllable]]
    raise ValueError(f"{syllable} not registered in Opencpop phoneme dict")
def pinyin_to_phonemes_ace(pinyin: str) -> list[str]:
    """Look up the phonemes of *pinyin* in ``PINYIN_DICT``.

    The syllable is lower-cased before the lookup.

    Raises:
        ValueError: If the lower-cased syllable is not a key of
            ``PINYIN_DICT``.
    """
    syllable = pinyin.lower()
    if syllable not in PINYIN_DICT:
        raise ValueError(f"{syllable} not registered in ACE phoneme dict")
    return PINYIN_DICT[syllable]
|