Spaces:
Runtime error
Runtime error
File size: 5,264 Bytes
55d736f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
import os
import re
from pythainlp import word_tokenize
# Request UTF-8 for Python's standard streams via the PYTHONIOENCODING variable.
# NOTE(review): CPython reads PYTHONIOENCODING at interpreter startup, so this
# assignment presumably only affects child processes that inherit the
# environment, not the interpreter that is already running -- confirm that this
# is the intended effect.
os.environ['PYTHONIOENCODING'] = 'utf-8'
# Thai-script spellings for a fixed vocabulary of common English words.
_ENGLISH_WORD_TO_THAI = {
    "today": "ทูเด",
    "hello": "เฮลโล",
    "world": "เวิลด์",
    "computer": "คอมพิวเตอร์",
    "phone": "โฟน",
    "school": "สคูล",
    "teacher": "ทีเชอร์",
    "student": "สตูเดนท์",
    "apple": "แอปเปิล",
    "orange": "ออเรนจ์",
    "table": "เทเบิล",
    "chair": "แชร์",
    "window": "วินโดว์",
    "door": "ดอร์",
    "water": "วอเทอร์",
    "coffee": "คอฟฟี่",
    "milk": "มิลค์",
    "juice": "จูซ",
    "food": "ฟูด",
    "car": "คาร์",
    "bus": "บัส",
    "train": "เทรน",
    "airplane": "แอร์เพลน",
    "boat": "โบ๊ท",
    "dog": "ด็อก",
    "cat": "แคท",
    "bird": "เบิร์ด",
    "fish": "ฟิช",
    "house": "เฮ้าส์",
    "city": "ซิตี้",
    "country": "คันทรี",
    "family": "แฟมิลี",
    "friend": "เฟรนด์",
    "love": "เลิฟ",
    "happiness": "แฮปปิเนส",
    "sadness": "แซดเนส",
    "anger": "แองเกอร์",
    "smile": "สไมล์",
    "cry": "คราย",
    "laugh": "ลาฟ",
    "light": "ไลท์",
    "dark": "ดาร์ก",
    "sun": "ซัน",
    "moon": "มูน",
    "star": "สตาร์",
    "ocean": "โอเชียน",
    "mountain": "เมาเทน",
    "river": "ริเวอร์",
    "forest": "ฟอเรสต์",
    "i": "ไอ",
    "you": "ยู",
    "talk": "ทอล์ก",
    "sing": "ซิง",
    "dance": "แดนซ์",
    "read": "รีด",
    "write": "ไรท์",
    "run": "รัน",
    "walk": "วอล์ค",
    "jump": "จัมป์",
    "swim": "สวิม",
    "eat": "อีท",
    "drink": "ดริงค์",
    "sleep": "สลีป",
    "wake": "เวค",
    "good": "กู๊ด",
    "bad": "แบด",
    "happy": "แฮปปี้",
    "sad": "แซด",
    "angry": "แองกรี",
    "tired": "ไทร์ด",
}

# Thai-script names of the 26 Latin letters (keys are lowercase).
_LATIN_LETTER_TO_THAI = {
    "a": "เอ",
    "b": "บี",
    "c": "ซี",
    "d": "ดี",
    "e": "อี",
    "f": "เอฟ",
    "g": "จี",
    "h": "เอช",
    "i": "ไอ",
    "j": "เจ",
    "k": "เค",
    "l": "แอล",
    "m": "เอ็ม",
    "n": "เอ็น",
    "o": "โอ",
    "p": "พี",
    "q": "คิว",
    "r": "อาร์",
    "s": "เอส",
    "t": "ที",
    "u": "ยู",
    "v": "วี",
    "w": "ดับเบิลยู",
    "x": "เอ็กซ์",
    "y": "วาย",
    "z": "แซด",
}

# Thai number words for the digit strings 0-9, the tens 10-90, and 100.
_NUMBER_TO_THAI = {
    "0": "ศูนย์",
    "1": "หนึ่ง",
    "2": "สอง",
    "3": "สาม",
    "4": "สี่",
    "5": "ห้า",
    "6": "หก",
    "7": "เจ็ด",
    "8": "แปด",
    "9": "เก้า",
    "10": "สิบ",
    "20": "ยี่สิบ",
    "30": "สามสิบ",
    "40": "สี่สิบ",
    "50": "ห้าสิบ",
    "60": "หกสิบ",
    "70": "เจ็ดสิบ",
    "80": "แปดสิบ",
    "90": "เก้าสิบ",
    "100": "หนึ่งร้อย",
}

# Merged once at import time instead of on every call. Later tables win on a
# key clash (words, then numbers, then letters); the only shared key is "i",
# which maps to the same value in both tables.
_FALLBACK_TABLE = {
    **_ENGLISH_WORD_TO_THAI,
    **_NUMBER_TO_THAI,
    **_LATIN_LETTER_TO_THAI,
}


def english_to_thai_fallback(word):
    """Look up a Thai-script rendering for *word* in the built-in tables.

    The lookup is case-insensitive (the key is ``word.lower()``) and covers a
    fixed list of English words, single Latin letters, and a few number
    strings.

    Args:
        word: The token to convert.

    Returns:
        The mapped Thai string, or *word* itself (original casing preserved)
        when it has no entry in the tables.
    """
    return _FALLBACK_TABLE.get(word.lower(), word)
# A Thai consonant (ก-ฮ) immediately followed by the vowel sign ำ.
_CONSONANT_SARA_AM = re.compile(r'([ก-ฮ])ำ')
# Any ASCII Latin letter; marks a token as needing conversion.
_LATIN_LETTER = re.compile(r'[a-zA-Z]')


def _load_transliterate():
    """Return pythainlp's ``transliterate`` callable, or None if unavailable."""
    try:
        # Imported lazily so a missing or broken backend only disables this
        # path; callers then use the dictionary fallback instead.
        from pythainlp import transliterate
    except Exception:  # best-effort: any import-time failure means "fallback"
        return None
    return transliterate


def clean_thai_text(text):
    """Normalise Thai text and convert Latin-script tokens.

    Steps:
      1. Rewrite every consonant + ำ as consonant + '\\u0E4D' + า.
      2. Tokenise with ``word_tokenize(..., keep_whitespace=True)``.
      3. For each token containing an ASCII letter, try
         ``transliterate(token, engine='ipa')``; if the import or the call
         fails, use ``english_to_thai_fallback(token)`` instead. All other
         tokens pass through unchanged.

    Args:
        text: Input string to clean.

    Returns:
        The processed tokens joined back into a single string.
    """
    # NOTE(review): confirm that engine='ipa' yields the intended output for
    # Latin-script tokens; this function only forwards the token to it.
    text = _CONSONANT_SARA_AM.sub(r'\1' + '\u0E4D' + 'า', text)
    tokens = word_tokenize(text, keep_whitespace=True)

    transliterate = None
    backend_resolved = False
    pieces = []
    for token in tokens:
        if not _LATIN_LETTER.search(token):
            pieces.append(token)
            continue

        # Resolve the backend once, and only if a Latin token actually shows
        # up, rather than re-importing (and possibly re-failing) per token.
        if not backend_resolved:
            transliterate = _load_transliterate()
            backend_resolved = True

        if transliterate is None:
            pieces.append(english_to_thai_fallback(token))
            continue

        try:
            converted = transliterate(token, engine='ipa')
        except Exception:
            # Deliberate best-effort: any engine failure falls back to the
            # dictionary lookup rather than aborting the whole text.
            converted = english_to_thai_fallback(token)
        pieces.append(converted)

    return ''.join(pieces)
|