"""Helpers for normalising Thai text and sounding out embedded English words."""

import os
import re

from pythainlp import word_tokenize

# Ensure UTF-8 encoding is set.
# NOTE(review): PYTHONIOENCODING is consulted at interpreter start-up, so this
# assignment influences child processes started afterwards rather than the
# standard streams of the interpreter that is already running -- confirm that
# this is the intended effect.
os.environ['PYTHONIOENCODING'] = 'utf-8'

# Whole English words mapped to a Thai-script spelling.
_WORD_MAPPING = {
    "today": "ทูเด",
    "hello": "เฮลโล",
    "world": "เวิลด์",
    "computer": "คอมพิวเตอร์",
    "phone": "โฟน",
    "school": "สคูล",
    "teacher": "ทีเชอร์",
    "student": "สตูเดนท์",
    "apple": "แอปเปิล",
    "orange": "ออเรนจ์",
    "table": "เทเบิล",
    "chair": "แชร์",
    "window": "วินโดว์",
    "door": "ดอร์",
    "water": "วอเทอร์",
    "coffee": "คอฟฟี่",
    "milk": "มิลค์",
    "juice": "จูซ",
    "food": "ฟูด",
    "car": "คาร์",
    "bus": "บัส",
    "train": "เทรน",
    "airplane": "แอร์เพลน",
    "boat": "โบ๊ท",
    "dog": "ด็อก",
    "cat": "แคท",
    "bird": "เบิร์ด",
    "fish": "ฟิช",
    "house": "เฮ้าส์",
    "city": "ซิตี้",
    "country": "คันทรี",
    "family": "แฟมิลี",
    "friend": "เฟรนด์",
    "love": "เลิฟ",
    "happiness": "แฮปปิเนส",
    "sadness": "แซดเนส",
    "anger": "แองเกอร์",
    "smile": "สไมล์",
    "cry": "คราย",
    "laugh": "ลาฟ",
    "light": "ไลท์",
    "dark": "ดาร์ก",
    "sun": "ซัน",
    "moon": "มูน",
    "star": "สตาร์",
    "ocean": "โอเชียน",
    "mountain": "เมาเทน",
    "river": "ริเวอร์",
    "forest": "ฟอเรสต์",
    "i": "ไอ",
    "you": "ยู",
    "talk": "ทอล์ก",
    "sing": "ซิง",
    "dance": "แดนซ์",
    "read": "รีด",
    "write": "ไรท์",
    "run": "รัน",
    "walk": "วอล์ค",
    "jump": "จัมป์",
    "swim": "สวิม",
    "eat": "อีท",
    "drink": "ดริงค์",
    "sleep": "สลีป",
    "wake": "เวค",
    "good": "กู๊ด",
    "bad": "แบด",
    "happy": "แฮปปี้",
    "sad": "แซด",
    "angry": "แองกรี",
    "tired": "ไทร์ด",
}

# Single Latin letters mapped to the Thai spelling of the letter's name.
_CHARACTER_MAPPING = {
    "a": "เอ",
    "b": "บี",
    "c": "ซี",
    "d": "ดี",
    "e": "อี",
    "f": "เอฟ",
    "g": "จี",
    "h": "เอช",
    "i": "ไอ",
    "j": "เจ",
    "k": "เค",
    "l": "แอล",
    "m": "เอ็ม",
    "n": "เอ็น",
    "o": "โอ",
    "p": "พี",
    "q": "คิว",
    "r": "อาร์",
    "s": "เอส",
    "t": "ที",
    "u": "ยู",
    "v": "วี",
    "w": "ดับเบิลยู",
    "x": "เอ็กซ์",
    "y": "วาย",
    "z": "แซด",
}

# Digit strings mapped to Thai number words.
_NUMBER_MAPPING = {
    "0": "ศูนย์",
    "1": "หนึ่ง",
    "2": "สอง",
    "3": "สาม",
    "4": "สี่",
    "5": "ห้า",
    "6": "หก",
    "7": "เจ็ด",
    "8": "แปด",
    "9": "เก้า",
    "10": "สิบ",
    "20": "ยี่สิบ",
    "30": "สามสิบ",
    "40": "สี่สิบ",
    "50": "ห้าสิบ",
    "60": "หกสิบ",
    "70": "เจ็ดสิบ",
    "80": "แปดสิบ",
    "90": "เก้าสิบ",
    "100": "หนึ่งร้อย",
}

# Merged once at import time instead of on every call. Later tables win on a
# key clash, which mirrors the original update order: words, numbers, letters.
_FALLBACK_MAPPING = {**_WORD_MAPPING, **_NUMBER_MAPPING, **_CHARACTER_MAPPING}

# A Thai consonant immediately followed by SARA AM (ำ).
_SARA_AM_RE = re.compile(r'([ก-ฮ])ำ')
# Any ASCII letter; marks a token as containing English text.
_ASCII_LETTER_RE = re.compile(r'[a-zA-Z]')


def english_to_thai_fallback(word: str) -> str:
    """Return the table-based Thai spelling of *word*.

    The lookup is case-insensitive and covers a fixed set of English words,
    single Latin letters, and a few digit strings. Anything not found in the
    table is returned unchanged (in its original casing).

    Note that ``clean_thai_text`` only consults this function for tokens that
    contain an ASCII letter, so the digit entries are reachable only through
    direct calls.
    """
    return _FALLBACK_MAPPING.get(word.lower(), word)


def _decompose_sara_am(match):
    """Rewrite ``<consonant>ำ`` as ``<consonant>`` + NIKHAHIT + SARA AA."""
    return match.group(1) + '\u0E4D' + 'า'


def _sound_out_english(word):
    """Convert one token containing ASCII letters, falling back to the table.

    The pythainlp import stays inside the ``try`` so that a missing or
    incompatible ``transliterate`` degrades to the lookup table instead of
    failing the whole clean-up. This is deliberately best-effort, hence the
    broad ``except``.
    """
    try:
        from pythainlp import transliterate  # Import here to handle the library conditionally
        return transliterate(word, engine='ipa')
    except Exception:
        return english_to_thai_fallback(word)


def clean_thai_text(text: str) -> str:
    """Normalise SARA AM in *text* and convert embedded English tokens.

    Steps:
      1. Every Thai consonant followed by ``ำ`` has the vowel replaced by the
         two-character sequence NIKHAHIT (U+0E4D) + ``า``.
      2. The text is tokenised with ``word_tokenize(..., keep_whitespace=True)``.
      3. Each token containing an ASCII letter is passed to pythainlp's
         ``transliterate(token, engine='ipa')``; if that raises, the token is
         looked up with ``english_to_thai_fallback`` instead. All other tokens
         are kept as they are.

    Returns the processed tokens concatenated without a separator.
    """
    text = _SARA_AM_RE.sub(_decompose_sara_am, text)

    tokens = word_tokenize(text, keep_whitespace=True)

    pieces = [
        _sound_out_english(token) if _ASCII_LETTER_RE.search(token) else token
        for token in tokens
    ]
    return ''.join(pieces)