File size: 5,264 Bytes
55d736f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import os
import re
from pythainlp import word_tokenize

# Export PYTHONIOENCODING=utf-8 into this process's environment at import time.
# NOTE(review): CPython reads PYTHONIOENCODING at interpreter startup, so this
# assignment presumably only influences child processes that inherit the
# environment, not the stdio encoding of the already-running interpreter --
# confirm that this is the intended effect.
os.environ['PYTHONIOENCODING'] = 'utf-8'

# Thai loanword spellings for common English words, keyed by lowercase word.
_ENGLISH_WORD_TO_THAI = {
    "today": "ทูเด",
    "hello": "เฮลโล",
    "world": "เวิลด์",
    "computer": "คอมพิวเตอร์",
    "phone": "โฟน",
    "school": "สคูล",
    "teacher": "ทีเชอร์",
    "student": "สตูเดนท์",
    "apple": "แอปเปิล",
    "orange": "ออเรนจ์",
    "table": "เทเบิล",
    "chair": "แชร์",
    "window": "วินโดว์",
    "door": "ดอร์",
    "water": "วอเทอร์",
    "coffee": "คอฟฟี่",
    "milk": "มิลค์",
    "juice": "จูซ",
    "food": "ฟูด",
    "car": "คาร์",
    "bus": "บัส",
    "train": "เทรน",
    "airplane": "แอร์เพลน",
    "boat": "โบ๊ท",
    "dog": "ด็อก",
    "cat": "แคท",
    "bird": "เบิร์ด",
    "fish": "ฟิช",
    "house": "เฮ้าส์",
    "city": "ซิตี้",
    "country": "คันทรี",
    "family": "แฟมิลี",
    "friend": "เฟรนด์",
    "love": "เลิฟ",
    "happiness": "แฮปปิเนส",
    "sadness": "แซดเนส",
    "anger": "แองเกอร์",
    "smile": "สไมล์",
    "cry": "คราย",
    "laugh": "ลาฟ",
    "light": "ไลท์",
    "dark": "ดาร์ก",
    "sun": "ซัน",
    "moon": "มูน",
    "star": "สตาร์",
    "ocean": "โอเชียน",
    "mountain": "เมาเทน",
    "river": "ริเวอร์",
    "forest": "ฟอเรสต์",
    # "i" also appears in the letter table below with the same value.
    "i": "ไอ",
    "you": "ยู",
    "talk": "ทอล์ก",
    "sing": "ซิง",
    "dance": "แดนซ์",
    "read": "รีด",
    "write": "ไรท์",
    "run": "รัน",
    "walk": "วอล์ค",
    "jump": "จัมป์",
    "swim": "สวิม",
    "eat": "อีท",
    "drink": "ดริงค์",
    "sleep": "สลีป",
    "wake": "เวค",
    "good": "กู๊ด",
    "bad": "แบด",
    "happy": "แฮปปี้",
    "sad": "แซด",
    "angry": "แองกรี",
    "tired": "ไทร์ด",
}

# Thai spellings of the names of the 26 Latin letters, keyed by lowercase letter.
_LATIN_LETTER_TO_THAI = {
    "a": "เอ",
    "b": "บี",
    "c": "ซี",
    "d": "ดี",
    "e": "อี",
    "f": "เอฟ",
    "g": "จี",
    "h": "เอช",
    "i": "ไอ",
    "j": "เจ",
    "k": "เค",
    "l": "แอล",
    "m": "เอ็ม",
    "n": "เอ็น",
    "o": "โอ",
    "p": "พี",
    "q": "คิว",
    "r": "อาร์",
    "s": "เอส",
    "t": "ที",
    "u": "ยู",
    "v": "วี",
    "w": "ดับเบิลยู",
    "x": "เอ็กซ์",
    "y": "วาย",
    "z": "แซด",
}

# Thai number words for the digits 0-9, the tens 10-90 and 100, keyed by the
# decimal string. Other numbers (e.g. "11") have no entry.
_NUMBER_TO_THAI = {
    "0": "ศูนย์",
    "1": "หนึ่ง",
    "2": "สอง",
    "3": "สาม",
    "4": "สี่",
    "5": "ห้า",
    "6": "หก",
    "7": "เจ็ด",
    "8": "แปด",
    "9": "เก้า",
    "10": "สิบ",
    "20": "ยี่สิบ",
    "30": "สามสิบ",
    "40": "สี่สิบ",
    "50": "ห้าสิบ",
    "60": "หกสิบ",
    "70": "เจ็ดสิบ",
    "80": "แปดสิบ",
    "90": "เก้าสิบ",
    "100": "หนึ่งร้อย",
}

# Combined lookup table, built once at import time instead of on every call.
# Merge order is words, then numbers, then letters, so on a key collision the
# later table wins.
_FALLBACK_TABLE = dict(_ENGLISH_WORD_TO_THAI)
_FALLBACK_TABLE.update(_NUMBER_TO_THAI)
_FALLBACK_TABLE.update(_LATIN_LETTER_TO_THAI)


def english_to_thai_fallback(word):
    """Return a Thai-script rendering of *word* from a fixed lookup table.

    The lookup is case-insensitive and matches the whole token only: a known
    English word, a single Latin letter, or one of the listed number strings.

    Args:
        word: The token to look up (a string).

    Returns:
        The Thai string for ``word.lower()`` if the table has an entry,
        otherwise *word* itself, unchanged and in its original casing.
    """
    return _FALLBACK_TABLE.get(word.lower(), word)

def clean_thai_text(text):
    """Normalise Thai text and convert Latin-script tokens.

    Processing steps:

    1. Every SARA AM (ำ) that directly follows a Thai consonant in the range
       ก-ฮ is rewritten as that consonant + NIKHAHIT (U+0E4D) + SARA AA (า).
    2. The result is split with ``word_tokenize`` (whitespace tokens kept).
    3. Each token containing at least one ASCII letter is passed to
       ``pythainlp.transliterate`` with ``engine='ipa'``. If the import or
       the call raises any ``Exception``, ``english_to_thai_fallback`` is
       used for that token instead. All other tokens pass through unchanged.

    Args:
        text: The input string.

    Returns:
        The processed tokens concatenated back into a single string.
    """
    def _convert(token):
        # Tokens without ASCII letters are passed through untouched.
        if not re.search(r'[a-zA-Z]', token):
            return token
        try:
            # Imported lazily so that a missing or broken transliteration
            # backend degrades to the lookup-table fallback.
            from pythainlp import transliterate
            return transliterate(token, engine='ipa')
        except Exception:
            return english_to_thai_fallback(token)

    # Decompose consonant + ำ into consonant + ํ + า.
    decomposed = re.sub(
        r'([ก-ฮ])ำ',
        lambda found: found.group(1) + '\u0E4D' + 'า',
        text,
    )

    tokens = word_tokenize(decomposed, keep_whitespace=True)
    return ''.join([_convert(token) for token in tokens])