Spaces:

VIZINTZOR
/

TTS_MMS_VITS

Runtime error

File size: 5,264 Bytes

55d736f

import os
import re
from pythainlp import word_tokenize

# Export PYTHONIOENCODING=utf-8 into this process's environment.
# NOTE(review): the interpreter reads this variable at startup, so assigning
# it here presumably only influences child processes spawned later -- confirm
# whether the running process's own stdio encoding was the actual target.
os.environ["PYTHONIOENCODING"] = "utf-8"

# Thai-script spellings for a fixed list of English words (keys are lowercase).
_ENGLISH_WORD_TO_THAI = {
    "today": "ทูเด",
    "hello": "เฮลโล",
    "world": "เวิลด์",
    "computer": "คอมพิวเตอร์",
    "phone": "โฟน",
    "school": "สคูล",
    "teacher": "ทีเชอร์",
    "student": "สตูเดนท์",
    "apple": "แอปเปิล",
    "orange": "ออเรนจ์",
    "table": "เทเบิล",
    "chair": "แชร์",
    "window": "วินโดว์",
    "door": "ดอร์",
    "water": "วอเทอร์",
    "coffee": "คอฟฟี่",
    "milk": "มิลค์",
    "juice": "จูซ",
    "food": "ฟูด",
    "car": "คาร์",
    "bus": "บัส",
    "train": "เทรน",
    "airplane": "แอร์เพลน",
    "boat": "โบ๊ท",
    "dog": "ด็อก",
    "cat": "แคท",
    "bird": "เบิร์ด",
    "fish": "ฟิช",
    "house": "เฮ้าส์",
    "city": "ซิตี้",
    "country": "คันทรี",
    "family": "แฟมิลี",
    "friend": "เฟรนด์",
    "love": "เลิฟ",
    "happiness": "แฮปปิเนส",
    "sadness": "แซดเนส",
    "anger": "แองเกอร์",
    "smile": "สไมล์",
    "cry": "คราย",
    "laugh": "ลาฟ",
    "light": "ไลท์",
    "dark": "ดาร์ก",
    "sun": "ซัน",
    "moon": "มูน",
    "star": "สตาร์",
    "ocean": "โอเชียน",
    "mountain": "เมาเทน",
    "river": "ริเวอร์",
    "forest": "ฟอเรสต์",
    "i": "ไอ",
    "you": "ยู",
    "talk": "ทอล์ก",
    "sing": "ซิง",
    "dance": "แดนซ์",
    "read": "รีด",
    "write": "ไรท์",
    "run": "รัน",
    "walk": "วอล์ค",
    "jump": "จัมป์",
    "swim": "สวิม",
    "eat": "อีท",
    "drink": "ดริงค์",
    "sleep": "สลีป",
    "wake": "เวค",
    "good": "กู๊ด",
    "bad": "แบด",
    "happy": "แฮปปี้",
    "sad": "แซด",
    "angry": "แองกรี",
    "tired": "ไทร์ด",
}

# Thai-script entries for the 26 Latin letters (keys are lowercase).
_LATIN_LETTER_TO_THAI = {
    "a": "เอ",
    "b": "บี",
    "c": "ซี",
    "d": "ดี",
    "e": "อี",
    "f": "เอฟ",
    "g": "จี",
    "h": "เอช",
    "i": "ไอ",
    "j": "เจ",
    "k": "เค",
    "l": "แอล",
    "m": "เอ็ม",
    "n": "เอ็น",
    "o": "โอ",
    "p": "พี",
    "q": "คิว",
    "r": "อาร์",
    "s": "เอส",
    "t": "ที",
    "u": "ยู",
    "v": "วี",
    "w": "ดับเบิลยู",
    "x": "เอ็กซ์",
    "y": "วาย",
    "z": "แซด",
}

# Thai-script entries for the digits 0-9, the round tens 10-90, and 100.
_NUMBER_TO_THAI = {
    "0": "ศูนย์",
    "1": "หนึ่ง",
    "2": "สอง",
    "3": "สาม",
    "4": "สี่",
    "5": "ห้า",
    "6": "หก",
    "7": "เจ็ด",
    "8": "แปด",
    "9": "เก้า",
    "10": "สิบ",
    "20": "ยี่สิบ",
    "30": "สามสิบ",
    "40": "สี่สิบ",
    "50": "ห้าสิบ",
    "60": "หกสิบ",
    "70": "เจ็ดสิบ",
    "80": "แปดสิบ",
    "90": "เก้าสิบ",
    "100": "หนึ่งร้อย",
}

# Merged once at import time instead of on every call. On a key clash the
# later table wins (words < numbers < letters); the only shared key is "i",
# which carries the same value in both tables.
_FALLBACK_TABLE = {
    **_ENGLISH_WORD_TO_THAI,
    **_NUMBER_TO_THAI,
    **_LATIN_LETTER_TO_THAI,
}


def english_to_thai_fallback(word: str) -> str:
    """Look up a Thai-script replacement for an English word, letter or number.

    The lookup is case-insensitive (the key is ``word.lower()``) and matches
    the whole token only: a multi-letter word that is not in the word table
    is NOT spelled out letter by letter, and a number that is not listed
    (for example ``"11"``) is not composed from its digits.

    Args:
        word: The token to convert.

    Returns:
        The mapped Thai string, or ``word`` unchanged (original casing kept)
        when no table contains it.
    """
    return _FALLBACK_TABLE.get(word.lower(), word)

# A Thai consonant, optionally followed by one tone mark (U+0E48-U+0E4B),
# then SARA AM (U+0E33). The optional tone mark lets words such as "น้ำ"
# match, which a bare consonant + SARA AM pattern misses.
_SARA_AM_RE = re.compile('([ก-ฮ][\u0E48-\u0E4B]?)\u0E33')

# NIKHAHIT (U+0E4D) followed by SARA AA (U+0E32): the two-character spelling
# that replaces the single SARA AM code point.
_SARA_AM_DECOMPOSED = '\u0E4D\u0E32'

# Any ASCII Latin letter; marks a token as needing conversion.
_LATIN_LETTER_RE = re.compile(r'[a-zA-Z]')


def _convert_latin_token(token, transliterate):
    """Return the replacement for a token that contains Latin letters."""
    if transliterate is None:
        return english_to_thai_fallback(token)
    try:
        # NOTE(review): the 'ipa' engine name suggests the result is IPA
        # symbols rather than Thai script, and the call may succeed without
        # raising for English input, in which case the lookup table below is
        # never consulted -- confirm this is the intended output.
        return transliterate(token, engine='ipa')
    except Exception:
        # Deliberate best-effort: any failure of the engine (including a
        # missing optional backend) falls back to the static lookup table.
        return english_to_thai_fallback(token)


def clean_thai_text(text):
    """Normalize Thai text and convert tokens that contain Latin letters.

    Steps:
      1. Every SARA AM that follows a Thai consonant, with or without a tone
         mark in between, is rewritten as NIKHAHIT + SARA AA.
      2. The text is tokenized with ``word_tokenize`` (whitespace tokens are
         kept so the original spacing survives the round trip).
      3. Tokens containing an ASCII letter are passed to pythainlp's
         ``transliterate`` with the ``'ipa'`` engine; if that cannot be
         imported or raises, ``english_to_thai_fallback`` is used instead.
         All other tokens are copied through unchanged.

    Args:
        text: Input string to clean.

    Returns:
        The processed tokens concatenated back into one string.
    """
    text = _SARA_AM_RE.sub(
        lambda match: match.group(1) + _SARA_AM_DECOMPOSED, text
    )

    # Resolve the optional transliterator once instead of once per token.
    try:
        from pythainlp import transliterate
    except Exception:
        transliterate = None

    cleaned = []
    for token in word_tokenize(text, keep_whitespace=True):
        if _LATIN_LETTER_RE.search(token):
            cleaned.append(_convert_latin_token(token, transliterate))
        else:
            cleaned.append(token)

    return ''.join(cleaned)