#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

import os

import numpy as np
import pandas as pd

from indicnlp import common
from indicnlp import langinfo as li
from indicnlp.common import IndicNlpException

###
# Phonetic Information about script characters
###

# Phonetic data about all languages except Tamil (pandas DataFrame, loaded by init()).
ALL_PHONETIC_DATA = None

# Phonetic data for Tamil (pandas DataFrame, loaded by init()).
TAMIL_PHONETIC_DATA = None

# Phonetic vectors for all languages except Tamil (numpy array, loaded by init()).
ALL_PHONETIC_VECTORS = None

# Phonetic vectors for Tamil (numpy array, loaded by init()).
TAMIL_PHONETIC_VECTORS = None

# Length of the phonetic feature vector (recomputed from the data in init()).
PHONETIC_VECTOR_LENGTH = 38

# Column offset at which the phonetic feature bits start in the phonetic data CSV.
PHONETIC_VECTOR_START_OFFSET = 6

## PHONETIC PROPERTIES in order in which they occur in the vector
## This list must be in sync with the keys in the PV_PROP_RANGES dictionary
PV_PROP = [
    "basic_type",
    "vowel_length",
    "vowel_strength",
    "vowel_status",
    "consonant_type",
    "articulation_place",
    "aspiration",
    "voicing",
    "nasalization",
    "vowel_horizontal",
    "vowel_vertical",
    "vowel_roundness",
]

###
# Bit vector ranges for various properties
# Each value is a [start, end) slice into the phonetic feature vector.
###
PV_PROP_RANGES = {
    "basic_type": [0, 6],
    "vowel_length": [6, 8],
    "vowel_strength": [8, 11],
    "vowel_status": [11, 13],
    "consonant_type": [13, 18],
    "articulation_place": [18, 23],
    "aspiration": [23, 25],
    "voicing": [25, 27],
    "nasalization": [27, 29],
    "vowel_horizontal": [29, 32],
    "vowel_vertical": [32, 36],
    "vowel_roundness": [36, 38],
}

####
# Indexes into the Phonetic Vector (bits of the "basic_type" property)
####
PVIDX_BT_VOWEL = 0
PVIDX_BT_CONSONANT = 1
PVIDX_BT_NUKTA = 2
PVIDX_BT_HALANT = 3
PVIDX_BT_ANUSVAAR = 4
PVIDX_BT_MISC = 5
PVIDX_BT_S = PVIDX_BT_VOWEL
PVIDX_BT_E = PVIDX_BT_MISC + 1

# Index of the "dependent vowel" bit within the phonetic vector.
PVIDX_VSTAT_DEP = 12

#####
# Unicode information about characters
#####

# Offsets (relative to each script's Unicode block start) covered by Indic scripts.
SCRIPT_OFFSET_START = 0
SCRIPT_OFFSET_RANGE = 0x80


def init():
    """
    To be called by library loader, do not call it in your program

    Loads the phonetic data CSVs from the resource directory and populates the
    module-level data/vector globals; also recomputes PHONETIC_VECTOR_LENGTH
    from the loaded data.
    """
    global ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS, TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    ALL_PHONETIC_DATA = pd.read_csv(
        os.path.join(
            common.get_resources_path(), "script", "all_script_phonetic_data.csv"
        ),
        encoding="utf-8",
    )
    TAMIL_PHONETIC_DATA = pd.read_csv(
        os.path.join(
            common.get_resources_path(), "script", "tamil_script_phonetic_data.csv"
        ),
        encoding="utf-8",
    )

    # The feature bits start at PHONETIC_VECTOR_START_OFFSET; everything before
    # that column is metadata about the character.
    ALL_PHONETIC_VECTORS = ALL_PHONETIC_DATA.iloc[
        :, PHONETIC_VECTOR_START_OFFSET:
    ].values
    TAMIL_PHONETIC_VECTORS = TAMIL_PHONETIC_DATA.iloc[
        :, PHONETIC_VECTOR_START_OFFSET:
    ].values

    PHONETIC_VECTOR_LENGTH = ALL_PHONETIC_VECTORS.shape[1]


def is_supported_language(lang):
    """Return True if `lang` is one of the languages with a known script range."""
    return lang in li.SCRIPT_RANGES


def get_offset(c, lang):
    """
    Return the offset of character `c` from the start of `lang`'s Unicode block.

    Raises IndicNlpException if `lang` is not supported.
    """
    if not is_supported_language(lang):
        raise IndicNlpException("Language {} not supported".format(lang))
    return ord(c) - li.SCRIPT_RANGES[lang][0]


def offset_to_char(off, lang):
    """
    Applicable to Brahmi derived Indic scripts

    Return the character at offset `off` within `lang`'s Unicode block.
    Raises IndicNlpException if `lang` is not supported.
    """
    if not is_supported_language(lang):
        raise IndicNlpException("Language {} not supported".format(lang))
    return chr(off + li.SCRIPT_RANGES[lang][0])


def is_indiclang_char(c, lang):
    """
    Applicable to Brahmi derived Indic scripts
    Note that DANDA and DOUBLE_DANDA have the same Unicode codepoint for all Indic scripts

    Return True if `c` lies in `lang`'s script block, or is a (shared) danda.
    Raises IndicNlpException if `lang` is not supported.
    """
    if not is_supported_language(lang):
        raise IndicNlpException("Language {} not supported".format(lang))
    o = get_offset(c, lang)
    return (
        (o >= SCRIPT_OFFSET_START and o < SCRIPT_OFFSET_RANGE)
        or ord(c) == li.DANDA
        or ord(c) == li.DOUBLE_DANDA
    )


def in_coordinated_range_offset(c_offset):
    """
    Applicable to Brahmi derived Indic scripts

    Return True if the offset lies in the range coordinated across Indic
    scripts (i.e. the same offset denotes the same character in every script).
    """
    return (
        c_offset >= li.COORDINATED_RANGE_START_INCLUSIVE
        and c_offset <= li.COORDINATED_RANGE_END_INCLUSIVE
    )


def in_coordinated_range(c, lang):
    """
    Return True if character `c` of language `lang` is in the coordinated range.

    Raises IndicNlpException if `lang` is not supported.
    """
    if not is_supported_language(lang):
        raise IndicNlpException("Language {} not supported".format(lang))
    return in_coordinated_range_offset(get_offset(c, lang))


def get_phonetic_info(lang):
    """
    Return the (phonetic_data, phonetic_vectors) pair for `lang`.

    Tamil has its own phonetic tables; every other supported language shares
    the common ones. Raises IndicNlpException if `lang` is not supported.
    """
    if not is_supported_language(lang):
        raise IndicNlpException("Language {} not supported".format(lang))
    phonetic_data = ALL_PHONETIC_DATA if lang != li.LC_TA else TAMIL_PHONETIC_DATA
    phonetic_vectors = (
        ALL_PHONETIC_VECTORS if lang != li.LC_TA else TAMIL_PHONETIC_VECTORS
    )

    return (phonetic_data, phonetic_vectors)


def invalid_vector():
    """Return the all-zero vector used for characters without phonetic data."""
    ## TODO: check if np datatype is correct?
    return np.array([0] * PHONETIC_VECTOR_LENGTH)


def get_phonetic_feature_vector(c, lang):
    """
    Return the phonetic feature vector for character `c` of language `lang`,
    or invalid_vector() if the character has no valid representation.
    """
    # Delegate to the offset-based variant; the logic is identical once the
    # character is mapped to its script offset.
    return get_phonetic_feature_vector_offset(get_offset(c, lang), lang)


def get_phonetic_feature_vector_offset(offset, lang):
    """
    Return the phonetic feature vector for the character at `offset` in
    `lang`'s script block, or invalid_vector() if it has no valid
    representation (outside the coordinated range, or flagged invalid in
    the data).
    """
    if not in_coordinated_range_offset(offset):
        return invalid_vector()

    phonetic_data, phonetic_vectors = get_phonetic_info(lang)

    if phonetic_data.iloc[offset]["Valid Vector Representation"] == 0:
        return invalid_vector()

    return phonetic_vectors[offset]


### Unary operations on vectors


def is_valid(v):
    """Return True if `v` is not the all-zero invalid vector."""
    return np.sum(v) > 0


def is_vowel(v):
    """Return True if the vector's basic type is 'vowel'."""
    return v[PVIDX_BT_VOWEL] == 1


def is_consonant(v):
    """Return True if the vector's basic type is 'consonant'."""
    return v[PVIDX_BT_CONSONANT] == 1


def is_halant(v):
    """Return True if the vector's basic type is 'halant'."""
    return v[PVIDX_BT_HALANT] == 1


def is_nukta(v):
    """Return True if the vector's basic type is 'nukta'."""
    return v[PVIDX_BT_NUKTA] == 1


def is_anusvaar(v):
    """Return True if the vector's basic type is 'anusvaar'."""
    return v[PVIDX_BT_ANUSVAAR] == 1


def is_misc(v):
    """Return True if the vector's basic type is 'miscellaneous'."""
    return v[PVIDX_BT_MISC] == 1


def is_dependent_vowel(v):
    """Return True if `v` is a vowel with the dependent-vowel status bit set."""
    return is_vowel(v) and v[PVIDX_VSTAT_DEP] == 1


def is_plosive(v):
    """Return True if `v` is a consonant whose first consonant-type bit is set."""
    return is_consonant(v) and get_property_vector(v, "consonant_type")[0] == 1


### Binary operations on phonetic vectors


def or_vectors(v1, v2):
    """Bitwise OR of two phonetic bit vectors."""
    return np.array([1 if (b1 + b2) >= 1 else 0 for b1, b2 in zip(v1, v2)])


def xor_vectors(v1, v2):
    """Bitwise XOR of two phonetic bit vectors."""
    return np.array([1 if b1 != b2 else 0 for b1, b2 in zip(v1, v2)])


### Getting properties from phonetic vectors


def get_property_vector(v, prop_name):
    """Return the slice of `v` corresponding to property `prop_name`."""
    return v[PV_PROP_RANGES[prop_name][0] : PV_PROP_RANGES[prop_name][1]]


def get_property_value(v, prop_name):
    """
    Return the integer value of property `prop_name`, interpreting its bit
    slice as a big-endian binary number.
    """
    factor_bits = get_property_vector(v, prop_name).tolist()

    # Accumulate least-significant bit first using integer arithmetic.
    value = 0
    place = 1
    for b in reversed(factor_bits):
        value += place * b
        place *= 2

    return int(value)


def lcsr_indic(srcw, tgtw, slang, tlang):
    """
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.
    This works for Indic scripts by mapping both languages to a common script

    srcw: source language string
    tgtw: source language string
    slang: source language
    tlang: target language
    """
    score_mat = np.zeros((len(srcw) + 1, len(tgtw) + 1))

    for si, sc in enumerate(srcw, 1):
        for ti, tc in enumerate(tgtw, 1):
            so = get_offset(sc, slang)
            to = get_offset(tc, tlang)

            # Characters match if they share a coordinated-range offset, or
            # (outside the coordinated range) are the same codepoint.
            if (
                in_coordinated_range_offset(so)
                and in_coordinated_range_offset(to)
                and so == to
            ):
                score_mat[si, ti] = score_mat[si - 1, ti - 1] + 1.0
            elif (
                not (in_coordinated_range_offset(so) or in_coordinated_range_offset(to))
                and sc == tc
            ):
                score_mat[si, ti] = score_mat[si - 1, ti - 1] + 1.0
            else:
                score_mat[si, ti] = max(score_mat[si, ti - 1], score_mat[si - 1, ti])

    return (
        score_mat[-1, -1] / float(max(len(srcw), len(tgtw))),
        float(len(srcw)),
        float(len(tgtw)),
    )


def lcsr_any(srcw, tgtw):
    """
    LCSR computation if both languages have the same script
    """
    score_mat = np.zeros((len(srcw) + 1, len(tgtw) + 1))

    for si, sc in enumerate(srcw, 1):
        for ti, tc in enumerate(tgtw, 1):
            if sc == tc:
                score_mat[si, ti] = score_mat[si - 1, ti - 1] + 1.0
            else:
                score_mat[si, ti] = max(score_mat[si, ti - 1], score_mat[si - 1, ti])

    return (
        score_mat[-1, -1] / float(max(len(srcw), len(tgtw))),
        float(len(srcw)),
        float(len(tgtw)),
    )


def lcsr(srcw, tgtw, slang, tlang):
    """
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.

    srcw: source language string
    tgtw: source language string
    slang: source language
    tlang: target language
    """
    # BUGFIX: the two dispatch calls previously had their argument lists
    # swapped — lcsr_any (2 params) was called with 4 arguments and
    # lcsr_indic (4 params) with 2, so either path raised a TypeError.
    if (
        slang == tlang
        or not is_supported_language(slang)
        or not is_supported_language(tlang)
    ):
        return lcsr_any(srcw, tgtw)
    else:
        return lcsr_indic(srcw, tgtw, slang, tlang)