Spaces:
Sleeping
Sleeping
File size: 3,906 Bytes
ac901c7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
# coding: utf8
"""Rule based Sentence tokenization module"""
# Global Variables
# Conjunction / continuation words: when the word that follows a
# sentence-final word (see _URDU_NEWLINE_WORDS) is one of these, the
# tokenizer does NOT insert a sentence break there.
_URDU_CONJUNCTIONS = [
    "جنہیں",
    "جس",
    "جن",
    "جو",
    "اور",
    "اگر",
    "اگرچہ",
    "لیکن",
    "مگر",
    "پر",
    "یا",
    "تاہم",
    "کہ",
    "کر",
    "تو",
    "گے",
    "گی",
]
# Words that typically end an Urdu sentence (verb/auxiliary forms): the
# tokenizer inserts a sentence break after one of these, unless the next
# word is a conjunction from _URDU_CONJUNCTIONS.
_URDU_NEWLINE_WORDS = [
    "کیجیے",
    "کیجئے",
    "گئیں",
    "تھیں",
    "ہوں",
    "خریدا",
    "گے",
    "ہونگے",
    "گا",
    "چاہیے",
    "ہوئیں",
    "گی",
    "تھا",
    "تھی",
    "تھے",
    "ہیں",
    "ہے",
]
def _split_and_keep(_str, separator):
"""Replace end of sentence with separator"""
if not _str:
return []
max_p = chr(ord(max(_str)) + 1)
return _str.replace(separator, separator + max_p).split(max_p)
def _mark_breaks(words):
    """Rejoin *words* into one string, inserting a newline after each word
    that typically ends an Urdu sentence.

    A break is inserted after a word from ``_URDU_NEWLINE_WORDS`` only
    when a following word exists and is not in ``_URDU_CONJUNCTIONS``.
    If the following word is a full stop (۔) or comma (،), that mark is
    kept on the same line before the break and then skipped.  Every word
    is emitted with a leading space, matching the original accumulation
    format; callers strip the result.

    Args:
        words (list): sentence split into words

    Returns:
        str: the words rejoined, with newline characters at break points
    """
    joined = ""
    skip_next = False
    for index, word in enumerate(words):
        if skip_next:
            # This word (a ۔/، mark) was already emitted with the
            # previous word.
            skip_next = False
            continue
        follower = words[index + 1] if index + 1 < len(words) else None
        if (
            word in _URDU_NEWLINE_WORDS
            and follower is not None
            and follower not in _URDU_CONJUNCTIONS
        ):
            if follower in ("۔", "،"):
                joined += " " + word + " " + follower + "\n"
                skip_next = True
            else:
                joined += " " + word + "\n"
        else:
            joined += " " + word
    return joined


def _generate_sentences(text: str) -> list:
    """Generate a list of urdu sentences from a given string.

    This function automatically fixes multiple whitespaces
    or new lines so you just need to pass the data and
    get sentences in return.

    Args:
        text (str): base string

    Returns:
        list: detected sentences; fragments shorter than two words
        are discarded
    """
    all_sentences = []
    for piece in _split_and_keep(text, "۔"):
        # Skip empty pieces and one-word fragments.
        if not piece or len(piece.split()) < 2:
            continue
        # A question mark also terminates a sentence, so split on it too;
        # otherwise process the full-stop-delimited piece as one chunk.
        chunks = _split_and_keep(piece, "؟") if "؟" in piece else [piece]
        for chunk in chunks:
            for sen in _mark_breaks(chunk.split()).split("\n"):
                if sen and len(sen.split()) >= 2:
                    all_sentences.append(sen.strip())
    return all_sentences
|