# coding: utf8
"""Rule based Sentence tokenization module"""

# Conjunctions and continuation words; no sentence break is inserted
# before any of these.
_URDU_CONJUNCTIONS = [
    "جنہیں",
    "جس",
    "جن",
    "جو",
    "اور",
    "اگر",
    "اگرچہ",
    "لیکن",
    "مگر",
    "پر",
    "یا",
    "تاہم",
    "کہ",
    "کر",
    "تو",
    "گے",
    "گی",
]
# Verb endings that commonly close an Urdu sentence; a break is inserted
# after them unless a conjunction follows.
_URDU_NEWLINE_WORDS = [
    "کیجیے",
    "کیجئے",
    "گئیں",
    "تھیں",
    "ہوں",
    "خریدا",
    "گے",
    "ہونگے",
    "گا",
    "چاہیے",
    "ہوئیں",
    "گی",
    "تھا",
    "تھی",
    "تھے",
    "ہیں",
    "ہے",
]
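# Example of the rule these lists drive (illustrative): in
# "وہ جا چکا تھا وہ واپس آیا", the newline word "تھا" is followed by "وہ",
# which is not a conjunction, so a sentence break is inserted after "تھا".
# In "وہ جا چکا تھا اور واپس آیا", the conjunction "اور" suppresses the break.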


def _split_and_keep(_str, separator):
    """Split _str on separator, keeping the separator attached to each piece."""
    if not _str:
        return []
    # Build a sentinel character that cannot occur in the string: one code
    # point above its highest character (assumes the text stays below the top
    # of the Unicode range, which holds for Urdu).
    sentinel = chr(ord(max(_str)) + 1)
    return _str.replace(separator, separator + sentinel).split(sentinel)
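
# Illustrative behaviour (a sketch, not taken from the package's tests): a
# trailing separator leaves a trailing empty piece, which _generate_sentences
# filters out later:
#   _split_and_keep("اچھا۔ٹھیک ہے۔", "۔")  ->  ["اچھا۔", "ٹھیک ہے۔", ""]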


def _split_on_newline_words(words):
    """Join words back into text, inserting a newline after each
    sentence-ending word that is not followed by a conjunction."""
    new_sent = ""
    skip_next = False

    for index, word in enumerate(words):
        if skip_next:
            skip_next = False
            continue

        if (
            word in _URDU_NEWLINE_WORDS
            and index + 1 < len(words)
            and words[index + 1] not in _URDU_CONJUNCTIONS
        ):
            if words[index + 1] in ("۔", "،"):
                # Keep a detached punctuation mark with the current
                # sentence and skip over it.
                new_sent += " " + word + " " + words[index + 1] + "\n"
                skip_next = True
            else:
                new_sent += " " + word + "\n"
        else:
            new_sent += " " + word

    return new_sent


def _generate_sentences(text: str) -> list:
    """Generate a list of Urdu sentences from a given string.

    This function automatically fixes multiple whitespaces or new lines,
    so you just need to pass the data and get sentences in return.

    Args:
        text (str): base string

    Returns:
        list
    """
    all_sentences = []

    for sentence in _split_and_keep(text, "۔"):
        if not sentence or len(sentence.split()) < 2:
            continue

        # A question mark also ends a sentence, so split on it first.
        chunks = _split_and_keep(sentence, "؟") if "؟" in sentence else [sentence]

        for chunk in chunks:
            new_sent = _split_on_newline_words(chunk.split())
            for sen in new_sent.split("\n"):
                if sen and len(sen.split()) >= 2:
                    all_sentences.append(sen.strip())

    return all_sentences
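

if __name__ == "__main__":
    # Usage sketch: the sample text below is illustrative only and is not
    # taken from the library's own tests.
    _SAMPLE = "یہ ایک جملہ ہے۔ کیا آپ ٹھیک ہیں؟ میں ٹھیک ہوں۔"
    for _line in _generate_sentences(_SAMPLE):
        print(_line)
    # Expected output (three sentences, one per line):
    #   یہ ایک جملہ ہے۔
    #   کیا آپ ٹھیک ہیں؟
    #   میں ٹھیک ہوں۔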