Spaces:
Sleeping
Sleeping
File size: 3,906 Bytes
ac901c7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
# coding: utf8
"""Rule based Sentence tokenization module"""
# Global Variables
# Conjunction / continuation words: when the word that follows a
# sentence-final word (see _URDU_NEWLINE_WORDS) is one of these, the
# tokenizer does NOT insert a sentence break there.
_URDU_CONJUNCTIONS = [
    "جنہیں",
    "جس",
    "جن",
    "جو",
    "اور",
    "اگر",
    "اگرچہ",
    "لیکن",
    "مگر",
    "پر",
    "یا",
    "تاہم",
    "کہ",
    "کر",
    "تو",
    "گے",
    "گی",
]
# Words that typically end an Urdu sentence (verb/auxiliary forms): the
# tokenizer inserts a sentence break after one of these, unless the next
# word is a conjunction from _URDU_CONJUNCTIONS.
_URDU_NEWLINE_WORDS = [
    "کیجیے",
    "کیجئے",
    "گئیں",
    "تھیں",
    "ہوں",
    "خریدا",
    "گے",
    "ہونگے",
    "گا",
    "چاہیے",
    "ہوئیں",
    "گی",
    "تھا",
    "تھی",
    "تھے",
    "ہیں",
    "ہے",
]
def _split_and_keep(_str, separator):
"""Replace end of sentence with separator"""
if not _str:
return []
max_p = chr(ord(max(_str)) + 1)
return _str.replace(separator, separator + max_p).split(max_p)
def _mark_breaks(words):
    """Rejoin *words* into one string, inserting a newline after each word
    that typically ends an Urdu sentence.

    A break is inserted after a word from ``_URDU_NEWLINE_WORDS`` only
    when a following word exists and is not in ``_URDU_CONJUNCTIONS``.
    If the following word is a full stop (۔) or comma (،), that mark is
    kept on the same line before the break and then skipped.  Every word
    is emitted with a leading space, matching the original accumulation
    format; callers strip the result.

    Args:
        words (list): sentence split into words

    Returns:
        str: the words rejoined, with newline characters at break points
    """
    joined = ""
    skip_next = False
    for index, word in enumerate(words):
        if skip_next:
            # This word (a ۔/، mark) was already emitted with the
            # previous word.
            skip_next = False
            continue
        follower = words[index + 1] if index + 1 < len(words) else None
        if (
            word in _URDU_NEWLINE_WORDS
            and follower is not None
            and follower not in _URDU_CONJUNCTIONS
        ):
            if follower in ("۔", "،"):
                joined += " " + word + " " + follower + "\n"
                skip_next = True
            else:
                joined += " " + word + "\n"
        else:
            joined += " " + word
    return joined


def _generate_sentences(text: str) -> list:
    """Generate a list of urdu sentences from a given string.

    This function automatically fixes multiple whitespaces
    or new lines so you just need to pass the data and
    get sentences in return.

    Args:
        text (str): base string

    Returns:
        list: detected sentences; fragments shorter than two words
        are discarded
    """
    all_sentences = []
    for piece in _split_and_keep(text, "۔"):
        # Skip empty pieces and one-word fragments.
        if not piece or len(piece.split()) < 2:
            continue
        # A question mark also terminates a sentence, so split on it too;
        # otherwise process the full-stop-delimited piece as one chunk.
        chunks = _split_and_keep(piece, "؟") if "؟" in piece else [piece]
        for chunk in chunks:
            for sen in _mark_breaks(chunk.split()).split("\n"):
                if sen and len(sen.split()) >= 2:
                    all_sentences.append(sen.strip())
    return all_sentences
|