# coding: utf8
"""Rule based Sentence tokenization module"""
# Global Variables
_URDU_CONJUNCTIONS = [ | |
"جنہیں", | |
"جس", | |
"جن", | |
"جو", | |
"اور", | |
"اگر", | |
"اگرچہ", | |
"لیکن", | |
"مگر", | |
"پر", | |
"یا", | |
"تاہم", | |
"کہ", | |
"کر", | |
"تو", | |
"گے", | |
"گی", | |
] | |
_URDU_NEWLINE_WORDS = [ | |
"کیجیے", | |
"کیجئے", | |
"گئیں", | |
"تھیں", | |
"ہوں", | |
"خریدا", | |
"گے", | |
"ہونگے", | |
"گا", | |
"چاہیے", | |
"ہوئیں", | |
"گی", | |
"تھا", | |
"تھی", | |
"تھے", | |
"ہیں", | |
"ہے", | |
] | |
def _split_and_keep(_str, separator): | |
"""Replace end of sentence with separator""" | |
if not _str: | |
return [] | |
max_p = chr(ord(max(_str)) + 1) | |
return _str.replace(separator, separator + max_p).split(max_p) | |
def _generate_sentences(text: str) -> list:
    """Generate a list of Urdu sentences from a given string.

    The text is split on the Urdu full stop "۔" and, within those
    chunks, on the question mark "؟" when present.  Additional breaks
    are inserted after clause-ending words (``_URDU_NEWLINE_WORDS``)
    unless the next word is a conjunction.  This function automatically
    fixes multiple whitespaces or new lines so you just need to pass
    the data and get sentences in return.

    Args:
        text (str): base string

    Returns:
        list: sentences, each stripped and containing at least two words
    """
    all_sentences = []
    for chunk in _split_and_keep(text, "۔"):
        # Skip empty chunks and one-word fragments.
        if not chunk or len(chunk.split()) < 2:
            continue
        # Question marks end sentences too: split on them when present.
        if "؟" in chunk:
            word_lists = [piece.split() for piece in _split_and_keep(chunk, "؟")]
        else:
            word_lists = [chunk.split()]
        for words in word_lists:
            for sen in _mark_breaks(words).split("\n"):
                if sen and len(sen.split()) >= 2:
                    all_sentences.append(sen.strip())
    return all_sentences


def _mark_breaks(words: list) -> str:
    """Join *words* into one string, inserting "\\n" after clause-ending
    words.

    A break is added after a word in ``_URDU_NEWLINE_WORDS`` when it is
    not the last word and the next word is not a conjunction.  If the
    next word is "۔" or "،", that punctuation is kept on the same line
    before the break.  (This replaces two duplicated copies of the same
    loop that previously lived inside ``_generate_sentences``.)
    """
    out = ""
    skip_next = False
    for index, word in enumerate(words):
        if skip_next:
            # Punctuation was already emitted together with the
            # previous clause-ending word.
            skip_next = False
            continue
        nxt = words[index + 1] if index + 1 < len(words) else None
        if (
            word in _URDU_NEWLINE_WORDS
            and nxt is not None
            and nxt not in _URDU_CONJUNCTIONS
        ):
            if nxt in ("۔", "،"):
                out += " " + word + " " + nxt + "\n"
                skip_next = True
            else:
                out += " " + word + "\n"
        else:
            out += " " + word
    return out