# coding: utf8
"""Rule based Sentence tokenization module"""
# Global Variables
_URDU_CONJUNCTIONS = [ | |
"جنہیں", | |
"جس", | |
"جن", | |
"جو", | |
"اور", | |
"اگر", | |
"اگرچہ", | |
"لیکن", | |
"مگر", | |
"پر", | |
"یا", | |
"تاہم", | |
"کہ", | |
"کر", | |
"تو", | |
"گے", | |
"گی", | |
] | |
_URDU_NEWLINE_WORDS = [ | |
"کیجیے", | |
"کیجئے", | |
"گئیں", | |
"تھیں", | |
"ہوں", | |
"خریدا", | |
"گے", | |
"ہونگے", | |
"گا", | |
"چاہیے", | |
"ہوئیں", | |
"گی", | |
"تھا", | |
"تھی", | |
"تھے", | |
"ہیں", | |
"ہے", | |
] | |
def _split_and_keep(_str, separator): | |
"""Replace end of sentence with separator""" | |
if not _str: | |
return [] | |
max_p = chr(ord(max(_str)) + 1) | |
return _str.replace(separator, separator + max_p).split(max_p) | |
def _generate_sentences(text: str) -> list:
    """Generate a list of Urdu sentences from a given string.

    The text is split on the Urdu full stop "۔" and, within those
    chunks, on the question mark "؟" when present.  Additional breaks
    are inserted after clause-ending words (``_URDU_NEWLINE_WORDS``)
    unless the next word is a conjunction.  This function automatically
    fixes multiple whitespaces or new lines so you just need to pass
    the data and get sentences in return.

    Args:
        text (str): base string

    Returns:
        list: sentences, each stripped and containing at least two words
    """
    all_sentences = []
    for chunk in _split_and_keep(text, "۔"):
        # Skip empty chunks and one-word fragments.
        if not chunk or len(chunk.split()) < 2:
            continue
        # Question marks end sentences too: split on them when present.
        if "؟" in chunk:
            word_lists = [piece.split() for piece in _split_and_keep(chunk, "؟")]
        else:
            word_lists = [chunk.split()]
        for words in word_lists:
            for sen in _mark_breaks(words).split("\n"):
                if sen and len(sen.split()) >= 2:
                    all_sentences.append(sen.strip())
    return all_sentences


def _mark_breaks(words: list) -> str:
    """Join *words* into one string, inserting "\\n" after clause-ending
    words.

    A break is added after a word in ``_URDU_NEWLINE_WORDS`` when it is
    not the last word and the next word is not a conjunction.  If the
    next word is "۔" or "،", that punctuation is kept on the same line
    before the break.  (This replaces two duplicated copies of the same
    loop that previously lived inside ``_generate_sentences``.)
    """
    out = ""
    skip_next = False
    for index, word in enumerate(words):
        if skip_next:
            # Punctuation was already emitted together with the
            # previous clause-ending word.
            skip_next = False
            continue
        nxt = words[index + 1] if index + 1 < len(words) else None
        if (
            word in _URDU_NEWLINE_WORDS
            and nxt is not None
            and nxt not in _URDU_CONJUNCTIONS
        ):
            if nxt in ("۔", "،"):
                out += " " + word + " " + nxt + "\n"
                skip_next = True
            else:
                out += " " + word + "\n"
        else:
            out += " " + word
    return out