Spaces:

Rajendransp133
/

microservice-NMT

Sleeping

App Files Files Community

microservice-NMT / libs /indic_nlp_library /indicnlp /urduhack /preprocessing /character.py

Rajendransp133

Upload 86 files

ac901c7 verified 5 months ago

raw

history blame contribute delete

3.41 kB

	# coding: utf8
	"""
	Urduhack Character preprocess functions
	"""

	from .regexes import _SPACE_AFTER_ALL_PUNCTUATIONS_RE, _SPACE_BEFORE_ALL_PUNCTUATIONS_RE
	from .regexes import _SPACE_AFTER_DIGITS_RE, _SPACE_BEFORE_DIGITS_RE
	from .regexes import _SPACE_BEFORE_ENG_CHAR_RE, _SPACE_AFTER_ENG_CHAR_RE


	def digits_space(text: str) -> str:
	"""
	Add spaces before\|after numeric and urdu digits

	Args:
	text (str): ``Urdu`` text
	Returns:
	str: Returns a ``str`` object containing normalized text.
	Examples:
	>>> from urduhack.preprocessing import digits_space
	>>> text = "20فیصد"
	>>> normalized_text = digits_space(text)
	>>> normalized_text
	20 فیصد
	"""
	text = _SPACE_BEFORE_DIGITS_RE.sub(" ", text)
	text = _SPACE_AFTER_DIGITS_RE.sub(" ", text)

	return text


	def english_characters_space(text: str) -> str:
	"""
	Functionality to add spaces before and after English words in the given Urdu text. It is an important step in
	normalization of the Urdu data.

	this function returns a :py:class:`String` object which contains the original text with spaces before & after
	English words.

	Args:
	text (str): ``Urdu`` text
	Returns:
	str: Returns a ``str`` object containing normalized text.
	Examples:
	>>> from urduhack.preprocessing import english_characters_space
	>>> text = "خاتون Aliyaنے بچوںUzma and Aliyaکے قتل کا اعترافConfession کیا ہے۔"
	>>> normalized_text = english_characters_space(text)
	>>> normalized_text
	خاتون Aliya نے بچوں Uzma and Aliya کے قتل کا اعتراف Confession کیا ہے۔
	"""
	text = _SPACE_BEFORE_ENG_CHAR_RE.sub(" ", text)
	text = _SPACE_AFTER_ENG_CHAR_RE.sub(" ", text)

	return text


	def all_punctuations_space(text: str) -> str:
	"""
	Add spaces after punctuations used in ``urdu`` writing

	Args:
	text (str): ``Urdu`` text
	Returns:
	str: Returns a ``str`` object containing normalized text.
	"""
	text = _SPACE_BEFORE_ALL_PUNCTUATIONS_RE.sub(" ", text)
	text = _SPACE_AFTER_ALL_PUNCTUATIONS_RE.sub(" ", text)
	return text


	def preprocess(text: str) -> str:
	"""
	To preprocess some text, all you need to do pass ``unicode`` text. It will return a ``str``
	with proper spaces after digits and punctuations.

	Args:
	text (str): ``Urdu`` text
	Returns:
	str: urdu text
	Raises:
	TypeError: If text param is not not str Type.
	Examples:
	>>> from urduhack.preprocessing import preprocess
	>>> text = "اَباُوگل پاکستان ﻤﯿﮟ 20 سال ﺳﮯ ، وسائل کی کوئی کمی نہیں ﮨﮯ۔"
	>>> normalized_text = preprocess(text)
	>>> # The text now contains proper spaces after digits and punctuations,
	>>> # normalized characters and no diacritics!
	>>> normalized_text
	اباوگل پاکستان ﻤﯿﮟ 20 سال ﺳﮯ ، وسائل کی کوئی کمی نہیں ﮨﮯ ۔
	"""
	if not isinstance(text, str):
	raise TypeError("text must be str type.")

	text = digits_space(text)
	text = all_punctuations_space(text)
	text = english_characters_space(text)
	return text