Spaces:
Sleeping
Sleeping
# coding: utf8 | |
""" | |
Preprocessing utilities | |
""" | |
import sys | |
import unicodedata | |
import regex as re | |
CURRENCIES = { | |
"$": "USD", | |
"zł": "PLN", | |
"£": "GBP", | |
"¥": "JPY", | |
"฿": "THB", | |
"₡": "CRC", | |
"₦": "NGN", | |
"₩": "KRW", | |
"₪": "ILS", | |
"₫": "VND", | |
"€": "EUR", | |
"₱": "PHP", | |
"₲": "PYG", | |
"₴": "UAH", | |
"₹": "INR", | |
} | |
_EMAIL_RE = re.compile( | |
r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))", | |
flags=re.IGNORECASE | re.UNICODE, | |
) | |
_PHONE_RE = re.compile( | |
r"(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?" | |
r"|[#x-])\s?\d{2,6})?(?:$|(?=\W))" | |
) | |
_NUMBERS_RE = re.compile( | |
r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)" | |
r"|(\d*?[.,]\d+)|\d+)(?:$|(?=\b))" | |
) | |
_CURRENCY_RE = re.compile("({})+".format("|".join(re.escape(c) for c in CURRENCIES))) | |
_LINEBREAK_RE = re.compile(r"((\r\n)|[\n\v])+") | |
_NONBREAKING_SPACE_RE = re.compile(r"(?!\n)\s+") | |
_URL_RE = re.compile( | |
r"(?:^|(?<![\w/.]))" | |
# protocol identifier | |
# r"(?:(?:https?|ftp)://)" <-- alt? | |
r"(?:(?:https?://|ftp://|www\d{0,3}\.))" | |
# user:pass authentication | |
r"(?:\S+(?::\S*)?@)?" r"(?:" | |
# IP address exclusion | |
# private & local networks | |
r"(?!(?:10|127)(?:\.\d{1,3}){3})" | |
r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})" | |
r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})" | |
# IP address dotted notation octets | |
# excludes loopback network 0.0.0.0 | |
# excludes reserved space >= 224.0.0.0 | |
# excludes network & broadcast addresses | |
# (first & last IP address of each class) | |
r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" | |
r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" | |
r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" | |
r"|" | |
# host name | |
r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)" | |
# domain name | |
r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*" | |
# TLD identifier | |
r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" r")" | |
# port number | |
r"(?::\d{2,5})?" | |
# resource path | |
r"(?:/\S*)?" r"(?:$|(?![\w?!+&/]))", | |
flags=re.UNICODE | re.IGNORECASE, | |
) # source: https://gist.github.com/dperini/729294 | |
_SHORT_URL_RE = re.compile( | |
r"(?:^|(?<![\w/.]))" | |
# optional scheme | |
r"(?:(?:https?://)?)" | |
# domain | |
r"(?:\w-?)*?\w+(?:\.[a-z]{2,12}){1,3}" r"/" | |
# hash | |
r"[^\s.,?!'\"|+]{2,12}" r"(?:$|(?![\w?!+&/]))", | |
flags=re.IGNORECASE, | |
) | |
def normalize_whitespace(text: str): | |
""" | |
Given ``text`` str, replace one or more spacings with a single space, and one | |
or more linebreaks with a single newline. Also strip leading/trailing whitespace. | |
Args: | |
text (str): ``Urdu`` text | |
Returns: | |
str: Returns a ``str`` object containing normalized text. | |
Examples: | |
>>> from urduhack.preprocessing import normalize_whitespace | |
>>> text = "عراق اور شام اعلان کیا ہے دونوں جلد اپنے گے؟" | |
>>> normalized_text = normalize_whitespace(text) | |
>>> normalized_text | |
عراق اور شام اعلان کیا ہے دونوں جلد اپنے گے؟ | |
""" | |
return _NONBREAKING_SPACE_RE.sub(" ", _LINEBREAK_RE.sub(r"\n", text)).strip() | |
def replace_urls(text: str, replace_with=""): | |
""" | |
Replace all URLs in ``text`` str with ``replace_with`` str. | |
Args: | |
text (str): ``Urdu`` text | |
replace_with (str): Replace string | |
Returns: | |
str: Returns a ``str`` object replace url with ``replace_with`` text. | |
Examples: | |
>>> from urduhack.preprocessing import replace_urls | |
>>> text = "20 www.gmail.com فیصد" | |
>>> replace_urls(text) | |
'20 فیصد' | |
""" | |
return _URL_RE.sub(replace_with, _SHORT_URL_RE.sub(replace_with, text)) | |
def replace_emails(text: str, replace_with=""): | |
""" | |
Replace all emails in ``text`` str with ``replace_with`` str. | |
Args: | |
text (str): ``Urdu`` text | |
replace_with (str): Replace string | |
Returns: | |
str: Returns a ``str`` object replace emails with ``replace_with`` text. | |
Examples: | |
>>> text = "20 gunner@gmail.com فیصد" | |
>>> from urduhack.preprocessing import replace_emails | |
>>> replace_emails(text) | |
""" | |
return _EMAIL_RE.sub(replace_with, text) | |
def replace_phone_numbers(text: str, replace_with=""): | |
""" | |
Replace all phone numbers in ``text`` str with ``replace_with`` str. | |
Args: | |
text (str): ``Urdu`` text | |
replace_with (str): Replace string | |
Returns: | |
str: Returns a ``str`` object replace number_no with ``replace_with`` text. | |
Examples: | |
>>> from urduhack.preprocessing import replace_numbers | |
>>> text = "20 فیصد" | |
>>> replace_numbers(text) | |
' فیصد' | |
""" | |
return _PHONE_RE.sub(replace_with, text) | |
def replace_numbers(text: str, replace_with=""): | |
""" | |
Replace all numbers in ``text`` str with ``replace_with`` str. | |
Args: | |
text (str): ``Urdu`` text | |
replace_with (str): Replace string | |
Returns: | |
str: Returns a ``str`` object replace number with ``replace_with`` text. | |
Examples: | |
>>> from urduhack.preprocessing import replace_phone_numbers | |
>>> text = "یعنی لائن آف کنٹرول پر فائربندی کا معاہدہ 555-123-4567 میں ہوا تھا" | |
>>> replace_phone_numbers(text) | |
'یعنی لائن آف کنٹرول پر فائربندی کا معاہدہ میں ہوا تھا' | |
""" | |
return _NUMBERS_RE.sub(replace_with, text) | |
def replace_currency_symbols(text: str, replace_with=None): | |
""" | |
Replace all currency symbols in ``text`` str with string specified by ``replace_with`` str. | |
Args: | |
text (str): Raw text | |
replace_with (str): if None (default), replace symbols with | |
their standard 3-letter abbreviations (e.g. '$' with 'USD', '£' with 'GBP'); | |
otherwise, pass in a string with which to replace all symbols | |
(e.g. "*CURRENCY*") | |
Returns: | |
str: Returns a ``str`` object containing normalized text. | |
Examples: | |
>>> from urduhack.preprocessing import replace_currency_symbols | |
>>> text = "یعنی لائن آف کنٹرول پر فائربندی کا معاہدہ 2003 میں ہوا 33$ تھا۔" | |
>>> replace_currency_symbols(text) | |
'یعنی لائن آف کنٹرول پر فائربندی کا معاہدہ 2003 میں ہوا 33USD تھا۔' | |
""" | |
if replace_with is None: | |
for key, value in CURRENCIES.items(): | |
text = text.replace(key, value) | |
return text | |
return _CURRENCY_RE.sub(replace_with, text) | |
PUNCTUATION_TRANSLATE_UNICODE = dict.fromkeys( | |
(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")), | |
"", | |
) | |
def remove_punctuation(text: str, marks=None) -> str: | |
""" | |
Remove punctuation from ``text`` by removing all instances of ``marks``. | |
Args: | |
text (str): Urdu text | |
marks (str): If specified, remove only the characters in this string, | |
e.g. ``marks=',;:'`` removes commas, semi-colons, and colons. | |
Otherwise, all punctuation marks are removed. | |
Returns: | |
str: returns a ``str`` object containing normalized text. | |
Note: | |
When ``marks=None``, Python's built-in :meth:`str.translate()` is | |
used to remove punctuation; otherwise, a regular expression is used | |
instead. The former's performance is about 5-10x faster. | |
Examples: | |
>>> from urduhack.preprocessing import remove_punctuation | |
>>> output = remove_punctuation("کر ؟ سکتی ہے۔") | |
کر سکتی ہے | |
""" | |
if marks: | |
return re.sub("[{}]+".format(re.escape(marks)), "", text, flags=re.UNICODE) | |
return text.translate(PUNCTUATION_TRANSLATE_UNICODE) | |
def remove_accents(text: str) -> str: | |
""" | |
Remove accents from any accented unicode characters in ``text`` str, either by | |
transforming them into ascii equivalents or removing them entirely. | |
Args: | |
text (str): Urdu text | |
Returns: | |
str | |
Examples: | |
>>> from urduhack.preprocessing import remove_accents | |
>>>text = "دالتِ عظمیٰ درخواست" | |
>>> remove_accents(text) | |
'دالت عظمی درخواست' | |
""" | |
return "".join(c for c in text if not unicodedata.combining(c)) | |
def remove_english_alphabets(text: str): | |
""" | |
Removes ``English`` words and digits from a ``text`` | |
Args: | |
text (str): Urdu text | |
Returns: | |
str: ``str`` object with english alphabets removed | |
""" | |
characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890" | |
table = str.maketrans({key: None for key in characters}) | |
return text.translate(table) | |