Spaces:
Sleeping
Sleeping
# coding: utf8 | |
"""List of Regex for preprocess""" | |
import string | |
import regex as re | |
from indicnlp.urduhack.urdu_characters import URDU_ALL_CHARACTERS, URDU_PUNCTUATIONS | |
# Add spaces before|after numeric number and urdu words | |
# 18سالہ , 20فیصد | |
_EXCEPT_HAMZA = list(filter(lambda c: c != "\u0621", URDU_ALL_CHARACTERS)) | |
_SPACE_BEFORE_DIGITS_RE = re.compile( | |
r"(?<=[" + "".join(URDU_ALL_CHARACTERS) + "])(?=[0-9])", flags=re.U | re.M | re.I | |
) | |
_SPACE_AFTER_DIGITS_RE = re.compile( | |
r"(?<=[0-9])(?=[" + "".join(_EXCEPT_HAMZA) + "])", flags=re.U | re.M | re.I | |
) | |
# Add spaces before|after english characters and urdu words | |
# ikramسالہ , abفیصد | |
_SPACE_BEFORE_ENG_CHAR_RE = re.compile( | |
r"(?<=[" + "".join(URDU_ALL_CHARACTERS) + "])(?=[a-zA-Z])", flags=re.U | re.M | re.I | |
) | |
_SPACE_AFTER_ENG_CHAR_RE = re.compile( | |
r"(?<=[a-zA-Z])(?=[" + "".join(URDU_ALL_CHARACTERS) + "])", flags=re.U | re.M | re.I | |
) | |
# add space before and after all PUNCTUATIONS | |
_ALL_PUNCTUATIONS: str = "".join(URDU_PUNCTUATIONS) + "".join(string.punctuation) | |
_SPACE_BEFORE_ALL_PUNCTUATIONS_RE = re.compile( | |
r"(?<=[" | |
+ "".join(URDU_ALL_CHARACTERS) | |
+ "])(?=[" | |
+ "".join(_ALL_PUNCTUATIONS) | |
+ "])", | |
flags=re.U | re.M | re.I, | |
) | |
_SPACE_AFTER_ALL_PUNCTUATIONS_RE = re.compile( | |
r"(?<=[" | |
+ "".join(_ALL_PUNCTUATIONS) | |
+ "])(?=[^" | |
+ "".join(_ALL_PUNCTUATIONS) | |
+ "0-9 \n])", | |
flags=re.U | re.M | re.I, | |
) | |