Rajendransp133's picture
Upload 86 files
ac901c7 verified
# coding: utf8
"""List of Regex for preprocess"""
import string
import regex as re
from indicnlp.urduhack.urdu_characters import URDU_ALL_CHARACTERS, URDU_PUNCTUATIONS
# Zero-width boundaries between Urdu characters and ASCII digits, i.e. the
# positions where a space is to be added before|after a number:
# 18سالہ , 20فیصد
# Both patterns consist only of lookarounds, so they match a position and
# consume no text.
# Hamza (U+0621) is left out of the "after digits" class, so a digit that is
# directly followed by hamza does not match.
_EXCEPT_HAMZA = [char for char in URDU_ALL_CHARACTERS if char != "\u0621"]
_SPACE_BEFORE_DIGITS_RE = re.compile(
    f"(?<=[{''.join(URDU_ALL_CHARACTERS)}])(?=[0-9])",
    flags=re.I | re.M | re.U,
)
_SPACE_AFTER_DIGITS_RE = re.compile(
    f"(?<=[0-9])(?=[{''.join(_EXCEPT_HAMZA)}])",
    flags=re.I | re.M | re.U,
)
# Zero-width boundaries between Urdu characters and English letters, i.e. the
# positions where a space is to be added before|after an English word:
# ikramسالہ , abفیصد
# Both patterns consist only of lookarounds, so they match a position and
# consume no text.
_SPACE_BEFORE_ENG_CHAR_RE = re.compile(
    f"(?<=[{''.join(URDU_ALL_CHARACTERS)}])(?=[a-zA-Z])",
    flags=re.I | re.M | re.U,
)
_SPACE_AFTER_ENG_CHAR_RE = re.compile(
    f"(?<=[a-zA-Z])(?=[{''.join(URDU_ALL_CHARACTERS)}])",
    flags=re.I | re.M | re.U,
)
# Zero-width boundaries around punctuation marks (Urdu and ASCII), i.e. the
# positions where a space is to be added before and after all PUNCTUATIONS.
#
# _ALL_PUNCTUATIONS keeps the raw, unescaped characters.  The escaped copy
# below is what goes inside the [...] character classes: string.punctuation
# contains "\", "]", "^" and "-", which are metacharacters inside a class.
# Spliced in unescaped, its "[\]^" run was parsed as "[", an escaped "]" and
# "^", so the backslash itself silently dropped out of the punctuation set,
# and ",-." was parsed as a range that only covered "-" by coincidence.
_ALL_PUNCTUATIONS: str = "".join(URDU_PUNCTUATIONS) + string.punctuation
_ESCAPED_PUNCTUATIONS: str = re.escape(_ALL_PUNCTUATIONS)

# Position between an Urdu character and a following punctuation mark.
_SPACE_BEFORE_ALL_PUNCTUATIONS_RE = re.compile(
    f"(?<=[{''.join(URDU_ALL_CHARACTERS)}])(?=[{_ESCAPED_PUNCTUATIONS}])",
    flags=re.U | re.M | re.I,
)

# Position between a punctuation mark and a following character that is not
# itself punctuation, an ASCII digit, a space or a newline.  The pattern
# string is deliberately not raw: "\n" must reach the regex as a real newline.
_SPACE_AFTER_ALL_PUNCTUATIONS_RE = re.compile(
    f"(?<=[{_ESCAPED_PUNCTUATIONS}])(?=[^{_ESCAPED_PUNCTUATIONS}0-9 \n])",
    flags=re.U | re.M | re.I,
)