Spaces:
Sleeping
Sleeping
File size: 1,482 Bytes
ac901c7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
# coding: utf8
"""Compiled regular expressions used for text preprocessing."""
import string
import regex as re
from indicnlp.urduhack.urdu_characters import URDU_ALL_CHARACTERS, URDU_PUNCTUATIONS
# Zero-width boundaries between ASCII digits and Urdu characters, i.e. the
# places where a space belongs in text such as: 18سالہ , 20فیصد
# Every member of URDU_ALL_CHARACTERS except U+0621 (hamza), kept as a list.
_EXCEPT_HAMZA = [char for char in URDU_ALL_CHARACTERS if char != "\u0621"]
# Position preceded by a character from URDU_ALL_CHARACTERS and followed by a digit.
_SPACE_BEFORE_DIGITS_RE = re.compile(
    rf"(?<=[{''.join(URDU_ALL_CHARACTERS)}])(?=[0-9])",
    flags=re.UNICODE | re.MULTILINE | re.IGNORECASE,
)
# Position preceded by a digit and followed by any of those characters but hamza.
_SPACE_AFTER_DIGITS_RE = re.compile(
    rf"(?<=[0-9])(?=[{''.join(_EXCEPT_HAMZA)}])",
    flags=re.UNICODE | re.MULTILINE | re.IGNORECASE,
)
# Zero-width boundaries between ASCII letters and Urdu characters, i.e. the
# places where a space belongs in text such as: ikramسالہ , abفیصد
# Position preceded by a character from URDU_ALL_CHARACTERS and followed by a-z/A-Z.
_SPACE_BEFORE_ENG_CHAR_RE = re.compile(
    rf"(?<=[{''.join(URDU_ALL_CHARACTERS)}])(?=[a-zA-Z])",
    flags=re.UNICODE | re.MULTILINE | re.IGNORECASE,
)
# Position preceded by a-z/A-Z and followed by a character from URDU_ALL_CHARACTERS.
_SPACE_AFTER_ENG_CHAR_RE = re.compile(
    rf"(?<=[a-zA-Z])(?=[{''.join(URDU_ALL_CHARACTERS)}])",
    flags=re.UNICODE | re.MULTILINE | re.IGNORECASE,
)
# Zero-width boundaries around punctuation (Urdu marks plus ASCII
# string.punctuation), i.e. the places where a space belongs before and
# after a punctuation mark.
# Raw, unescaped string of every punctuation mark (unchanged value).
_ALL_PUNCTUATIONS: str = "".join(URDU_PUNCTUATIONS) + string.punctuation
# Character-class-safe form of _ALL_PUNCTUATIONS. string.punctuation contains
# the runs `[\]^` and `,-.`; spliced raw into `[...]`, the backslash merely
# escapes the `]` (so a literal backslash was never matched as punctuation)
# and `-` is parsed as a range operator. Escaping makes every mark a literal
# member of the class.
_ALL_PUNCTUATIONS_CLASS: str = re.escape(_ALL_PUNCTUATIONS)
# Position preceded by a character from URDU_ALL_CHARACTERS and followed by a
# punctuation mark.
# NOTE(review): URDU_ALL_CHARACTERS is interpolated unescaped, as in the
# sibling patterns; presumably it holds no class metacharacters - confirm.
_SPACE_BEFORE_ALL_PUNCTUATIONS_RE = re.compile(
    r"(?<=["
    + "".join(URDU_ALL_CHARACTERS)
    + "])(?=["
    + _ALL_PUNCTUATIONS_CLASS
    + "])",
    flags=re.U | re.M | re.I,
)
# Position preceded by a punctuation mark and followed by a character that is
# not punctuation, an ASCII digit, a space or a newline.
_SPACE_AFTER_ALL_PUNCTUATIONS_RE = re.compile(
    r"(?<=["
    + _ALL_PUNCTUATIONS_CLASS
    + "])(?=[^"
    + _ALL_PUNCTUATIONS_CLASS
    + "0-9 \n])",
    flags=re.U | re.M | re.I,
)
|