File size: 3,414 Bytes
ac901c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# coding: utf8
"""

Urduhack Character preprocess functions

"""

from .regexes import _SPACE_AFTER_ALL_PUNCTUATIONS_RE, _SPACE_BEFORE_ALL_PUNCTUATIONS_RE
from .regexes import _SPACE_AFTER_DIGITS_RE, _SPACE_BEFORE_DIGITS_RE
from .regexes import _SPACE_BEFORE_ENG_CHAR_RE, _SPACE_AFTER_ENG_CHAR_RE


def digits_space(text: str) -> str:
    """
    Separate numeric and Urdu digits from the text they are attached to.

    Runs ``_SPACE_BEFORE_DIGITS_RE`` followed by ``_SPACE_AFTER_DIGITS_RE``
    over the input, substituting a single space for every match of each
    pattern. The patterns themselves are defined in the ``regexes`` module.

    Args:
        text (str): ``Urdu`` text

    Returns:
        str: The text after both substitutions have been applied.

    Examples:
        >>> from urduhack.preprocessing import digits_space
        >>> text = "20فیصد"
        >>> digits_space(text)
        '20 فیصد'
    """
    # Order matters: the "before" pattern runs first, then the "after" one.
    for pattern in (_SPACE_BEFORE_DIGITS_RE, _SPACE_AFTER_DIGITS_RE):
        text = pattern.sub(" ", text)
    return text


def english_characters_space(text: str) -> str:
    """
    Put spaces before and after English words embedded in Urdu text.

    The input is passed through ``_SPACE_BEFORE_ENG_CHAR_RE`` and then
    ``_SPACE_AFTER_ENG_CHAR_RE``; every match of either pattern is replaced
    with a single space. This is one of the spacing steps used when
    normalizing Urdu data.

    Args:
        text (str): ``Urdu`` text

    Returns:
        str: The text after both substitutions have been applied.

    Examples:
        >>> from urduhack.preprocessing import english_characters_space
        >>> text = "خاتون Aliyaنے بچوںUzma and Aliyaکے قتل کا اعترافConfession کیا ہے۔"
        >>> english_characters_space(text)
        'خاتون Aliya نے بچوں Uzma and Aliya کے قتل کا اعتراف Confession کیا ہے۔'
    """
    leading_spaced = _SPACE_BEFORE_ENG_CHAR_RE.sub(" ", text)
    return _SPACE_AFTER_ENG_CHAR_RE.sub(" ", leading_spaced)


def all_punctuations_space(text: str) -> str:
    """
    Fix spacing around punctuation used in ``urdu`` writing.

    Applies ``_SPACE_BEFORE_ALL_PUNCTUATIONS_RE`` first and
    ``_SPACE_AFTER_ALL_PUNCTUATIONS_RE`` second, replacing every match of
    each pattern with a single space. What exactly counts as a match is
    determined by those patterns in the ``regexes`` module.

    Args:
        text (str): ``Urdu`` text

    Returns:
        str: The text after both substitutions have been applied.
    """
    # Inner call handles the "before" pattern; the outer one the "after" pattern.
    return _SPACE_AFTER_ALL_PUNCTUATIONS_RE.sub(
        " ",
        _SPACE_BEFORE_ALL_PUNCTUATIONS_RE.sub(" ", text),
    )


def preprocess(text: str) -> str:
    """
    Run every spacing step of this module over ``text``.

    The text is passed, in order, through :func:`digits_space`,
    :func:`all_punctuations_space` and :func:`english_characters_space`,
    and the final result is returned. Only these spacing helpers are
    applied here; no other normalization happens in this function.

    Args:
        text (str): ``Urdu`` text

    Returns:
        str: The text after all three spacing steps.

    Raises:
        TypeError: If ``text`` is not a ``str``.

    Examples:
        >>> from urduhack.preprocessing import preprocess
        >>> text = "اَباُوگل پاکستان ﻤﯿﮟ 20 سال ﺳﮯ ، وسائل کی کوئی کمی نہیں ﮨﮯ۔"
        >>> spaced_text = preprocess(text)
    """
    # Validate before touching any helper so bad input fails fast.
    if not isinstance(text, str):
        raise TypeError("text must be str type.")

    # The steps run in this fixed order: digits, punctuation, English words.
    pipeline = (digits_space, all_punctuations_space, english_characters_space)
    for step in pipeline:
        text = step(text)
    return text