File size: 3,068 Bytes
16f0ad7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import re
_letters_and_numbers_re = re.compile(
    r"((?:[a-zA-Z]+[0-9]|[0-9]+[a-zA-Z])[a-zA-Z0-9']*)", re.IGNORECASE)

_hardware_re = re.compile(
    '([0-9]+(?:[.,][0-9]+)?)(?:\s?)(tb|gb|mb|kb|ghz|mhz|khz|hz|mm)', re.IGNORECASE)
_hardware_key = {'tb': 'terabyte',
                 'gb': 'gigabyte',
                 'mb': 'megabyte',
                 'kb': 'kilobyte',
                 'ghz': 'gigahertz',
                 'mhz': 'megahertz',
                 'khz': 'kilohertz',
                 'hz': 'hertz',
                 'mm': 'millimeter',
                 'cm': 'centimeter',
                 'km': 'kilometer'}

_dimension_re = re.compile(
    r'\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b|\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b')
_dimension_key = {'m': 'meter',
                  'in': 'inch',
                  'inch': 'inch'}




def _expand_letters_and_numbers(m):
    text = re.split(r'(\d+)', m.group(0))

    # remove trailing space
    if text[-1] == '':
        text = text[:-1]
    elif text[0] == '':
        text = text[1:]

    # if not like 1920s, or AK47's , 20th, 1st, 2nd, 3rd, etc...
    if text[-1] in ("'s", "s", "th", "nd", "st", "rd") and text[-2].isdigit():
        text[-2] = text[-2] + text[-1]
        text = text[:-1]

    # for combining digits 2 by 2
    new_text = []
    for i in range(len(text)):
        string = text[i]
        if string.isdigit() and len(string) < 5:
            # heuristics
            if len(string) > 2 and string[-2] == '0':
                if string[-1] == '0':
                    string = [string]
                else:
                    string = [string[:-2], string[-2], string[-1]]
            elif len(string) % 2 == 0:
                string = [string[i:i+2] for i in range(0, len(string), 2)]
            elif len(string) > 2:
                string = [string[0]] + [string[i:i+2] for i in range(1, len(string), 2)]
            new_text.extend(string)
        else:
            new_text.append(string)

    text = new_text
    text = " ".join(text)
    return text


def _expand_hardware(m):
    quantity, measure = m.groups(0)
    measure = _hardware_key[measure.lower()]
    if measure[-1] != 'z' and float(quantity.replace(',', '')) > 1:
        return "{} {}s".format(quantity, measure)
    return "{} {}".format(quantity, measure)


def _expand_dimension(m):
    text = "".join([x for x in m.groups(0) if x != 0])
    text = text.replace(' x ', ' by ')
    text = text.replace('x', ' by ')
    if text.endswith(tuple(_dimension_key.keys())):
        if text[-2].isdigit():
            text = "{} {}".format(text[:-1], _dimension_key[text[-1:]])
        elif text[-3].isdigit():
            text = "{} {}".format(text[:-2], _dimension_key[text[-2:]])
    return text


def normalize_letters_and_numbers(text):
    text = re.sub(_hardware_re, _expand_hardware, text)
    text = re.sub(_dimension_re, _expand_dimension, text)
    text = re.sub(_letters_and_numbers_re, _expand_letters_and_numbers, text)
    return text