import re _letters_and_numbers_re = re.compile( r"((?:[a-zA-Z]+[0-9]|[0-9]+[a-zA-Z])[a-zA-Z0-9']*)", re.IGNORECASE) _hardware_re = re.compile( '([0-9]+(?:[.,][0-9]+)?)(?:\s?)(tb|gb|mb|kb|ghz|mhz|khz|hz|mm)', re.IGNORECASE) _hardware_key = {'tb': 'terabyte', 'gb': 'gigabyte', 'mb': 'megabyte', 'kb': 'kilobyte', 'ghz': 'gigahertz', 'mhz': 'megahertz', 'khz': 'kilohertz', 'hz': 'hertz', 'mm': 'millimeter', 'cm': 'centimeter', 'km': 'kilometer'} _dimension_re = re.compile( r'\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b|\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b') _dimension_key = {'m': 'meter', 'in': 'inch', 'inch': 'inch'} def _expand_letters_and_numbers(m): text = re.split(r'(\d+)', m.group(0)) # remove trailing space if text[-1] == '': text = text[:-1] elif text[0] == '': text = text[1:] # if not like 1920s, or AK47's , 20th, 1st, 2nd, 3rd, etc... if text[-1] in ("'s", "s", "th", "nd", "st", "rd") and text[-2].isdigit(): text[-2] = text[-2] + text[-1] text = text[:-1] # for combining digits 2 by 2 new_text = [] for i in range(len(text)): string = text[i] if string.isdigit() and len(string) < 5: # heuristics if len(string) > 2 and string[-2] == '0': if string[-1] == '0': string = [string] else: string = [string[:-2], string[-2], string[-1]] elif len(string) % 2 == 0: string = [string[i:i+2] for i in range(0, len(string), 2)] elif len(string) > 2: string = [string[0]] + [string[i:i+2] for i in range(1, len(string), 2)] new_text.extend(string) else: new_text.append(string) text = new_text text = " ".join(text) return text def _expand_hardware(m): quantity, measure = m.groups(0) measure = _hardware_key[measure.lower()] if measure[-1] != 'z' and float(quantity.replace(',', '')) > 1: return "{} {}s".format(quantity, measure) return "{} {}".format(quantity, measure) def _expand_dimension(m): text = "".join([x for x in m.groups(0) if x != 0]) text = text.replace(' x ', ' by ') text = text.replace('x', ' by ') if text.endswith(tuple(_dimension_key.keys())): if text[-2].isdigit(): text = "{} {}".format(text[:-1], _dimension_key[text[-1:]]) elif text[-3].isdigit(): text = "{} {}".format(text[:-2], _dimension_key[text[-2:]]) return text def normalize_letters_and_numbers(text): text = re.sub(_hardware_re, _expand_hardware, text) text = re.sub(_dimension_re, _expand_dimension, text) text = re.sub(_letters_and_numbers_re, _expand_letters_and_numbers, text) return text