#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Parse numerals from various formats: decimal digits, Roman numerals and English/French number words.
"""
from rebulk.remodule import re

# Regexp fragment matching one to four decimal digits.
digital_numeral = r'\d{1,4}'

# Regexp fragment matching a Roman numeral. Every group of the pattern is optional, so the leading lookahead
# is what requires at least one numeral letter to be present.
roman_numeral = r'(?=[MCDLXVI]+)M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})'

# In each word list below, the position of a word is its numeric value (see __parse_word).
english_word_numeral_list = [
    'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
    'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty'
]

french_word_numeral_list = [
    'zéro', 'un', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix',
    'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dix-sept', 'dix-huit', 'dix-neuf', 'vingt'
]

# Alternative French spellings (no accent, feminine "une", no hyphen).
french_alt_word_numeral_list = [
    'zero', 'une', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix',
    'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dixsept', 'dixhuit', 'dixneuf', 'vingt'
]


def __build_word_numeral(*args):
    """
    Build word numeral regexp from word lists.

    Words are inserted verbatim (they are not escaped) as alternatives, in the order given.

    :param args: word lists to include in the regexp
    :type args: list[str]
    :return: non-capturing group matching any of the words, or a never-matching pattern if no word is given
    :rtype: str
    """
    words = [word for word_list in args for word in word_list]
    if not words:
        # No alternatives at all: return a pattern that can never match rather than failing.
        return r'(?!)'
    return r'(?:(?=\w+)' + '|'.join(words) + ')'


word_numeral = __build_word_numeral(english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list)

# Regexp fragment matching any supported numeral form.
numeral = '(?:' + digital_numeral + '|' + roman_numeral + '|' + word_numeral + ')'

# Roman symbols with their values, ordered from largest to smallest so that __parse_roman can consume
# the input greedily from left to right.
__romanNumeralMap = (
    ('M', 1000),
    ('CM', 900),
    ('D', 500),
    ('CD', 400),
    ('C', 100),
    ('XC', 90),
    ('L', 50),
    ('XL', 40),
    ('X', 10),
    ('IX', 9),
    ('V', 5),
    ('IV', 4),
    ('I', 1)
)

__romanNumeralPattern = re.compile('^' + roman_numeral + '$')


def __parse_roman(value):
    """
    Convert Roman numeral to integer.

    :param value: Value to parse (uppercase Roman numeral)
    :type value: string
    :return: integer value of the numeral
    :rtype: int
    :raise ValueError: if value is not a valid Roman numeral
    """
    if not __romanNumeralPattern.search(value):
        raise ValueError('Invalid Roman numeral: %s' % value)

    result = 0
    index = 0
    for num, integer in __romanNumeralMap:
        # Consume every consecutive occurrence of the current symbol before moving to a smaller one.
        while value.startswith(num, index):
            result += integer
            index += len(num)
    return result


def __parse_word(value):
    """
    Convert word numeral to integer.

    The lookup is case-insensitive: value is lowercased before being searched in the word lists.

    :param value: Value to parse
    :type value: string
    :return: position of the word in the first word list containing it
    :rtype: int
    :raise ValueError: if value is not in any word list
    """
    lowered = value.lower()
    for word_list in [english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list]:
        try:
            return word_list.index(lowered)
        except ValueError:
            pass
    raise ValueError('Invalid word numeral: %s' % value)  # pragma: no cover


# Captures the first run of digits of a string, skipping any leading non-digit characters.
_clean_re = re.compile(r'[^\d]*(\d+)[^\d]*')


def parse_numeral(value, int_enabled=True, roman_enabled=True, word_enabled=True, clean=True):
    """
    Parse a numeric value into integer.

    Enabled parsers are tried in this order: integer, Roman numeral, word. The first one that succeeds
    provides the result.

    :param value: Value to parse. Can be an integer, roman numeral or word.
    :type value: string
    :param int_enabled: try to parse value as a decimal integer
    :type int_enabled: bool
    :param roman_enabled: try to parse value as a Roman numeral
    :type roman_enabled: bool
    :param word_enabled: try to parse value as an English or French number word
    :type word_enabled: bool
    :param clean: when true, the integer parser uses the first run of digits found in value, and the Roman and
        word parsers try each whitespace-separated token (uppercased for Roman numerals) before trying value as is
    :type clean: bool
    :return: Numeric value
    :rtype: int
    :raise ValueError: if value can't be parsed by any enabled parser
    """
    # pylint: disable=too-many-branches
    if int_enabled:
        try:
            if clean:
                match = _clean_re.match(value)
                if match:
                    return int(match.group(1))
            return int(value)
        except ValueError:
            pass
    if roman_enabled:
        try:
            if clean:
                for word in value.split():
                    try:
                        return __parse_roman(word.upper())
                    except ValueError:
                        pass
            # Fallback on the whole value, unchanged (so this attempt is case-sensitive).
            return __parse_roman(value)
        except ValueError:
            pass
    if word_enabled:
        try:
            if clean:
                for word in value.split():
                    try:
                        return __parse_word(word)
                    except ValueError:  # pragma: no cover
                        pass
            return __parse_word(value)  # pragma: no cover
        except ValueError:  # pragma: no cover
            pass
    raise ValueError('Invalid numeral: %s' % (value,))  # pragma: no cover