from geodata.enum import Enum, EnumValue


class token_types(Enum):
    """Token type codes, plus sets that group related codes together.

    Each member is an ``EnumValue`` wrapping a fixed integer code. The
    codes are laid out in numeric bands by category (words, special
    tokens, numbers, punctuation, other/whitespace, phrase).

    Four group sets are defined at the bottom of the class:

    * ``WORD_TOKEN_TYPES`` -- the word-like types.
    * ``NUMERIC_TOKEN_TYPES`` -- the number-like types.
    * ``PUNCTUATION_TOKEN_TYPES`` -- every punctuation type.
    * ``NON_ALPHANUMERIC_TOKEN_TYPES`` -- the punctuation types together
      with ``OTHER``, ``WHITESPACE`` and ``NEWLINE``.

    ``EMAIL``, ``URL``, ``US_PHONE``, ``INTL_PHONE`` and ``PHRASE`` are
    not members of any of these sets.
    """

    # ---- Word types (1-5) ----
    WORD = EnumValue(1)
    ABBREVIATION = EnumValue(2)
    IDEOGRAPHIC_CHAR = EnumValue(3)
    HANGUL_SYLLABLE = EnumValue(4)
    ACRONYM = EnumValue(5)

    # ---- Special tokens (20-23) ----
    EMAIL = EnumValue(20)
    URL = EnumValue(21)
    US_PHONE = EnumValue(22)
    INTL_PHONE = EnumValue(23)

    # ---- Numbers and numeric types (50-53) ----
    NUMERIC = EnumValue(50)
    ORDINAL = EnumValue(51)
    ROMAN_NUMERAL = EnumValue(52)
    IDEOGRAPHIC_NUMBER = EnumValue(53)

    # ---- Punctuation types; these may separate a phrase (100-127) ----
    # NOTE(review): codes 116-118 and 123 are unassigned. Presumably the
    # gaps keep these values aligned with a numbering defined elsewhere;
    # confirm before renumbering or filling them in.
    PERIOD = EnumValue(100)
    EXCLAMATION = EnumValue(101)
    QUESTION_MARK = EnumValue(102)
    COMMA = EnumValue(103)
    COLON = EnumValue(104)
    SEMICOLON = EnumValue(105)
    PLUS = EnumValue(106)
    AMPERSAND = EnumValue(107)
    AT_SIGN = EnumValue(108)
    POUND = EnumValue(109)
    ELLIPSIS = EnumValue(110)
    DASH = EnumValue(111)
    BREAKING_DASH = EnumValue(112)
    HYPHEN = EnumValue(113)
    PUNCT_OPEN = EnumValue(114)
    PUNCT_CLOSE = EnumValue(115)
    DOUBLE_QUOTE = EnumValue(119)
    SINGLE_QUOTE = EnumValue(120)
    OPEN_QUOTE = EnumValue(121)
    CLOSE_QUOTE = EnumValue(122)
    SLASH = EnumValue(124)
    BACKSLASH = EnumValue(125)
    GREATER_THAN = EnumValue(126)
    LESS_THAN = EnumValue(127)

    # ---- Non-letters and whitespace ----
    OTHER = EnumValue(200)
    WHITESPACE = EnumValue(300)
    NEWLINE = EnumValue(301)

    # ---- Phrase: application-level type, not returned by the tokenizer ----
    PHRASE = EnumValue(999)

    # Group sets. These are built inside the class body, so they refer to
    # the members above by their bare names.
    WORD_TOKEN_TYPES = {
        WORD,
        ABBREVIATION,
        IDEOGRAPHIC_CHAR,
        HANGUL_SYLLABLE,
        ACRONYM,
    }

    NUMERIC_TOKEN_TYPES = {
        NUMERIC,
        ORDINAL,
        ROMAN_NUMERAL,
        IDEOGRAPHIC_NUMBER,
    }

    PUNCTUATION_TOKEN_TYPES = {
        PERIOD,
        EXCLAMATION,
        QUESTION_MARK,
        COMMA,
        COLON,
        SEMICOLON,
        PLUS,
        AMPERSAND,
        AT_SIGN,
        POUND,
        ELLIPSIS,
        DASH,
        BREAKING_DASH,
        HYPHEN,
        PUNCT_OPEN,
        PUNCT_CLOSE,
        DOUBLE_QUOTE,
        SINGLE_QUOTE,
        OPEN_QUOTE,
        CLOSE_QUOTE,
        SLASH,
        BACKSLASH,
        GREATER_THAN,
        LESS_THAN,
    }

    # Punctuation plus the remaining non-letter types; the union yields a
    # new set and leaves PUNCTUATION_TOKEN_TYPES unchanged.
    NON_ALPHANUMERIC_TOKEN_TYPES = PUNCTUATION_TOKEN_TYPES | {
        OTHER,
        WHITESPACE,
        NEWLINE,
    }