1from geodata.enum import Enum, EnumValue
2
3
class token_types(Enum):
    """Token type codes, grouped into numeric ranges by category.

    Alongside the individual ``EnumValue`` members, the class defines four
    plain sets collecting the members of each category:
    ``WORD_TOKEN_TYPES``, ``NUMERIC_TOKEN_TYPES``,
    ``PUNCTUATION_TOKEN_TYPES`` and ``NON_ALPHANUMERIC_TOKEN_TYPES``.
    """

    # ---- Words: codes 1-5 ----
    WORD = EnumValue(1)
    ABBREVIATION = EnumValue(2)
    IDEOGRAPHIC_CHAR = EnumValue(3)
    HANGUL_SYLLABLE = EnumValue(4)
    ACRONYM = EnumValue(5)

    # ---- Special tokens: codes 20-23 ----
    EMAIL = EnumValue(20)
    URL = EnumValue(21)
    US_PHONE = EnumValue(22)
    INTL_PHONE = EnumValue(23)

    # ---- Numbers and numeric types: codes 50-53 ----
    NUMERIC = EnumValue(50)
    ORDINAL = EnumValue(51)
    ROMAN_NUMERAL = EnumValue(52)
    IDEOGRAPHIC_NUMBER = EnumValue(53)

    # ---- Punctuation (may separate a phrase): codes 100-127 ----
    # NOTE: 116-118 and 123 are not assigned in this class; the gaps are
    # preserved as-is.
    PERIOD = EnumValue(100)
    EXCLAMATION = EnumValue(101)
    QUESTION_MARK = EnumValue(102)
    COMMA = EnumValue(103)
    COLON = EnumValue(104)
    SEMICOLON = EnumValue(105)
    PLUS = EnumValue(106)
    AMPERSAND = EnumValue(107)
    AT_SIGN = EnumValue(108)
    POUND = EnumValue(109)
    ELLIPSIS = EnumValue(110)
    DASH = EnumValue(111)
    BREAKING_DASH = EnumValue(112)
    HYPHEN = EnumValue(113)
    PUNCT_OPEN = EnumValue(114)
    PUNCT_CLOSE = EnumValue(115)
    DOUBLE_QUOTE = EnumValue(119)
    SINGLE_QUOTE = EnumValue(120)
    OPEN_QUOTE = EnumValue(121)
    CLOSE_QUOTE = EnumValue(122)
    SLASH = EnumValue(124)
    BACKSLASH = EnumValue(125)
    GREATER_THAN = EnumValue(126)
    LESS_THAN = EnumValue(127)

    # ---- Non-letters and whitespace ----
    OTHER = EnumValue(200)
    WHITESPACE = EnumValue(300)
    NEWLINE = EnumValue(301)

    # ---- Phrase ----
    # Special application-level type; not returned by the tokenizer.
    PHRASE = EnumValue(999)

    # Category sets, built from the members defined above.
    WORD_TOKEN_TYPES = {
        WORD, ABBREVIATION, IDEOGRAPHIC_CHAR, HANGUL_SYLLABLE, ACRONYM,
    }

    NUMERIC_TOKEN_TYPES = {
        NUMERIC, ORDINAL, ROMAN_NUMERAL, IDEOGRAPHIC_NUMBER,
    }

    PUNCTUATION_TOKEN_TYPES = {
        PERIOD, EXCLAMATION, QUESTION_MARK,
        COMMA, COLON, SEMICOLON,
        PLUS, AMPERSAND, AT_SIGN, POUND,
        ELLIPSIS, DASH, BREAKING_DASH, HYPHEN,
        PUNCT_OPEN, PUNCT_CLOSE,
        DOUBLE_QUOTE, SINGLE_QUOTE, OPEN_QUOTE, CLOSE_QUOTE,
        SLASH, BACKSLASH,
        GREATER_THAN, LESS_THAN,
    }

    # All punctuation plus OTHER and the whitespace types; union() returns a
    # new set, so PUNCTUATION_TOKEN_TYPES itself is left unmodified.
    NON_ALPHANUMERIC_TOKEN_TYPES = PUNCTUATION_TOKEN_TYPES.union(
        (OTHER, WHITESPACE, NEWLINE)
    )
105