1 /*
2  * Unicode utilities
3  *
4  * Copyright (c) 2017-2018 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #ifndef LIBUNICODE_H
25 #define LIBUNICODE_H
26 
27 #include <inttypes.h>
28 
/* LRE_BOOL is a plain 'int'; the alias only marks declarations in this
   header whose int result is meant as a truth value. It is removed
   again by the #undef at the end of the header, so it is not visible
   to code that includes this file. */
#define LRE_BOOL  int       /* for documentation purposes */

/* define it to include all the unicode tables (40KB larger) */
/* When defined, the declarations inside the #ifdef CONFIG_ALL_UNICODE
   section of this header are made available. */
#define CONFIG_ALL_UNICODE

/* NOTE(review): presumably the maximum number of code points that
   lre_case_conv() may store in its 'res' buffer - confirm against the
   implementation. */
#define LRE_CC_RES_LEN_MAX 3
35 
/* Selector for the normalization form; this is the type of the
   'n_type' parameter of unicode_normalize(). The enumerators take the
   default values 0..3 in the order listed. */
typedef enum {
    UNICODE_NFC,
    UNICODE_NFD,
    UNICODE_NFKC,
    UNICODE_NFKD,
} UnicodeNormalizationEnum;
42 
/* Case conversion of the code point 'c'; the result is written through
   'res'.
   NOTE(review): the meaning of 'conv_type', the meaning of the return
   value and the required capacity of 'res' (presumably
   LRE_CC_RES_LEN_MAX entries) are not visible in this header - confirm
   against the implementation. */
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
/* Predicates on a single code point; the result is an int used as a
   boolean (LRE_BOOL). The exact Unicode properties tested are defined
   by the implementation. */
LRE_BOOL lre_is_cased(uint32_t c);
LRE_BOOL lre_is_case_ignorable(uint32_t c);
46 
47 /* char ranges */
48 
/* Growable array of sorted 32 bit points describing a set of character
   ranges. cr_union_interval() encodes the inclusive interval [c1, c2]
   as the two points c1 and c2 + 1, i.e. points come in
   (start, end-exclusive) pairs. */
typedef struct {
    int len; /* in points, always even */
    /* the inline helpers request a larger buffer with cr_realloc()
       before storing once 'len' would exceed 'size'; presumably the
       allocated capacity of 'points', counted in points - confirm */
    int size;
    uint32_t *points; /* points sorted by increasing value */
    /* NOTE(review): presumably the user pointer handed to realloc_func
       as its 'opaque' argument - confirm in cr_init()/cr_realloc() */
    void *mem_opaque;
    /* allocator callback with a realloc-like signature plus a leading
       opaque pointer */
    void *(*realloc_func)(void *opaque, void *ptr, size_t size);
} CharRange;
56 
/* Set operation selectors (union, intersection, exclusive or).
   NOTE(review): presumably the values accepted by the 'op' parameter
   of cr_op(), which is declared as a plain int - confirm. */
typedef enum {
    CR_OP_UNION,
    CR_OP_INTER,
    CR_OP_XOR,
} CharRangeOpEnum;
62 
/* Set up 'cr' with the given opaque pointer and allocator callback
   (same types as the mem_opaque / realloc_func fields of CharRange).
   NOTE(review): the initial values given to len/size/points are not
   visible in this header. */
void cr_init(CharRange *cr, void *mem_opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size));
/* Release 'cr'. NOTE(review): presumably frees cr->points through
   realloc_func - confirm. */
void cr_free(CharRange *cr);
/* Ask for room for 'size' points in 'cr'. The inline helpers in this
   header treat any non-zero return value as a failure and a zero
   return value as success. */
int cr_realloc(CharRange *cr, int size);
/* Copy 'cr1' into 'cr'. NOTE(review): return convention presumably
   0 on success, non-zero on allocation failure - confirm. */
int cr_copy(CharRange *cr, const CharRange *cr1);
67 
/* Append the point 'v' after the last point of 'cr'.
   When the point count has reached cr->size, cr_realloc() is first
   asked for room for one more point.
   Return 0 on success, -1 if cr_realloc() reports a failure (nothing is
   stored in that case). The caller is responsible for keeping the
   points sorted and the count even once the range is complete. */
static inline int cr_add_point(CharRange *cr, uint32_t v)
{
    const int is_full = (cr->len >= cr->size);

    if (is_full && cr_realloc(cr, cr->len + 1) != 0)
        return -1;

    /* store first, then account for the new point */
    cr->points[cr->len] = v;
    cr->len++;
    return 0;
}
77 
/* Append the two points 'c1' and 'c2', in that order, after the last
   point of 'cr'. The values are stored exactly as given (no + 1
   adjustment and no merging with existing points).
   cr_realloc() is asked for room for two more points when they would
   not fit in cr->size.
   Return 0 on success, -1 if cr_realloc() reports a failure (nothing is
   stored in that case). */
static inline int cr_add_interval(CharRange *cr, uint32_t c1, uint32_t c2)
{
    if (cr->size < cr->len + 2 && cr_realloc(cr, cr->len + 2) != 0)
        return -1;

    cr->points[cr->len] = c1;
    cr->points[cr->len + 1] = c2;
    cr->len += 2;
    return 0;
}
88 
/* Combine 'cr' with the 'b_len' points stored at 'b_pt'.
   cr_union_interval() calls it with the pair { c1, c2 + 1 } and a
   length of 2, and returns its result unchanged.
   NOTE(review): presumably the union is stored back into 'cr' and the
   return value is 0 on success - confirm against the implementation. */
int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len);
90 
/* Merge the inclusive interval [c1, c2] into 'cr' by handing the point
   pair { c1, c2 + 1 } to cr_union1(), whose return value is passed
   through unchanged.
   Note: c2 + 1 is computed in uint32_t arithmetic, so c2 == UINT32_MAX
   wraps around to 0. */
static inline int cr_union_interval(CharRange *cr, uint32_t c1, uint32_t c2)
{
    const uint32_t bounds[2] = { c1, c2 + 1 };

    return cr_union1(cr, bounds, 2);
}
98 
/* Combine the point arrays 'a_pt' (a_len points) and 'b_pt' (b_len
   points) according to 'op'.
   NOTE(review): presumably 'op' is a CharRangeOpEnum value, the result
   is written to 'cr' and the return value is 0 on success - confirm
   against the implementation. */
int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
          const uint32_t *b_pt, int b_len, int op);

/* NOTE(review): presumably replaces 'cr' by its complement in place and
   returns 0 on success - confirm against the implementation. */
int cr_invert(CharRange *cr);
103 
104 #ifdef CONFIG_ALL_UNICODE
105 
/* Predicates on a single code point, only declared when
   CONFIG_ALL_UNICODE is defined. The result is an int used as a
   boolean (LRE_BOOL).
   NOTE(review): presumably they test the Unicode ID_Start and
   ID_Continue properties - confirm against the implementation. */
LRE_BOOL lre_is_id_start(uint32_t c);
LRE_BOOL lre_is_id_continue(uint32_t c);
108 
/* Normalize the 'src_len' code points at 'src' to the form selected by
   'n_type'. 'opaque' and 'realloc_func' have the same types as the
   allocator pair stored in CharRange.
   NOTE(review): presumably *pdst receives a buffer obtained from
   realloc_func that the caller must release, and the return value is
   the output length or a negative value on error - confirm against the
   implementation. */
int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
                      UnicodeNormalizationEnum n_type,
                      void *opaque, void *(*realloc_func)(void *opaque, void *ptr, size_t size));
112 
113 /* Unicode character range functions */
114 
/* Look up a script by name. 'is_ext' is an int used as a boolean
   (LRE_BOOL).
   NOTE(review): presumably the matching ranges are stored in 'cr',
   'is_ext' selects the Script_Extensions property instead of Script,
   and the return value is 0 on success - confirm against the
   implementation. */
int unicode_script(CharRange *cr,
                   const char *script_name, LRE_BOOL is_ext);
/* Same calling pattern for a general category name and for a binary
   property name. NOTE(review): accepted names and return convention
   are defined by the implementation, not by this header. */
int unicode_general_category(CharRange *cr, const char *gc_name);
int unicode_prop(CharRange *cr, const char *prop_name);
119 
120 #endif /* CONFIG_ALL_UNICODE */
121 
122 #undef LRE_BOOL
123 
124 #endif /* LIBUNICODE_H */
125