1 #ifndef NOTCURSES_EGCPOOL
2 #define NOTCURSES_EGCPOOL
3
4 #include <wchar.h>
5 #include <errno.h>
6 #include <stdio.h>
7 #include <wctype.h>
8 #include <stddef.h>
9 #include <assert.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdbool.h>
13 #include <unigbrk.h>
14 #include <unictype.h>
15 #include "notcurses/notcurses.h"
16 #include "compat/compat.h"
17 #include "logging.h"
18
19 #ifdef __cplusplus
20 extern "C" {
21 #endif
22
23 // cells only provide storage for a single 7-bit character. if there's anything
24 // more than that, it's spilled into the egcpool, and the cell is given an
25 // offset. when a cell is released, the memory it owned is zeroed out, and
26 // recognizable as use for another cell.
27
28 typedef struct egcpool {
29 char* pool; // ringbuffer of attached extension storage
30 int poolsize; // total number of bytes in pool
31 int poolused; // bytes actively used, grow when this gets too large
32 int poolwrite; // next place to *look for* a place to write
33 } egcpool;
34
35 #define POOL_MINIMUM_ALLOC BUFSIZ
36 #define POOL_MAXIMUM_BYTES (1u << 24u) // max 16MiB
37
38 static inline void
egcpool_init(egcpool * p)39 egcpool_init(egcpool* p){
40 memset(p, 0, sizeof(*p));
41 }
42
43 static inline int
egcpool_grow(egcpool * pool,size_t len)44 egcpool_grow(egcpool* pool, size_t len){
45 size_t newsize = pool->poolsize * 2;
46 if(newsize < POOL_MINIMUM_ALLOC){
47 newsize = POOL_MINIMUM_ALLOC;
48 }
49 while(len > newsize - pool->poolsize){ // ensure we make enough space
50 newsize *= 2;
51 }
52 if(newsize > POOL_MAXIMUM_BYTES){
53 return -1;
54 }
55 // nasty cast here because c++ source might include this header :/
56 char* tmp = (char*)realloc(pool->pool, newsize);
57 if(tmp == NULL){
58 return -1;
59 }
60 pool->pool = tmp;
61 memset(pool->pool + pool->poolsize, 0, newsize - pool->poolsize);
62 pool->poolsize = newsize;
63 return 0;
64 }
65
66 // get the expected length of the encoded codepoint from the first byte of a
67 // utf-8 character. if the byte is illegal as a first byte, 1 is returned.
68 // Table 3.1B, Legal UTF8 Byte Sequences, Corrigendum #1: UTF-8 Shortest Form.
69 // subsequent ("continuation") bytes must start with the bit pattern 10.
70 static inline size_t
utf8_codepoint_length(unsigned char c)71 utf8_codepoint_length(unsigned char c){
72 if(c <= 0x7f){ // 0x000000...0x00007f
73 return 1;
74 }else if(c <= 0xc1){ // illegal continuation byte
75 return 1;
76 }else if(c <= 0xdf){ // 0x000080...0x0007ff
77 return 2;
78 }else if(c <= 0xef){ // 0x000800...0x00ffff
79 return 3;
80 }else if(c <= 0xf4){ // c <= 0xf4, 0x100000...0x10ffff
81 return 4;
82 }else{ // illegal first byte
83 return 1;
84 }
85 }
86
87 // Eat an EGC from the UTF-8 string input, counting bytes and columns. We use
88 // libunistring's uc_is_grapheme_break() to segment EGCs. Writes the number of
89 // columns to '*colcount'. Returns the number of bytes consumed, not including
90 // any NUL terminator. Neither the number of bytes nor columns is necessarily
91 // equal to the number of decoded code points. Such are the ways of Unicode.
92 // uc_is_grapheme_break() wants UTF-32, which is fine, because we need wchar_t
93 // to use wcwidth() anyway FIXME except this doesn't work with 16-bit wchar_t!
94 static inline int
utf8_egc_len(const char * gcluster,int * colcount)95 utf8_egc_len(const char* gcluster, int* colcount){
96 size_t ret = 0;
97 *colcount = 0;
98 int r;
99 mbstate_t mbt;
100 memset(&mbt, 0, sizeof(mbt));
101 wchar_t wc, prevw = 0;
102 bool injoin = false;
103 do{
104 r = mbrtowc(&wc, gcluster, MB_LEN_MAX, &mbt);
105 if(r < 0){
106 // FIXME probably ought escape this somehow
107 logerror("invalid UTF8: %s\n", gcluster);
108 return -1;
109 }
110 if(prevw && !injoin && uc_is_grapheme_break(prevw, wc)){
111 break; // starts a new EGC, exit and do not claim
112 }
113 int cols;
114 if(uc_is_property_variation_selector(wc)){ // ends EGC
115 ret += r;
116 break;
117 }else if(wc == L'\u200d' || injoin){ // ZWJ is iswcntrl, so check it first
118 injoin = true;
119 cols = 0;
120 }else{
121 cols = wcwidth(wc);
122 if(cols < 0){
123 injoin = false;
124 if(iswspace(wc)){ // newline or tab
125 *colcount = 1;
126 return ret + 1;
127 }
128 cols = 1;
129 if(iswcntrl(wc)){
130 logerror("prohibited or invalid unicode: 0x%08x\n", (unsigned)wc);
131 return -1;
132 }
133 }
134 }
135 if(*colcount == 0){
136 *colcount += cols;
137 }
138 ret += r;
139 gcluster += r;
140 if(!prevw){
141 prevw = wc;
142 }
143 }while(r);
144 // FIXME what if injoin is set? incomplete EGC!
145 return ret;
146 }
147
148 // if we're inserting a EGC of |len| bytes, ought we proactively realloc?
149 static inline bool
egcpool_alloc_justified(const egcpool * pool,int len)150 egcpool_alloc_justified(const egcpool* pool, int len){
151 const int poolfree = pool->poolsize - pool->poolused;
152 // proactively get more space if we have less than 10% free. this doesn't
153 // guarantee that we'll have enough space to insert the string -- we could
154 // theoretically have every 10th byte free, and be unable to write even a
155 // two-byte egc -- so we might have to allocate after an expensive search :/.
156 if(poolfree >= len && poolfree * 10 > pool->poolsize){
157 return false;
158 }
159 return true;
160 }
161
162 // stash away the provided UTF8, NUL-terminated grapheme cluster. the cluster
163 // should not be less than 2 bytes (such a cluster should be directly stored in
164 // the cell). returns -1 on error, and otherwise a non-negative offset. 'ulen'
165 // must be the number of bytes to lift from egc (utf8_egc_len()).
166 __attribute__ ((nonnull (1, 2))) static inline int
egcpool_stash(egcpool * pool,const char * egc,size_t ulen)167 egcpool_stash(egcpool* pool, const char* egc, size_t ulen){
168 int len = ulen + 1; // count the NUL terminator
169 if(len <= 2){ // should never be empty, nor a single byte + NUL
170 return -1;
171 }
172 // the first time through, we don't force a grow unless we expect ourselves
173 // to have too little space. once we've done a search, we do force the grow.
174 // we should thus never have more than two iterations of this loop.
175 bool searched = false;
176 // we might have to realloc our underlying pool. it is possible that this EGC
177 // is actually *in* that pool, in which case our pointer will be invalidated.
178 // to be safe, duplicate prior to a realloc, and free along all paths.
179 char* duplicated = NULL;
180 do{
181 if(egcpool_alloc_justified(pool, len) || searched){
182 if(!duplicated){
183 // cast (and avoidance of strndup) to facilitate c++ inclusions
184 if((duplicated = (char *)malloc(ulen + 1)) == NULL){
185 return -1;
186 }
187 memcpy(duplicated, egc, ulen);
188 duplicated[ulen] = '\0';
189 }
190 if(egcpool_grow(pool, len) && searched){
191 free(duplicated);
192 return -1;
193 }
194 egc = duplicated;
195 }
196 // we now look for a place to lay out this egc. we need |len| zeroes in a
197 // row. starting at pool->poolwrite, look for such a range of unused
198 // memory. if we find it, write it out, and update used count. if we come
199 // back to where we started, force a growth and try again.
200 int curpos = pool->poolwrite;
201 //fprintf(stderr, "Stashing [%s] %d starting at %d\n", egc, len, curpos);
202 do{
203 if(curpos == pool->poolsize){
204 curpos = 0;
205 }
206 if(pool->pool[curpos]){ // can't write if there's stuff here
207 ++curpos;
208 }else if(curpos && pool->pool[curpos - 1]){ // don't kill someone's NUL
209 ++curpos;
210 }else if(pool->poolsize - curpos < len){ // can't wrap around
211 if(pool->poolwrite > curpos){
212 break;
213 }
214 curpos = 0; // can this skip pool->poolwrite?
215 }else{ // promising! let's see if there's enough space
216 int need = len;
217 size_t trial = curpos;
218 while(--need){
219 if(pool->pool[++trial]){ // alas, not enough space here
220 break;
221 }
222 }
223 if(need == 0){ // found a suitable space, copy it!
224 memcpy(pool->pool + curpos, egc, len - 1);
225 pool->pool[curpos + len - 1] = '\0';
226 pool->poolwrite = curpos + len;
227 pool->poolused += len;
228 free(duplicated);
229 //fprintf(stderr, "Stashing AT %d\n", curpos);
230 return curpos;
231 }
232 if(pool->poolwrite > curpos && pool->poolwrite - (len - need) < curpos){
233 break;
234 }
235 curpos += len - need;
236 }
237 }while(curpos != pool->poolwrite);
238 }while( (searched = !searched) );
239 free(duplicated);
240 assert(false);
241 return -1; // should never get here
242 }
243
244 // Run a consistency check on the offset; ensure it's a valid, non-empty EGC.
245 static inline bool
egcpool_check_validity(const egcpool * pool,int offset)246 egcpool_check_validity(const egcpool* pool, int offset){
247 if(offset >= pool->poolsize){
248 fprintf(stderr, "offset 0x%06x greater than size (%d)\n", offset, pool->poolsize);
249 return false;
250 }
251 const char* egc = pool->pool + offset;
252 if(*egc == '\0'){
253 fprintf(stderr, "bad offset 0x%06x: empty\n", offset);
254 return false;
255 }
256 mbstate_t mbstate;
257 memset(&mbstate, 0, sizeof(mbstate));
258 do{
259 wchar_t wcs;
260 int r = mbrtowc(&wcs, egc, strlen(egc), &mbstate);
261 if(r < 0){
262 fprintf(stderr, "invalid utf8 at offset 0x%06x [%s]\n", offset, strerror(errno));
263 return false;
264 }
265 egc += r;
266 }while(*egc);
267 return true;
268 }
269
270 // remove the egc from the pool. start at offset, and zero out everything until
271 // we find a zero (our own NUL terminator). remove that number of bytes from
272 // the usedcount.
273 static inline void
egcpool_release(egcpool * pool,int offset)274 egcpool_release(egcpool* pool, int offset){
275 size_t freed = 1; // account for free(d) NUL terminator
276 while(pool->pool[offset]){
277 pool->pool[offset] = '\0';
278 ++freed;
279 ++offset;
280 assert(offset < pool->poolsize);
281 }
282 pool->poolused -= freed;
283 // FIXME ought we update pool->poolwrite?
284 }
285
286 static inline void
egcpool_dump(egcpool * pool)287 egcpool_dump(egcpool* pool){
288 free(pool->pool);
289 pool->pool = NULL;
290 pool->poolsize = 0;
291 pool->poolwrite = 0;
292 pool->poolused = 0;
293 }
294
295 // get the offset into the egcpool for this cell's EGC. returns meaningless and
296 // unsafe results if called on a simple cell.
297 static inline uint32_t
cell_egc_idx(const nccell * c)298 cell_egc_idx(const nccell* c){
299 return (htole(c->gcluster) & 0x00fffffflu);
300 }
301
302 // Is the cell a spilled (more than 4 byte) UTF8 EGC?
303 static inline bool
cell_extended_p(const nccell * c)304 cell_extended_p(const nccell* c){
305 return (htole(c->gcluster) & 0xff000000ul) == 0x01000000ul;
306 }
307
308 // Is the cell simple (a UTF8-encoded EGC of four bytes or fewer)?
309 static inline bool
cell_simple_p(const nccell * c)310 cell_simple_p(const nccell* c){
311 return !cell_extended_p(c);
312 }
313
314 // only applies to complex cells, do not use on simple cells
315 __attribute__ ((__returns_nonnull__)) static inline const char*
egcpool_extended_gcluster(const egcpool * pool,const nccell * c)316 egcpool_extended_gcluster(const egcpool* pool, const nccell* c) {
317 assert(cell_extended_p(c));
318 uint32_t idx = cell_egc_idx(c);
319 return pool->pool + idx;
320 }
321
322 // Duplicate the contents of EGCpool 'src' onto another, wiping out any prior
323 // contents in 'dst'.
324 static inline int
egcpool_dup(egcpool * dst,const egcpool * src)325 egcpool_dup(egcpool* dst, const egcpool* src){
326 if(src->pool){
327 char* tmp;
328 if((tmp = (char*)realloc(dst->pool, src->poolsize)) == NULL){
329 return -1;
330 }
331 dst->pool = tmp;
332 memcpy(dst->pool, src->pool, src->poolsize);
333 }
334 dst->poolsize = src->poolsize;
335 dst->poolused = src->poolused;
336 dst->poolwrite = src->poolwrite;
337 return 0;
338 }
339
340 #ifdef __cplusplus
341 }
342 #endif
343
344 #endif
345