1 #ifndef NOTCURSES_EGCPOOL
2 #define NOTCURSES_EGCPOOL
3 
4 #include <wchar.h>
5 #include <errno.h>
6 #include <stdio.h>
7 #include <wctype.h>
8 #include <stddef.h>
9 #include <assert.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdbool.h>
13 #include <unigbrk.h>
14 #include <unictype.h>
15 #include "notcurses/notcurses.h"
16 #include "compat/compat.h"
17 #include "logging.h"
18 
19 #ifdef __cplusplus
20 extern "C" {
21 #endif
22 
23 // cells only provide storage for a single 7-bit character. if there's anything
24 // more than that, it's spilled into the egcpool, and the cell is given an
25 // offset. when a cell is released, the memory it owned is zeroed out, and
26 // recognizable as use for another cell.
27 
28 typedef struct egcpool {
29   char* pool;         // ringbuffer of attached extension storage
30   int poolsize;       // total number of bytes in pool
31   int poolused;       // bytes actively used, grow when this gets too large
32   int poolwrite;      // next place to *look for* a place to write
33 } egcpool;
34 
35 #define POOL_MINIMUM_ALLOC BUFSIZ
36 #define POOL_MAXIMUM_BYTES (1u << 24u) // max 16MiB
37 
38 static inline void
egcpool_init(egcpool * p)39 egcpool_init(egcpool* p){
40   memset(p, 0, sizeof(*p));
41 }
42 
43 static inline int
egcpool_grow(egcpool * pool,size_t len)44 egcpool_grow(egcpool* pool, size_t len){
45   size_t newsize = pool->poolsize * 2;
46   if(newsize < POOL_MINIMUM_ALLOC){
47     newsize = POOL_MINIMUM_ALLOC;
48   }
49   while(len > newsize - pool->poolsize){ // ensure we make enough space
50     newsize *= 2;
51   }
52   if(newsize > POOL_MAXIMUM_BYTES){
53     return -1;
54   }
55   // nasty cast here because c++ source might include this header :/
56   char* tmp = (char*)realloc(pool->pool, newsize);
57   if(tmp == NULL){
58     return -1;
59   }
60   pool->pool = tmp;
61   memset(pool->pool + pool->poolsize, 0, newsize - pool->poolsize);
62   pool->poolsize = newsize;
63   return 0;
64 }
65 
66 // get the expected length of the encoded codepoint from the first byte of a
67 // utf-8 character. if the byte is illegal as a first byte, 1 is returned.
68 // Table 3.1B, Legal UTF8 Byte Sequences, Corrigendum #1: UTF-8 Shortest Form.
69 // subsequent ("continuation") bytes must start with the bit pattern 10.
70 static inline size_t
utf8_codepoint_length(unsigned char c)71 utf8_codepoint_length(unsigned char c){
72   if(c <= 0x7f){        // 0x000000...0x00007f
73     return 1;
74   }else if(c <= 0xc1){  // illegal continuation byte
75     return 1;
76   }else if(c <= 0xdf){  // 0x000080...0x0007ff
77     return 2;
78   }else if(c <= 0xef){  // 0x000800...0x00ffff
79     return 3;
80   }else if(c <= 0xf4){  // c <= 0xf4, 0x100000...0x10ffff
81     return 4;
82   }else{                // illegal first byte
83     return 1;
84   }
85 }
86 
87 // Eat an EGC from the UTF-8 string input, counting bytes and columns. We use
88 // libunistring's uc_is_grapheme_break() to segment EGCs. Writes the number of
89 // columns to '*colcount'. Returns the number of bytes consumed, not including
90 // any NUL terminator. Neither the number of bytes nor columns is necessarily
91 // equal to the number of decoded code points. Such are the ways of Unicode.
92 // uc_is_grapheme_break() wants UTF-32, which is fine, because we need wchar_t
93 // to use wcwidth() anyway FIXME except this doesn't work with 16-bit wchar_t!
94 static inline int
utf8_egc_len(const char * gcluster,int * colcount)95 utf8_egc_len(const char* gcluster, int* colcount){
96   size_t ret = 0;
97   *colcount = 0;
98   int r;
99   mbstate_t mbt;
100   memset(&mbt, 0, sizeof(mbt));
101   wchar_t wc, prevw = 0;
102   bool injoin = false;
103   do{
104     r = mbrtowc(&wc, gcluster, MB_LEN_MAX, &mbt);
105     if(r < 0){
106       // FIXME probably ought escape this somehow
107       logerror("invalid UTF8: %s\n", gcluster);
108       return -1;
109     }
110     if(prevw && !injoin && uc_is_grapheme_break(prevw, wc)){
111       break; // starts a new EGC, exit and do not claim
112     }
113     int cols;
114     if(uc_is_property_variation_selector(wc)){ // ends EGC
115       ret += r;
116       break;
117     }else if(wc == L'\u200d' || injoin){ // ZWJ is iswcntrl, so check it first
118       injoin = true;
119       cols = 0;
120     }else{
121       cols = wcwidth(wc);
122       if(cols < 0){
123         injoin = false;
124         if(iswspace(wc)){ // newline or tab
125           *colcount = 1;
126           return ret + 1;
127         }
128         cols = 1;
129         if(iswcntrl(wc)){
130           logerror("prohibited or invalid unicode: 0x%08x\n", (unsigned)wc);
131           return -1;
132         }
133       }
134     }
135     if(*colcount == 0){
136       *colcount += cols;
137     }
138     ret += r;
139     gcluster += r;
140     if(!prevw){
141       prevw = wc;
142     }
143   }while(r);
144   // FIXME what if injoin is set? incomplete EGC!
145   return ret;
146 }
147 
148 // if we're inserting a EGC of |len| bytes, ought we proactively realloc?
149 static inline bool
egcpool_alloc_justified(const egcpool * pool,int len)150 egcpool_alloc_justified(const egcpool* pool, int len){
151   const int poolfree = pool->poolsize - pool->poolused;
152   // proactively get more space if we have less than 10% free. this doesn't
153   // guarantee that we'll have enough space to insert the string -- we could
154   // theoretically have every 10th byte free, and be unable to write even a
155   // two-byte egc -- so we might have to allocate after an expensive search :/.
156   if(poolfree >= len && poolfree * 10 > pool->poolsize){
157     return false;
158   }
159   return true;
160 }
161 
162 // stash away the provided UTF8, NUL-terminated grapheme cluster. the cluster
163 // should not be less than 2 bytes (such a cluster should be directly stored in
164 // the cell). returns -1 on error, and otherwise a non-negative offset. 'ulen'
165 // must be the number of bytes to lift from egc (utf8_egc_len()).
166 __attribute__ ((nonnull (1, 2))) static inline int
egcpool_stash(egcpool * pool,const char * egc,size_t ulen)167 egcpool_stash(egcpool* pool, const char* egc, size_t ulen){
168   int len = ulen + 1; // count the NUL terminator
169   if(len <= 2){ // should never be empty, nor a single byte + NUL
170     return -1;
171   }
172   // the first time through, we don't force a grow unless we expect ourselves
173   // to have too little space. once we've done a search, we do force the grow.
174   // we should thus never have more than two iterations of this loop.
175   bool searched = false;
176   // we might have to realloc our underlying pool. it is possible that this EGC
177   // is actually *in* that pool, in which case our pointer will be invalidated.
178   // to be safe, duplicate prior to a realloc, and free along all paths.
179   char* duplicated = NULL;
180   do{
181     if(egcpool_alloc_justified(pool, len) || searched){
182       if(!duplicated){
183         // cast (and avoidance of strndup) to facilitate c++ inclusions
184         if((duplicated = (char *)malloc(ulen + 1)) == NULL){
185           return -1;
186         }
187         memcpy(duplicated, egc, ulen);
188         duplicated[ulen] = '\0';
189       }
190       if(egcpool_grow(pool, len) && searched){
191         free(duplicated);
192         return -1;
193       }
194       egc = duplicated;
195     }
196     // we now look for a place to lay out this egc. we need |len| zeroes in a
197     // row. starting at pool->poolwrite, look for such a range of unused
198     // memory. if we find it, write it out, and update used count. if we come
199     // back to where we started, force a growth and try again.
200     int curpos = pool->poolwrite;
201 //fprintf(stderr, "Stashing [%s] %d starting at %d\n", egc, len, curpos);
202     do{
203       if(curpos == pool->poolsize){
204         curpos = 0;
205       }
206       if(pool->pool[curpos]){ // can't write if there's stuff here
207         ++curpos;
208       }else if(curpos && pool->pool[curpos - 1]){ // don't kill someone's NUL
209         ++curpos;
210       }else if(pool->poolsize - curpos < len){ // can't wrap around
211         if(pool->poolwrite > curpos){
212           break;
213         }
214         curpos = 0; // can this skip pool->poolwrite?
215       }else{ // promising! let's see if there's enough space
216         int need = len;
217         size_t trial = curpos;
218         while(--need){
219           if(pool->pool[++trial]){ // alas, not enough space here
220             break;
221           }
222         }
223         if(need == 0){ // found a suitable space, copy it!
224           memcpy(pool->pool + curpos, egc, len - 1);
225           pool->pool[curpos + len - 1] = '\0';
226           pool->poolwrite = curpos + len;
227           pool->poolused += len;
228           free(duplicated);
229 //fprintf(stderr, "Stashing AT %d\n", curpos);
230           return curpos;
231         }
232         if(pool->poolwrite > curpos && pool->poolwrite - (len - need) < curpos){
233           break;
234         }
235         curpos += len - need;
236       }
237     }while(curpos != pool->poolwrite);
238   }while( (searched = !searched) );
239   free(duplicated);
240   assert(false);
241   return -1; // should never get here
242 }
243 
244 // Run a consistency check on the offset; ensure it's a valid, non-empty EGC.
245 static inline bool
egcpool_check_validity(const egcpool * pool,int offset)246 egcpool_check_validity(const egcpool* pool, int offset){
247   if(offset >= pool->poolsize){
248     fprintf(stderr, "offset 0x%06x greater than size (%d)\n", offset, pool->poolsize);
249     return false;
250   }
251   const char* egc = pool->pool + offset;
252   if(*egc == '\0'){
253     fprintf(stderr, "bad offset 0x%06x: empty\n", offset);
254     return false;
255   }
256   mbstate_t mbstate;
257   memset(&mbstate, 0, sizeof(mbstate));
258   do{
259     wchar_t wcs;
260     int r = mbrtowc(&wcs, egc, strlen(egc), &mbstate);
261     if(r < 0){
262       fprintf(stderr, "invalid utf8 at offset 0x%06x [%s]\n", offset, strerror(errno));
263       return false;
264     }
265     egc += r;
266   }while(*egc);
267   return true;
268 }
269 
270 // remove the egc from the pool. start at offset, and zero out everything until
271 // we find a zero (our own NUL terminator). remove that number of bytes from
272 // the usedcount.
273 static inline void
egcpool_release(egcpool * pool,int offset)274 egcpool_release(egcpool* pool, int offset){
275   size_t freed = 1; // account for free(d) NUL terminator
276   while(pool->pool[offset]){
277     pool->pool[offset] = '\0';
278     ++freed;
279     ++offset;
280     assert(offset < pool->poolsize);
281   }
282   pool->poolused -= freed;
283   // FIXME ought we update pool->poolwrite?
284 }
285 
286 static inline void
egcpool_dump(egcpool * pool)287 egcpool_dump(egcpool* pool){
288   free(pool->pool);
289   pool->pool = NULL;
290   pool->poolsize = 0;
291   pool->poolwrite = 0;
292   pool->poolused = 0;
293 }
294 
295 // get the offset into the egcpool for this cell's EGC. returns meaningless and
296 // unsafe results if called on a simple cell.
297 static inline uint32_t
cell_egc_idx(const nccell * c)298 cell_egc_idx(const nccell* c){
299   return (htole(c->gcluster) & 0x00fffffflu);
300 }
301 
302 // Is the cell a spilled (more than 4 byte) UTF8 EGC?
303 static inline bool
cell_extended_p(const nccell * c)304 cell_extended_p(const nccell* c){
305   return (htole(c->gcluster) & 0xff000000ul) == 0x01000000ul;
306 }
307 
308 // Is the cell simple (a UTF8-encoded EGC of four bytes or fewer)?
309 static inline bool
cell_simple_p(const nccell * c)310 cell_simple_p(const nccell* c){
311   return !cell_extended_p(c);
312 }
313 
314 // only applies to complex cells, do not use on simple cells
315 __attribute__ ((__returns_nonnull__)) static inline const char*
egcpool_extended_gcluster(const egcpool * pool,const nccell * c)316 egcpool_extended_gcluster(const egcpool* pool, const nccell* c) {
317   assert(cell_extended_p(c));
318   uint32_t idx = cell_egc_idx(c);
319   return pool->pool + idx;
320 }
321 
322 // Duplicate the contents of EGCpool 'src' onto another, wiping out any prior
323 // contents in 'dst'.
324 static inline int
egcpool_dup(egcpool * dst,const egcpool * src)325 egcpool_dup(egcpool* dst, const egcpool* src){
326   if(src->pool){
327     char* tmp;
328     if((tmp = (char*)realloc(dst->pool, src->poolsize)) == NULL){
329       return -1;
330     }
331     dst->pool = tmp;
332     memcpy(dst->pool, src->pool, src->poolsize);
333   }
334   dst->poolsize = src->poolsize;
335   dst->poolused = src->poolused;
336   dst->poolwrite = src->poolwrite;
337   return 0;
338 }
339 
340 #ifdef __cplusplus
341 }
342 #endif
343 
344 #endif
345