1 /*
2  * Copyright (C) 2021  Brodie Gaslam
3  *
4  * This file is part of "fansi - ANSI Control Sequence Aware String Functions"
5  *
6  * This program is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * Go to <https://www.r-project.org/Licenses/GPL-2> for a copy of the license.
17  */
18 
19 #include "fansi.h"
20 /*
21  * Used to set a global int_max value smaller than INT_MAX for testing
22  * purposes
23  *
24  * This does not affect FANSI_add_int as that we can test separately, and
25  * setting it there prevents us from testing some of the downstream overflow
26  * logic.
27  */
28 int FANSI_int_max = INT_MAX;
29 int FANSI_int_min = INT_MIN;  // no way to change this externally
30 
FANSI_set_int_max(SEXP x)31 SEXP FANSI_set_int_max(SEXP x) {
32   if(TYPEOF(x) != INTSXP || XLENGTH(x) != 1)
33     error("invalid int_max value");  // nocov
34   int x_int = asInteger(x);
35 
36   if(x_int < 1)
37     error("int_max value must be positive"); // nocov
38 
39   int old_int = FANSI_int_max;
40   FANSI_int_max = x_int;
41   return ScalarInteger(old_int);
42 }
43 // nocov start
44 // used only for debugging
FANSI_get_int_max()45 SEXP FANSI_get_int_max() {
46   return ScalarInteger(FANSI_int_max);
47 }
48 // nocov end
49 /*
50  * Add integers while checking for overflow
51  *
52  * Note we are stricter than necessary when y is negative because we want to
53  * count hitting INT_MIN as an overflow so that we can use the integer values
54  * in R where INT_MIN is NA.
55  */
56 
FANSI_add_int(int x,int y,const char * file,int line)57 int FANSI_add_int(int x, int y, const char * file, int line) {
58   if((y >= 0 && (x > INT_MAX - y)) || (y < 0 && (x <= INT_MIN - y)))
59     error(
60       "Integer overflow in file %s at line %d; %s", file, line,
61       "contact maintainer."
62     );
63   return x + y;
64 }
FANSI_add_int_ext(SEXP x,SEXP y)65 SEXP FANSI_add_int_ext(SEXP x, SEXP y) {
66   if(
67     TYPEOF(x) != INTSXP || XLENGTH(x) != 1 ||
68     TYPEOF(y) != INTSXP || XLENGTH(y) != 1
69   )
70     error("Internal error: arguments must be scalar integers"); // nocov
71 
72   return ScalarInteger(FANSI_ADD_INT(asInteger(x), asInteger(y)));
73 }
74 /*
75  * Compute Location and Size of Next ANSI Sequences
76  *
77  * See FANSI_parse_esc as well, where there is similar logic, although we keep
78  * it separated here for speed since we don't try to interpret the string.
79  *
80  * Length includes the ESC and [, and start point is the ESC.
81  *
82  * Validity here means striclty that all the contained escape sequences were
83  * valid CSI sequences as per the strict definition.
84  *
85  * We report the length of invalid sequnces, but you really can't trust them.
86  * The true length may actually be different depending on your terminal,
87  * (e.g. OSX terminal spits out illegal characters to screen but keeps
88  * processing the sequence).
89  *
90  * @param ctl is a bit flag to line up against VALID.WHAT index values, so
91  *   (ctl & (1 << 0)) is newlines, (ctl & (1 << 1)) is C0, etc, though note
92  *   this does not act
93  */
94 
FANSI_find_esc(const char * x,int ctl)95 struct FANSI_csi_pos FANSI_find_esc(const char * x, int ctl) {
96   /***************************************************\
97   | IMPORTANT: KEEP THIS ALIGNED WITH FANSI_read_esc  |
98   | although now this also deals with c0              |
99   \***************************************************/
100   int valid = 1;
101   int found = 0;
102   int found_ctl = 0;
103   const char * x_track = x;
104   const char * x_found_start;
105   const char * x_found_end;
106 
107   struct FANSI_csi_pos res;
108 
109   while(*x_track) {
110     const char x_val = *(x_track++);
111     // use found & found_this in conjunction so that we can allow multiple
112     // adjacent elements to be found in one go
113 
114     int found_this = 0;
115 
116     // If not normal ASCII or UTF8, examine whether we need to found
117     if(!((x_val > 31 && x_val < 127) || x_val < 0 || x_val > 127)) {
118       if(!found) {
119         // Keep resetting strip start point until we find something we want to
120         // mark
121         x_found_start = x_found_end = x_track - 1;
122       }
123       found_this = 0;
124       if(x_val == 27) {
125         if(*x_track == '[') {
126           // This is a CSI sequence, so it has multiple characters that we
127           // need to skip.  The final character is processed outside of here
128           // since it has the same logic for CSI and non CSI sequences
129 
130           // skip [
131 
132           ++x_track;
133 
134           // Skip all the valid parameters tokens
135 
136           while(*x_track >= 0x30 && *x_track <= 0x3F) ++x_track;
137 
138           // And all the valid intermediates
139 
140           int intermediate = 0;
141           while(*x_track >= 0x20 && *x_track <= 0x2F) {
142             if(!intermediate) intermediate = 1;
143             ++x_track;
144           }
145           // Check validity
146 
147           int valid_tmp = *x_track >= 0x40 && *x_track <= 0x7E;
148 
149           // If not valid, consume all subsequent parameter tokens  as that
150           // seems to be terminal.osx and iterm behavior (though terminal.osx
151           // seems pretty picky about what it considers intermediate or even
152           // parameter characters).
153 
154           if(!valid_tmp)
155             while(*x_track >= 0x20 && *x_track <= 0x3F) ++x_track;
156 
157           valid = valid && valid_tmp;
158 
159           // CSI SGR only found if ends in m and no intermediate
160 
161           int sgr = !intermediate && *x_track == 'm';
162           found_ctl |= sgr ? FANSI_CTL_SGR & ctl : FANSI_CTL_CSI & ctl;
163           found_this =
164             (sgr && (ctl & FANSI_CTL_SGR)) ||  // SGR
165             (!sgr && (ctl & FANSI_CTL_CSI));      // CSI
166         } else {
167           // Includes both the C1 set and "controls strings"
168           found_this = ctl & FANSI_CTL_ESC;
169           found_ctl |= ctl & FANSI_CTL_ESC;
170           valid = valid && (*x_track >= 0x40 && *x_track <= 0x7E);
171         }
172         // Advance unless next char is ESC, in which case we want to keep
173         // looping
174 
175         if(*x_track && *x_track != 27) x_track++;
176       } else {
177         // x01-x1F, x7F, all the C0 codes
178 
179         found_ctl |= (x_val == '\n' ? ctl & FANSI_CTL_NL : ctl & FANSI_CTL_C0);
180         found_this =
181           (x_val == '\n' && (ctl & FANSI_CTL_NL)) ||
182           (x_val != '\n' && (ctl & FANSI_CTL_C0));
183       }
184       if(found_this) {
185         x_found_end = x_track;
186         if(!found) found = 1;
187       }
188     }
189     if(found && !found_this) break;
190   }
191   if(found) {
192     res = (struct FANSI_csi_pos){
193       .start=x_found_start, .len=(x_found_end - x_found_start),
194       .valid=valid, .ctl=found_ctl
195     };
196   } else {
197     res = (struct FANSI_csi_pos){
198       .start=x, .len=0, .valid=valid, ctl=found_ctl
199     };
200   }
201   return res;
202 }
203 /*
204  * Allocates a fresh chunk of memory if the existing one is not large enough.
205  *
206  * We never intend to re-use what's already in memory so we don't realloc.  If
207  * allocation is needed the buffer will be either twice as large as it was
208  * before, or size `size` if that is greater than twice the size.
209  */
FANSI_size_buff(struct FANSI_buff * buff,size_t size)210 void FANSI_size_buff(struct FANSI_buff * buff, size_t size) {
211   if(size > buff->len) {
212     // Special case for intial alloc
213 
214     if(!buff->len) {
215       if(size < 128 && FANSI_int_max > 128)
216         size = 128;  // in theory little penalty to ask this minimum
217       else if(size > (size_t) FANSI_int_max + 1) {
218         // nocov start
219         // assumptions check that  SIZE_T fits INT_MAX + 1
220         // too difficult to test, all the code pretty much checks for overflow
221         // before requesting memory
222         error(
223           "Internal Error: requested buff size %zu greater than INT_MAX + 1.",
224            size
225         );
226         // nocov end
227       }
228       else buff->len = size;
229     }
230     // More generic case
231 
232     if(size > buff->len) {
233       size_t tmp_double_size = 0;
234       if(buff->len > (size_t) FANSI_int_max + 1 - buff->len) {
235         tmp_double_size = (size_t) FANSI_int_max + 1;
236       } else {
237         tmp_double_size = buff->len + buff->len;
238       }
239       if(size > tmp_double_size) tmp_double_size = size;
240 
241       if(tmp_double_size > (size_t) FANSI_int_max + 1)
242         // nocov start
243         // this can't really happen unless size starts off bigger than
244         // INT_MAX + 1
245         error(
246           "%s  Requesting %zu",
247           "Internal Error: max allowed buffer size is INT_MAX + 1.",
248            tmp_double_size
249         );
250         // nocov end
251       buff->len = tmp_double_size;
252     }
253     buff->buff = R_alloc(buff->len, sizeof(char));
254   }
255 }
256 /*
257  * Compute how many digits are in a number
258  *
259  * Add an extra character for negative integers.
260  */
261 
FANSI_digits_in_int(int x)262 int FANSI_digits_in_int(int x) {
263   int num = 1;
264   if(x < 0) {
265     ++num;
266     x = -x;
267   }
268   while((x = (x / 10))) ++num;
269   return num;
270 }
FANSI_digits_in_int_ext(SEXP y)271 SEXP FANSI_digits_in_int_ext(SEXP y) {
272   if(TYPEOF(y) != INTSXP) error("Internal Error: required int.");
273 
274   R_xlen_t ylen = XLENGTH(y);
275   SEXP res = PROTECT(allocVector(INTSXP, ylen));
276 
277   for(R_xlen_t i = 0; i < ylen; ++i)
278     INTEGER(res)[i] = FANSI_digits_in_int(INTEGER(y)[i]);
279 
280   UNPROTECT(1);
281   return(res);
282 }
283 /*
284  * Compresses the ctl vector into a single integer by encoding each value of
285  * ctl as a bit.
286  */
287 
FANSI_ctl_as_int(SEXP ctl)288 int FANSI_ctl_as_int(SEXP ctl) {
289   int ctl_int = 0;
290   int flip_bits = 0;
291   for(R_xlen_t i = 0; i < XLENGTH(ctl); ++i) {
292     // -2 because ctl is 1 indexed (from R), and position 1 means "all", so we
293     // need to shift by 1 for the 0 index, and then by one more for the position
294     // occupied by "all" that really means flip bits
295     int ctl_val = INTEGER(ctl)[i] - 2;
296     if(ctl_val > 4)
297       error("Internal Error: max ctl value allowed is 4.");
298     if(ctl_val < 0) flip_bits = 1;
299     else ctl_int |= 1 << ctl_val;
300   }
301   if(flip_bits) ctl_int ^= FANSI_CTL_ALL;
302   return ctl_int;
303 }
FANSI_ctl_as_int_ext(SEXP ctl)304 SEXP FANSI_ctl_as_int_ext(SEXP ctl) {
305   return ScalarInteger(FANSI_ctl_as_int(ctl));
306 }
307 /*
308  * Partial match a single string byte by byte
309  *
310  * @param x a scalar STRSXP
311  * @param choices an array of strings to match against
312  * @param choice_count how many elements there are in array
313  * @param arg_name the name of the argument to use in an error message if
314  *   the match fails.
315  * @return the position in choices that partial matches x, on a 0-index basis
316  *   (ie. 0 == 1st, 1 == 2nd, etc.)
317  */
318 // nocov start
FANSI_pmatch(SEXP x,const char ** choices,int choice_count,const char * arg_name)319 int FANSI_pmatch(
320   SEXP x, const char ** choices, int choice_count, const char * arg_name
321 ) {
322   error("remove nocov if we start to use this");
323   if(TYPEOF(x) != STRSXP || XLENGTH(x) != 1)
324     error("Argument `%s` must be a length 1 character vector.", arg_name);
325 
326   SEXP x_chrsxp = STRING_ELT(x, 0);
327   const char * x_chr = CHAR(x_chrsxp);
328 
329   if(!LENGTH(x_chrsxp))
330     error("Argument `%s` may not be an empty string.", arg_name);
331 
332   int match_count = choice_count;
333   int last_match_index = -1;
334 
335   for(int i = 0; i < choice_count; ++i) {
336     if(!strncmp(x_chr, choices[i], LENGTH(x_chrsxp))) {
337       last_match_index = i;
338       --match_count;
339     }
340   }
341   if(match_count > 1) {
342     error(
343       "Argument `%s` matches more than one of the possible choices.",
344       arg_name
345     );
346   } else if(!match_count) {
347     error("Argument `%s` does not match any of the valid choices.", arg_name);
348   }
349   // success
350 
351   return last_match_index;
352 }
353 // nocov end
354 
355 // concept borrowed from utf8-lite
356 
FANSI_interrupt(int i)357 void FANSI_interrupt(int i) {if(!(i % 1000)) R_CheckUserInterrupt();}
358 /*
359  * Split an integer vector into two equal size pieces
360  */
361 
FANSI_cleave(SEXP x)362 SEXP FANSI_cleave(SEXP x) {
363   if(TYPEOF(x) != INTSXP || XLENGTH(x) % 2)
364     error("Internal error, need even length INTSXP.");  // nocov
365 
366   R_xlen_t len = XLENGTH(x) / 2;
367   if((size_t) len > SIZE_MAX)
368     error("Internal error: vector too long to cleave"); // nocov
369 
370   SEXP a, b;
371   a = PROTECT(allocVector(INTSXP, len));
372   b = PROTECT(allocVector(INTSXP, len));
373 
374   size_t size = 0;
375   for(int i = 0; i < (int) sizeof(int); ++i) {
376     if(size > SIZE_MAX - len)
377       error("Internal error: vector too long to cleave"); // nocov
378     size += len;
379   }
380   memcpy(INTEGER(a), INTEGER(x), size);
381   memcpy(INTEGER(b), INTEGER(x) + len, size);
382 
383   SEXP res = PROTECT(allocVector(VECSXP, 2));
384   SET_VECTOR_ELT(res, 0, a);
385   SET_VECTOR_ELT(res, 1, b);
386   UNPROTECT(3);
387   return res;
388 }
389 struct datum {int val; R_xlen_t idx;};
390 
cmpfun(const void * p,const void * q)391 static int cmpfun (const void * p, const void * q) {
392   struct datum a = *(struct datum *) p;
393   struct datum b = *(struct datum *) q;
394   return(a.val > b.val ? 1 : (a.val < b.val ? -1 : 0));
395 }
396 /*
397  * Equivalent to `order`, but less overhead.  May not be faster for longer
398  * vectors but since we call it potentially repeatedly via our initial version
399  * of strsplit, we want to do this to make somewhat less sub-optimal
400  */
FANSI_order(SEXP x)401 SEXP FANSI_order(SEXP x) {
402   if(TYPEOF(x) != INTSXP)
403     error("Internal error: this order only supports ints.");  // nocov
404 
405   R_xlen_t len = XLENGTH(x);
406   SEXP res;
407 
408   if(len) {
409     size_t size = 0;
410     for(int i = 0; i < (int) sizeof(struct datum); ++i) {
411       if(size > SIZE_MAX - len)
412         error("Internal error: vector too long to order"); // nocov
413       size += len;
414     }
415     struct datum * data = (struct datum *) R_alloc(len, sizeof(struct datum));
416 
417     for(R_xlen_t i = 0; i < len; ++i)
418       *(data + i) = (struct datum){.val=INTEGER(x)[i], .idx=i + 1};
419 
420     qsort(data, (size_t) len, sizeof(struct datum), cmpfun);
421 
422     res = PROTECT(allocVector(INTSXP, len));
423 
424     for(R_xlen_t i = 0; i < len; ++i) INTEGER(res)[i] = (data + i)->idx;
425   } else {
426     res = PROTECT(allocVector(INTSXP, 0));
427   }
428   UNPROTECT(1);
429   return res;
430 }
431 /*
432  * Equivalent to `sort`, but less overhead.  May not be faster for longer
433  * vectors but since we call it potentially repeatedly via our initial version
434  * of strsplit, we want to do this to make somewhat less sub-optimal
435  */
436 // nocov start
cmpfun2(const void * p,const void * q)437 static int cmpfun2 (const void * p, const void * q) {
438   int a = *(int *) p;
439   int b = *(int *) q;
440   return(a > b ? 1 : (a < b ? -1 : 0));
441 }
FANSI_sort_int(SEXP x)442 SEXP FANSI_sort_int(SEXP x) {
443   error("get rid of nocov if we start using");
444   if(TYPEOF(x) != INTSXP)
445     error("Internal error: this order only supports ints.");  // nocov
446 
447   R_xlen_t len = XLENGTH(x);
448 
449   SEXP res = PROTECT(duplicate(x));
450 
451   qsort(INTEGER(res), (size_t) len, sizeof(int), cmpfun2);
452 
453   UNPROTECT(1);
454   return res;
455 }
456 // nocov end
457 struct datum2 {SEXP val; R_xlen_t idx;};
458 
cmpfun3(const void * p,const void * q)459 static int cmpfun3 (const void * p, const void * q) {
460   struct datum2 a = *(struct datum2 *) p;
461   struct datum2 b = *(struct datum2 *) q;
462   const char * a_chr = CHAR(a.val);
463   const char * b_chr = CHAR(b.val);
464   return(a_chr > b_chr ? 1 : (a_chr < b_chr ? -1 : 0));
465 }
466 /*
467  * Sort chars so that equal values are contiguous
468  *
469  * Beware, the sort is not lexical, instead this is sorted by the memory addess
470  * of the character strings backing each CHARSXP.
471  *
472  * The only purpose of this is to support the unique_chr function.
473  */
474 
FANSI_sort_chr(SEXP x)475 SEXP FANSI_sort_chr(SEXP x) {
476   if(TYPEOF(x) != STRSXP)
477     error("Internal error: this sort only supports char vecs.");  // nocov
478 
479   R_xlen_t len = XLENGTH(x);
480   SEXP res = x;
481 
482   if(len > 2) {
483     // note we explictily check in assumptions that R_xlen_t is not bigger than
484     // size_t
485 
486     size_t size = 0;
487     for(int i = 0; i < (int) sizeof(struct datum); ++i) {
488       if(size > SIZE_MAX - len)
489         error("Internal error: vector too long to order"); // nocov
490       size += len;
491     }
492     struct datum2 * data = (struct datum2 *) R_alloc(len, sizeof(struct datum2));
493 
494     for(R_xlen_t i = 0; i < len; ++i)
495       *(data + i) = (struct datum2){.val=STRING_ELT(x, i), .idx=i};
496 
497     qsort(data, (size_t) len, sizeof(struct datum2), cmpfun3);
498 
499     res = PROTECT(allocVector(STRSXP, len));
500 
501     for(R_xlen_t i = 0; i < len; ++i)
502       SET_STRING_ELT(res, i, STRING_ELT(x, (data + i)->idx));
503 
504     UNPROTECT(1);
505   }
506   return res;
507 }
508 /*
509  * So we can use a consistent integer type in printing possibly large indeces.
510  *
511  * Returns in 1 based indexing, -1 in the unlikely case R_xlen_t == intmax_t.
512  */
513 
FANSI_ind(R_xlen_t i)514 intmax_t FANSI_ind(R_xlen_t i) {
515   intmax_t ind = i >= INTMAX_MAX ? -2 : i; // i == INTMAX_MAX is the issue
516   return ind + 1;
517 }
518 
FANSI_check_chr_size(char * start,char * end,R_xlen_t i)519 void FANSI_check_chr_size(char * start, char * end, R_xlen_t i) {
520   if(end - start > FANSI_int_max) {
521     // Can't get to this point with a string that violates, AFAICT
522     // nocov start
523     error(
524       "Internal Error: %s at index [%jd] (3).",
525       "attempting to write string longer than INT_MAX",
526       FANSI_ind(i)
527     );
528     // nocov end
529   }
530 }
531 
532 
533 
534