1 /*
2 * Copyright (C) 2021 Brodie Gaslam
3 *
4 * This file is part of "fansi - ANSI Control Sequence Aware String Functions"
5 *
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * Go to <https://www.r-project.org/Licenses/GPL-2> for a copy of the license.
17 */
18
19 #include "fansi.h"
20 /*
21 * Used to set a global int_max value smaller than INT_MAX for testing
22 * purposes
23 *
24 * This does not affect FANSI_add_int as that we can test separately, and
25 * setting it there prevents us from testing some of the downstream overflow
26 * logic.
27 */
28 int FANSI_int_max = INT_MAX;
29 int FANSI_int_min = INT_MIN; // no way to change this externally
30
FANSI_set_int_max(SEXP x)31 SEXP FANSI_set_int_max(SEXP x) {
32 if(TYPEOF(x) != INTSXP || XLENGTH(x) != 1)
33 error("invalid int_max value"); // nocov
34 int x_int = asInteger(x);
35
36 if(x_int < 1)
37 error("int_max value must be positive"); // nocov
38
39 int old_int = FANSI_int_max;
40 FANSI_int_max = x_int;
41 return ScalarInteger(old_int);
42 }
43 // nocov start
44 // used only for debugging
FANSI_get_int_max()45 SEXP FANSI_get_int_max() {
46 return ScalarInteger(FANSI_int_max);
47 }
48 // nocov end
49 /*
50 * Add integers while checking for overflow
51 *
52 * Note we are stricter than necessary when y is negative because we want to
53 * count hitting INT_MIN as an overflow so that we can use the integer values
54 * in R where INT_MIN is NA.
55 */
56
FANSI_add_int(int x,int y,const char * file,int line)57 int FANSI_add_int(int x, int y, const char * file, int line) {
58 if((y >= 0 && (x > INT_MAX - y)) || (y < 0 && (x <= INT_MIN - y)))
59 error(
60 "Integer overflow in file %s at line %d; %s", file, line,
61 "contact maintainer."
62 );
63 return x + y;
64 }
FANSI_add_int_ext(SEXP x,SEXP y)65 SEXP FANSI_add_int_ext(SEXP x, SEXP y) {
66 if(
67 TYPEOF(x) != INTSXP || XLENGTH(x) != 1 ||
68 TYPEOF(y) != INTSXP || XLENGTH(y) != 1
69 )
70 error("Internal error: arguments must be scalar integers"); // nocov
71
72 return ScalarInteger(FANSI_ADD_INT(asInteger(x), asInteger(y)));
73 }
74 /*
75 * Compute Location and Size of Next ANSI Sequences
76 *
77 * See FANSI_parse_esc as well, where there is similar logic, although we keep
78 * it separated here for speed since we don't try to interpret the string.
79 *
80 * Length includes the ESC and [, and start point is the ESC.
81 *
82 * Validity here means striclty that all the contained escape sequences were
83 * valid CSI sequences as per the strict definition.
84 *
85 * We report the length of invalid sequnces, but you really can't trust them.
86 * The true length may actually be different depending on your terminal,
87 * (e.g. OSX terminal spits out illegal characters to screen but keeps
88 * processing the sequence).
89 *
90 * @param ctl is a bit flag to line up against VALID.WHAT index values, so
91 * (ctl & (1 << 0)) is newlines, (ctl & (1 << 1)) is C0, etc, though note
92 * this does not act
93 */
94
FANSI_find_esc(const char * x,int ctl)95 struct FANSI_csi_pos FANSI_find_esc(const char * x, int ctl) {
96 /***************************************************\
97 | IMPORTANT: KEEP THIS ALIGNED WITH FANSI_read_esc |
98 | although now this also deals with c0 |
99 \***************************************************/
100 int valid = 1;
101 int found = 0;
102 int found_ctl = 0;
103 const char * x_track = x;
104 const char * x_found_start;
105 const char * x_found_end;
106
107 struct FANSI_csi_pos res;
108
109 while(*x_track) {
110 const char x_val = *(x_track++);
111 // use found & found_this in conjunction so that we can allow multiple
112 // adjacent elements to be found in one go
113
114 int found_this = 0;
115
116 // If not normal ASCII or UTF8, examine whether we need to found
117 if(!((x_val > 31 && x_val < 127) || x_val < 0 || x_val > 127)) {
118 if(!found) {
119 // Keep resetting strip start point until we find something we want to
120 // mark
121 x_found_start = x_found_end = x_track - 1;
122 }
123 found_this = 0;
124 if(x_val == 27) {
125 if(*x_track == '[') {
126 // This is a CSI sequence, so it has multiple characters that we
127 // need to skip. The final character is processed outside of here
128 // since it has the same logic for CSI and non CSI sequences
129
130 // skip [
131
132 ++x_track;
133
134 // Skip all the valid parameters tokens
135
136 while(*x_track >= 0x30 && *x_track <= 0x3F) ++x_track;
137
138 // And all the valid intermediates
139
140 int intermediate = 0;
141 while(*x_track >= 0x20 && *x_track <= 0x2F) {
142 if(!intermediate) intermediate = 1;
143 ++x_track;
144 }
145 // Check validity
146
147 int valid_tmp = *x_track >= 0x40 && *x_track <= 0x7E;
148
149 // If not valid, consume all subsequent parameter tokens as that
150 // seems to be terminal.osx and iterm behavior (though terminal.osx
151 // seems pretty picky about what it considers intermediate or even
152 // parameter characters).
153
154 if(!valid_tmp)
155 while(*x_track >= 0x20 && *x_track <= 0x3F) ++x_track;
156
157 valid = valid && valid_tmp;
158
159 // CSI SGR only found if ends in m and no intermediate
160
161 int sgr = !intermediate && *x_track == 'm';
162 found_ctl |= sgr ? FANSI_CTL_SGR & ctl : FANSI_CTL_CSI & ctl;
163 found_this =
164 (sgr && (ctl & FANSI_CTL_SGR)) || // SGR
165 (!sgr && (ctl & FANSI_CTL_CSI)); // CSI
166 } else {
167 // Includes both the C1 set and "controls strings"
168 found_this = ctl & FANSI_CTL_ESC;
169 found_ctl |= ctl & FANSI_CTL_ESC;
170 valid = valid && (*x_track >= 0x40 && *x_track <= 0x7E);
171 }
172 // Advance unless next char is ESC, in which case we want to keep
173 // looping
174
175 if(*x_track && *x_track != 27) x_track++;
176 } else {
177 // x01-x1F, x7F, all the C0 codes
178
179 found_ctl |= (x_val == '\n' ? ctl & FANSI_CTL_NL : ctl & FANSI_CTL_C0);
180 found_this =
181 (x_val == '\n' && (ctl & FANSI_CTL_NL)) ||
182 (x_val != '\n' && (ctl & FANSI_CTL_C0));
183 }
184 if(found_this) {
185 x_found_end = x_track;
186 if(!found) found = 1;
187 }
188 }
189 if(found && !found_this) break;
190 }
191 if(found) {
192 res = (struct FANSI_csi_pos){
193 .start=x_found_start, .len=(x_found_end - x_found_start),
194 .valid=valid, .ctl=found_ctl
195 };
196 } else {
197 res = (struct FANSI_csi_pos){
198 .start=x, .len=0, .valid=valid, ctl=found_ctl
199 };
200 }
201 return res;
202 }
203 /*
204 * Allocates a fresh chunk of memory if the existing one is not large enough.
205 *
206 * We never intend to re-use what's already in memory so we don't realloc. If
207 * allocation is needed the buffer will be either twice as large as it was
208 * before, or size `size` if that is greater than twice the size.
209 */
FANSI_size_buff(struct FANSI_buff * buff,size_t size)210 void FANSI_size_buff(struct FANSI_buff * buff, size_t size) {
211 if(size > buff->len) {
212 // Special case for intial alloc
213
214 if(!buff->len) {
215 if(size < 128 && FANSI_int_max > 128)
216 size = 128; // in theory little penalty to ask this minimum
217 else if(size > (size_t) FANSI_int_max + 1) {
218 // nocov start
219 // assumptions check that SIZE_T fits INT_MAX + 1
220 // too difficult to test, all the code pretty much checks for overflow
221 // before requesting memory
222 error(
223 "Internal Error: requested buff size %zu greater than INT_MAX + 1.",
224 size
225 );
226 // nocov end
227 }
228 else buff->len = size;
229 }
230 // More generic case
231
232 if(size > buff->len) {
233 size_t tmp_double_size = 0;
234 if(buff->len > (size_t) FANSI_int_max + 1 - buff->len) {
235 tmp_double_size = (size_t) FANSI_int_max + 1;
236 } else {
237 tmp_double_size = buff->len + buff->len;
238 }
239 if(size > tmp_double_size) tmp_double_size = size;
240
241 if(tmp_double_size > (size_t) FANSI_int_max + 1)
242 // nocov start
243 // this can't really happen unless size starts off bigger than
244 // INT_MAX + 1
245 error(
246 "%s Requesting %zu",
247 "Internal Error: max allowed buffer size is INT_MAX + 1.",
248 tmp_double_size
249 );
250 // nocov end
251 buff->len = tmp_double_size;
252 }
253 buff->buff = R_alloc(buff->len, sizeof(char));
254 }
255 }
256 /*
257 * Compute how many digits are in a number
258 *
259 * Add an extra character for negative integers.
260 */
261
FANSI_digits_in_int(int x)262 int FANSI_digits_in_int(int x) {
263 int num = 1;
264 if(x < 0) {
265 ++num;
266 x = -x;
267 }
268 while((x = (x / 10))) ++num;
269 return num;
270 }
FANSI_digits_in_int_ext(SEXP y)271 SEXP FANSI_digits_in_int_ext(SEXP y) {
272 if(TYPEOF(y) != INTSXP) error("Internal Error: required int.");
273
274 R_xlen_t ylen = XLENGTH(y);
275 SEXP res = PROTECT(allocVector(INTSXP, ylen));
276
277 for(R_xlen_t i = 0; i < ylen; ++i)
278 INTEGER(res)[i] = FANSI_digits_in_int(INTEGER(y)[i]);
279
280 UNPROTECT(1);
281 return(res);
282 }
283 /*
284 * Compresses the ctl vector into a single integer by encoding each value of
285 * ctl as a bit.
286 */
287
FANSI_ctl_as_int(SEXP ctl)288 int FANSI_ctl_as_int(SEXP ctl) {
289 int ctl_int = 0;
290 int flip_bits = 0;
291 for(R_xlen_t i = 0; i < XLENGTH(ctl); ++i) {
292 // -2 because ctl is 1 indexed (from R), and position 1 means "all", so we
293 // need to shift by 1 for the 0 index, and then by one more for the position
294 // occupied by "all" that really means flip bits
295 int ctl_val = INTEGER(ctl)[i] - 2;
296 if(ctl_val > 4)
297 error("Internal Error: max ctl value allowed is 4.");
298 if(ctl_val < 0) flip_bits = 1;
299 else ctl_int |= 1 << ctl_val;
300 }
301 if(flip_bits) ctl_int ^= FANSI_CTL_ALL;
302 return ctl_int;
303 }
FANSI_ctl_as_int_ext(SEXP ctl)304 SEXP FANSI_ctl_as_int_ext(SEXP ctl) {
305 return ScalarInteger(FANSI_ctl_as_int(ctl));
306 }
307 /*
308 * Partial match a single string byte by byte
309 *
310 * @param x a scalar STRSXP
311 * @param choices an array of strings to match against
312 * @param choice_count how many elements there are in array
313 * @param arg_name the name of the argument to use in an error message if
314 * the match fails.
315 * @return the position in choices that partial matches x, on a 0-index basis
316 * (ie. 0 == 1st, 1 == 2nd, etc.)
317 */
318 // nocov start
FANSI_pmatch(SEXP x,const char ** choices,int choice_count,const char * arg_name)319 int FANSI_pmatch(
320 SEXP x, const char ** choices, int choice_count, const char * arg_name
321 ) {
322 error("remove nocov if we start to use this");
323 if(TYPEOF(x) != STRSXP || XLENGTH(x) != 1)
324 error("Argument `%s` must be a length 1 character vector.", arg_name);
325
326 SEXP x_chrsxp = STRING_ELT(x, 0);
327 const char * x_chr = CHAR(x_chrsxp);
328
329 if(!LENGTH(x_chrsxp))
330 error("Argument `%s` may not be an empty string.", arg_name);
331
332 int match_count = choice_count;
333 int last_match_index = -1;
334
335 for(int i = 0; i < choice_count; ++i) {
336 if(!strncmp(x_chr, choices[i], LENGTH(x_chrsxp))) {
337 last_match_index = i;
338 --match_count;
339 }
340 }
341 if(match_count > 1) {
342 error(
343 "Argument `%s` matches more than one of the possible choices.",
344 arg_name
345 );
346 } else if(!match_count) {
347 error("Argument `%s` does not match any of the valid choices.", arg_name);
348 }
349 // success
350
351 return last_match_index;
352 }
353 // nocov end
354
355 // concept borrowed from utf8-lite
356
FANSI_interrupt(int i)357 void FANSI_interrupt(int i) {if(!(i % 1000)) R_CheckUserInterrupt();}
358 /*
359 * Split an integer vector into two equal size pieces
360 */
361
FANSI_cleave(SEXP x)362 SEXP FANSI_cleave(SEXP x) {
363 if(TYPEOF(x) != INTSXP || XLENGTH(x) % 2)
364 error("Internal error, need even length INTSXP."); // nocov
365
366 R_xlen_t len = XLENGTH(x) / 2;
367 if((size_t) len > SIZE_MAX)
368 error("Internal error: vector too long to cleave"); // nocov
369
370 SEXP a, b;
371 a = PROTECT(allocVector(INTSXP, len));
372 b = PROTECT(allocVector(INTSXP, len));
373
374 size_t size = 0;
375 for(int i = 0; i < (int) sizeof(int); ++i) {
376 if(size > SIZE_MAX - len)
377 error("Internal error: vector too long to cleave"); // nocov
378 size += len;
379 }
380 memcpy(INTEGER(a), INTEGER(x), size);
381 memcpy(INTEGER(b), INTEGER(x) + len, size);
382
383 SEXP res = PROTECT(allocVector(VECSXP, 2));
384 SET_VECTOR_ELT(res, 0, a);
385 SET_VECTOR_ELT(res, 1, b);
386 UNPROTECT(3);
387 return res;
388 }
389 struct datum {int val; R_xlen_t idx;};
390
cmpfun(const void * p,const void * q)391 static int cmpfun (const void * p, const void * q) {
392 struct datum a = *(struct datum *) p;
393 struct datum b = *(struct datum *) q;
394 return(a.val > b.val ? 1 : (a.val < b.val ? -1 : 0));
395 }
396 /*
397 * Equivalent to `order`, but less overhead. May not be faster for longer
398 * vectors but since we call it potentially repeatedly via our initial version
399 * of strsplit, we want to do this to make somewhat less sub-optimal
400 */
FANSI_order(SEXP x)401 SEXP FANSI_order(SEXP x) {
402 if(TYPEOF(x) != INTSXP)
403 error("Internal error: this order only supports ints."); // nocov
404
405 R_xlen_t len = XLENGTH(x);
406 SEXP res;
407
408 if(len) {
409 size_t size = 0;
410 for(int i = 0; i < (int) sizeof(struct datum); ++i) {
411 if(size > SIZE_MAX - len)
412 error("Internal error: vector too long to order"); // nocov
413 size += len;
414 }
415 struct datum * data = (struct datum *) R_alloc(len, sizeof(struct datum));
416
417 for(R_xlen_t i = 0; i < len; ++i)
418 *(data + i) = (struct datum){.val=INTEGER(x)[i], .idx=i + 1};
419
420 qsort(data, (size_t) len, sizeof(struct datum), cmpfun);
421
422 res = PROTECT(allocVector(INTSXP, len));
423
424 for(R_xlen_t i = 0; i < len; ++i) INTEGER(res)[i] = (data + i)->idx;
425 } else {
426 res = PROTECT(allocVector(INTSXP, 0));
427 }
428 UNPROTECT(1);
429 return res;
430 }
431 /*
432 * Equivalent to `sort`, but less overhead. May not be faster for longer
433 * vectors but since we call it potentially repeatedly via our initial version
434 * of strsplit, we want to do this to make somewhat less sub-optimal
435 */
436 // nocov start
cmpfun2(const void * p,const void * q)437 static int cmpfun2 (const void * p, const void * q) {
438 int a = *(int *) p;
439 int b = *(int *) q;
440 return(a > b ? 1 : (a < b ? -1 : 0));
441 }
FANSI_sort_int(SEXP x)442 SEXP FANSI_sort_int(SEXP x) {
443 error("get rid of nocov if we start using");
444 if(TYPEOF(x) != INTSXP)
445 error("Internal error: this order only supports ints."); // nocov
446
447 R_xlen_t len = XLENGTH(x);
448
449 SEXP res = PROTECT(duplicate(x));
450
451 qsort(INTEGER(res), (size_t) len, sizeof(int), cmpfun2);
452
453 UNPROTECT(1);
454 return res;
455 }
456 // nocov end
457 struct datum2 {SEXP val; R_xlen_t idx;};
458
cmpfun3(const void * p,const void * q)459 static int cmpfun3 (const void * p, const void * q) {
460 struct datum2 a = *(struct datum2 *) p;
461 struct datum2 b = *(struct datum2 *) q;
462 const char * a_chr = CHAR(a.val);
463 const char * b_chr = CHAR(b.val);
464 return(a_chr > b_chr ? 1 : (a_chr < b_chr ? -1 : 0));
465 }
466 /*
467 * Sort chars so that equal values are contiguous
468 *
469 * Beware, the sort is not lexical, instead this is sorted by the memory addess
470 * of the character strings backing each CHARSXP.
471 *
472 * The only purpose of this is to support the unique_chr function.
473 */
474
FANSI_sort_chr(SEXP x)475 SEXP FANSI_sort_chr(SEXP x) {
476 if(TYPEOF(x) != STRSXP)
477 error("Internal error: this sort only supports char vecs."); // nocov
478
479 R_xlen_t len = XLENGTH(x);
480 SEXP res = x;
481
482 if(len > 2) {
483 // note we explictily check in assumptions that R_xlen_t is not bigger than
484 // size_t
485
486 size_t size = 0;
487 for(int i = 0; i < (int) sizeof(struct datum); ++i) {
488 if(size > SIZE_MAX - len)
489 error("Internal error: vector too long to order"); // nocov
490 size += len;
491 }
492 struct datum2 * data = (struct datum2 *) R_alloc(len, sizeof(struct datum2));
493
494 for(R_xlen_t i = 0; i < len; ++i)
495 *(data + i) = (struct datum2){.val=STRING_ELT(x, i), .idx=i};
496
497 qsort(data, (size_t) len, sizeof(struct datum2), cmpfun3);
498
499 res = PROTECT(allocVector(STRSXP, len));
500
501 for(R_xlen_t i = 0; i < len; ++i)
502 SET_STRING_ELT(res, i, STRING_ELT(x, (data + i)->idx));
503
504 UNPROTECT(1);
505 }
506 return res;
507 }
508 /*
509 * So we can use a consistent integer type in printing possibly large indeces.
510 *
511 * Returns in 1 based indexing, -1 in the unlikely case R_xlen_t == intmax_t.
512 */
513
FANSI_ind(R_xlen_t i)514 intmax_t FANSI_ind(R_xlen_t i) {
515 intmax_t ind = i >= INTMAX_MAX ? -2 : i; // i == INTMAX_MAX is the issue
516 return ind + 1;
517 }
518
FANSI_check_chr_size(char * start,char * end,R_xlen_t i)519 void FANSI_check_chr_size(char * start, char * end, R_xlen_t i) {
520 if(end - start > FANSI_int_max) {
521 // Can't get to this point with a string that violates, AFAICT
522 // nocov start
523 error(
524 "Internal Error: %s at index [%jd] (3).",
525 "attempting to write string longer than INT_MAX",
526 FANSI_ind(i)
527 );
528 // nocov end
529 }
530 }
531
532
533
534