1 /*
2 * R : A Computer Language for Statistical Data Analysis
3 * Copyright (C) 1997--2021 The R Core Team
4 * Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, a copy is available at
18 * https://www.R-project.org/Licenses/
19 */
20
21 #ifdef HAVE_CONFIG_H
22 #include <config.h>
23 #endif
24
25 #define R_USE_SIGNALS 1
26 #include <Defn.h>
27 #include <Internal.h>
28 #include <R_ext/Print.h>
29 #include <ctype.h> /* for isspace */
30 #include <float.h> /* for DBL_MAX */
31 #include <R_ext/Itermacros.h> /* for ITERATE_BY_REGION */
32
33 #undef COMPILING_R
34
35 #define R_imax2(x, y) ((x < y) ? y : x)
36 #include <Print.h>
37
38 #ifdef HAVE_UNISTD_H
39 #include <unistd.h>
40 #endif
41
42 #ifdef Win32
43 void R_UTF8fixslash(char *s);
44 static void R_wfixslash(wchar_t *s);
45 #endif
46
47 #ifdef __cplusplus
48 #include "Clinkage.h"
49
50 extern "C" {
51 #endif
52
53 #if defined FC_LEN_T
54 # include <stddef.h>
55 void F77_SYMBOL(rwarnc)(char *msg, int *nchar, FC_LEN_T msg_len);
56 void NORET F77_SYMBOL(rexitc)(char *msg, int *nchar, FC_LEN_T msg_len);
57 #else
58 void F77_SYMBOL(rwarnc)(char *msg, int *nchar);
59 void NORET F77_SYMBOL(rexitc)(char *msg, int *nchar);
60 #endif
61
62 #ifdef __cplusplus
63 }
64 #endif
65
66 #include <rlocale.h>
67
68 /* Many small functions are included from ../include/Rinlinedfuns.h */
69
nrows(SEXP s)70 int nrows(SEXP s) // ~== NROW(.) in R
71 {
72 SEXP t;
73 if (isVector(s) || isList(s)) {
74 t = getAttrib(s, R_DimSymbol);
75 if (t == R_NilValue) return LENGTH(s);
76 return INTEGER(t)[0];
77 }
78 else if (isFrame(s)) {
79 return nrows(CAR(s));
80 }
81 else error(_("object is not a matrix"));
82 return -1;
83 }
84
85
ncols(SEXP s)86 int ncols(SEXP s) // ~== NCOL(.) in R
87 {
88 SEXP t;
89 if (isVector(s) || isList(s)) {
90 t = getAttrib(s, R_DimSymbol);
91 if (t == R_NilValue) return 1;
92 if (LENGTH(t) >= 2) return INTEGER(t)[1];
93 /* This is a 1D (or possibly 0D array) */
94 return 1;
95 }
96 else if (isFrame(s)) {
97 return length(s);
98 }
99 else error(_("object is not a matrix"));
100 return -1;/*NOTREACHED*/
101 }
102
103 #ifdef UNUSED
104 const static char type_msg[] = "invalid type passed to internal function\n";
105
internalTypeCheck(SEXP call,SEXP s,SEXPTYPE type)106 void internalTypeCheck(SEXP call, SEXP s, SEXPTYPE type)
107 {
108 if (TYPEOF(s) != type) {
109 if (call)
110 errorcall(call, type_msg);
111 else
112 error(type_msg);
113 }
114 }
115 #endif
116
117 const static char * const truenames[] = {
118 "T",
119 "True",
120 "TRUE",
121 "true",
122 (char *) NULL,
123 };
124
125 const static char * const falsenames[] = {
126 "F",
127 "False",
128 "FALSE",
129 "false",
130 (char *) NULL,
131 };
132
asChar(SEXP x)133 SEXP asChar(SEXP x)
134 {
135 if (isVectorAtomic(x) && XLENGTH(x) >= 1) {
136 int w, d, e, wi, di, ei;
137 char buf[MAXELTSIZE]; /* Probably 100 would suffice */
138
139 switch (TYPEOF(x)) {
140 case LGLSXP:
141 if (LOGICAL(x)[0] == NA_LOGICAL)
142 return NA_STRING;
143 if (LOGICAL(x)[0])
144 sprintf(buf, "TRUE");
145 else
146 sprintf(buf, "FALSE");
147 return mkChar(buf);
148 case INTSXP:
149 if (INTEGER(x)[0] == NA_INTEGER)
150 return NA_STRING;
151 snprintf(buf, MAXELTSIZE, "%d", INTEGER(x)[0]);
152 return mkChar(buf);
153 case REALSXP:
154 PrintDefaults();
155 formatReal(REAL(x), 1, &w, &d, &e, 0);
156 return mkChar(EncodeReal0(REAL(x)[0], w, d, e, OutDec));
157 case CPLXSXP:
158 PrintDefaults();
159 formatComplex(COMPLEX(x), 1, &w, &d, &e, &wi, &di, &ei, 0);
160 return mkChar(EncodeComplex(COMPLEX(x)[0], w, d, e, wi, di, ei, OutDec));
161 case STRSXP:
162 return STRING_ELT(x, 0);
163 default:
164 return NA_STRING;
165 }
166 } else if(TYPEOF(x) == CHARSXP) {
167 return x;
168 } else if(TYPEOF(x) == SYMSXP)
169 return PRINTNAME(x);
170 return NA_STRING;
171 }
172
isUnordered(SEXP s)173 Rboolean isUnordered(SEXP s)
174 {
175 return (TYPEOF(s) == INTSXP
176 && inherits(s, "factor")
177 && !inherits(s, "ordered"));
178 }
179
isOrdered(SEXP s)180 Rboolean isOrdered(SEXP s)
181 {
182 return (TYPEOF(s) == INTSXP
183 && inherits(s, "factor")
184 && inherits(s, "ordered"));
185 }
186
187
188 const static struct {
189 const char * const str;
190 const int type;
191 }
192 TypeTable[] = {
193 { "NULL", NILSXP }, /* real types */
194 { "symbol", SYMSXP },
195 { "pairlist", LISTSXP },
196 { "closure", CLOSXP },
197 { "environment", ENVSXP },
198 { "promise", PROMSXP },
199 { "language", LANGSXP },
200 { "special", SPECIALSXP },
201 { "builtin", BUILTINSXP },
202 { "char", CHARSXP },
203 { "logical", LGLSXP },
204 { "integer", INTSXP },
205 { "double", REALSXP }, /*- "real", for R <= 0.61.x */
206 { "complex", CPLXSXP },
207 { "character", STRSXP },
208 { "...", DOTSXP },
209 { "any", ANYSXP },
210 { "expression", EXPRSXP },
211 { "list", VECSXP },
212 { "externalptr", EXTPTRSXP },
213 { "bytecode", BCODESXP },
214 { "weakref", WEAKREFSXP },
215 { "raw", RAWSXP },
216 { "S4", S4SXP },
217 /* aliases : */
218 { "numeric", REALSXP },
219 { "name", SYMSXP },
220
221 { (char *)NULL, -1 }
222 };
223
224
str2type(const char * s)225 SEXPTYPE str2type(const char *s)
226 {
227 int i;
228 for (i = 0; TypeTable[i].str; i++) {
229 if (!strcmp(s, TypeTable[i].str))
230 return (SEXPTYPE) TypeTable[i].type;
231 }
232 /* SEXPTYPE is an unsigned int, so the compiler warns us w/o the cast. */
233 return (SEXPTYPE) -1;
234 }
235
236 static struct {
237 const char *cstrName;
238 SEXP rcharName;
239 SEXP rstrName;
240 SEXP rsymName;
241 } Type2Table[MAX_NUM_SEXPTYPE];
242
243
findTypeInTypeTable(SEXPTYPE t)244 static int findTypeInTypeTable(SEXPTYPE t)
245 {
246 for (int i = 0; TypeTable[i].str; i++)
247 if (TypeTable[i].type == t) return i;
248
249 return -1;
250 }
251
252 // called from main.c
253 attribute_hidden
InitTypeTables(void)254 void InitTypeTables(void) {
255
256 /* Type2Table */
257 for (int type = 0; type < MAX_NUM_SEXPTYPE; type++) {
258 int j = findTypeInTypeTable(type);
259
260 if (j != -1) {
261 const char *cstr = TypeTable[j].str;
262 SEXP rchar = PROTECT(mkChar(cstr));
263 SEXP rstr = ScalarString(rchar);
264 MARK_NOT_MUTABLE(rstr);
265 R_PreserveObject(rstr);
266 SEXP rsym = install(cstr);
267
268 Type2Table[type].cstrName = cstr;
269 Type2Table[type].rcharName = rchar;
270 Type2Table[type].rstrName = rstr;
271 Type2Table[type].rsymName = rsym;
272 UNPROTECT(1); /* rchar */
273 } else {
274 Type2Table[type].cstrName = NULL;
275 Type2Table[type].rcharName = NULL;
276 Type2Table[type].rstrName = NULL;
277 Type2Table[type].rsymName = NULL;
278 }
279 }
280 }
281
type2str_nowarn(SEXPTYPE t)282 SEXP type2str_nowarn(SEXPTYPE t) /* returns a CHARSXP */
283 {
284 // if (t >= 0 && t < MAX_NUM_SEXPTYPE) { /* branch not really needed */
285 SEXP res = Type2Table[t].rcharName;
286 if (res != NULL) return res;
287 // }
288 return R_NilValue;
289 }
290
type2str(SEXPTYPE t)291 SEXP type2str(SEXPTYPE t) /* returns a CHARSXP */
292 {
293 SEXP s = type2str_nowarn(t);
294 if (s != R_NilValue) {
295 return s;
296 }
297 warning(_("type %d is unimplemented in '%s'"), t, "type2str");
298 char buf[50];
299 snprintf(buf, 50, "unknown type #%d", t);
300 return mkChar(buf);
301 }
302
type2rstr(SEXPTYPE t)303 SEXP type2rstr(SEXPTYPE t) /* returns a STRSXP */
304 {
305 // if (t < MAX_NUM_SEXPTYPE) {
306 SEXP res = Type2Table[t].rstrName;
307 if (res != NULL) return res;
308 // }
309 error(_("type %d is unimplemented in '%s'"), t,
310 "type2ImmutableScalarString");
311 return R_NilValue; /* for -Wall */
312 }
313
type2char(SEXPTYPE t)314 const char *type2char(SEXPTYPE t) /* returns a char* */
315 {
316 // if (t >=0 && t < MAX_NUM_SEXPTYPE) { /* branch not really needed */
317 const char * res = Type2Table[t].cstrName;
318 if (res != NULL) return res;
319 // }
320 warning(_("type %d is unimplemented in '%s'"), t, "type2char");
321 static char buf[50];
322 snprintf(buf, 50, "unknown type #%d", t);
323 return buf;
324 }
325
326 #ifdef UNUSED
type2symbol(SEXPTYPE t)327 SEXP NORET type2symbol(SEXPTYPE t)
328 {
329 // if (t >= 0 && t < MAX_NUM_SEXPTYPE) { /* branch not really needed */
330 SEXP res = Type2Table[t].rsymName;
331 if (res != NULL) return res;
332 // }
333 error(_("type %d is unimplemented in '%s'"), t, "type2symbol");
334 }
335 #endif
336
337 attribute_hidden
UNIMPLEMENTED_TYPEt(const char * s,SEXPTYPE t)338 void NORET UNIMPLEMENTED_TYPEt(const char *s, SEXPTYPE t)
339 {
340 int i;
341
342 for (i = 0; TypeTable[i].str; i++) {
343 if (TypeTable[i].type == t)
344 error(_("unimplemented type '%s' in '%s'\n"), TypeTable[i].str, s);
345 }
346 error(_("unimplemented type (%d) in '%s'\n"), t, s);
347 }
348
UNIMPLEMENTED_TYPE(const char * s,SEXP x)349 void NORET UNIMPLEMENTED_TYPE(const char *s, SEXP x)
350 {
351 UNIMPLEMENTED_TYPEt(s, TYPEOF(x));
352 }
353
354 # include <R_ext/Riconv.h>
355 # include <sys/param.h>
356 # include <errno.h>
357
358
359 /* Previous versions of R (< 2.3.0) assumed wchar_t was in Unicode
360 (and it commonly is). These functions do not. */
361 # ifdef WORDS_BIGENDIAN
362 static const char UCS2ENC[] = "UCS-2BE";
363 # else
364 static const char UCS2ENC[] = "UCS-2LE";
365 # endif
366
367
368 /*
369 * out=NULL returns the number of the MBCS chars
370 */
371 /* Note: this does not terminate out, as all current uses are to look
372 * at 'out' a wchar at a time, and sometimes just one char.
373 */
mbcsToUcs2(const char * in,R_ucs2_t * out,int nout,int enc)374 size_t mbcsToUcs2(const char *in, R_ucs2_t *out, int nout, int enc)
375 {
376 void *cd = NULL ;
377 const char *i_buf;
378 char *o_buf;
379 size_t i_len, o_len, status, wc_len;
380 /* out length */
381 wc_len = (enc == CE_UTF8)? utf8towcs(NULL, in, 0) : mbstowcs(NULL, in, 0);
382 if (out == NULL || (int)wc_len < 0) return wc_len;
383
384 if ((void*)-1 == (cd = Riconv_open(UCS2ENC, (enc == CE_UTF8) ? "UTF-8": "")))
385 return (size_t) -1;
386
387 i_buf = (char *)in;
388 i_len = strlen(in); /* not including terminator */
389 o_buf = (char *)out;
390 o_len = ((size_t) nout) * sizeof(R_ucs2_t);
391 status = Riconv(cd, &i_buf, (size_t *)&i_len, &o_buf, (size_t *)&o_len);
392 int serrno = errno;
393 Riconv_close(cd);
394 if (status == (size_t)-1) {
395 switch(serrno){
396 case EINVAL:
397 return (size_t) -2;
398 case EILSEQ:
399 return (size_t) -1;
400 case E2BIG:
401 break;
402 default:
403 errno = EILSEQ;
404 return (size_t) -1;
405 }
406 }
407 return wc_len; /* status would be better? */
408 }
409
410
411 #include <wctype.h>
412
413 /* This one is not in Rinternals.h, but is used in internet module */
isBlankString(const char * s)414 Rboolean isBlankString(const char *s)
415 {
416 if(mbcslocale) {
417 wchar_t wc; size_t used; mbstate_t mb_st;
418 mbs_init(&mb_st);
419 // This does not allow for surrogate pairs, but all blanks are in BMP
420 while( (used = Mbrtowc(&wc, s, R_MB_CUR_MAX, &mb_st)) ) {
421 if(!iswspace((wint_t) wc)) return FALSE;
422 s += used;
423 }
424 } else
425 while (*s)
426 if (!isspace((int)*s++)) return FALSE;
427 return TRUE;
428 }
429
StringBlank(SEXP x)430 Rboolean StringBlank(SEXP x)
431 {
432 if (x == R_NilValue) return TRUE;
433 else return CHAR(x)[0] == '\0';
434 }
435
436 /* Function to test whether a string is a true value */
437
StringTrue(const char * name)438 Rboolean StringTrue(const char *name)
439 {
440 int i;
441 for (i = 0; truenames[i]; i++)
442 if (!strcmp(name, truenames[i]))
443 return TRUE;
444 return FALSE;
445 }
446
StringFalse(const char * name)447 Rboolean StringFalse(const char *name)
448 {
449 int i;
450 for (i = 0; falsenames[i]; i++)
451 if (!strcmp(name, falsenames[i]))
452 return TRUE;
453 return FALSE;
454 }
455
456 /* used in bind.c and options.c */
EnsureString(SEXP s)457 SEXP attribute_hidden EnsureString(SEXP s)
458 {
459 switch(TYPEOF(s)) {
460 case SYMSXP:
461 s = PRINTNAME(s);
462 break;
463 case STRSXP:
464 s = STRING_ELT(s, 0);
465 break;
466 case CHARSXP:
467 break;
468 case NILSXP:
469 s = R_BlankString;
470 break;
471 default:
472 error(_("invalid tag in name extraction"));
473 }
474 return s;
475 }
476
477 // NB: have checkArity(a,b) := Rf_checkArityCall(a,b,call)
Rf_checkArityCall(SEXP op,SEXP args,SEXP call)478 void Rf_checkArityCall(SEXP op, SEXP args, SEXP call)
479 {
480 if (PRIMARITY(op) >= 0 && PRIMARITY(op) != length(args)) {
481 /* FIXME: ngettext reguires unsigned long, but %u would seem appropriate */
482 if (PRIMINTERNAL(op))
483 error(ngettext("%d argument passed to .Internal(%s) which requires %d",
484 "%d arguments passed to .Internal(%s) which requires %d",
485 (unsigned long) length(args)),
486 length(args), PRIMNAME(op), PRIMARITY(op));
487 else
488 errorcall(call,
489 ngettext("%d argument passed to '%s' which requires %d",
490 "%d arguments passed to '%s' which requires %d",
491 (unsigned long) length(args)),
492 length(args), PRIMNAME(op), PRIMARITY(op));
493 }
494 }
495
Rf_check1arg(SEXP arg,SEXP call,const char * formal)496 void attribute_hidden Rf_check1arg(SEXP arg, SEXP call, const char *formal)
497 {
498 SEXP tag = TAG(arg);
499 if (tag == R_NilValue) return;
500 const char *supplied = CHAR(PRINTNAME(tag));
501 size_t ns = strlen(supplied);
502 if (ns > strlen(formal) || strncmp(supplied, formal, ns))
503 errorcall(call, _("supplied argument name '%s' does not match '%s'"),
504 supplied, formal);
505 }
506
507
nthcdr(SEXP s,int n)508 SEXP nthcdr(SEXP s, int n)
509 {
510 if (isList(s) || isLanguage(s) || isFrame(s) || TYPEOF(s) == DOTSXP ) {
511 while( n-- > 0 ) {
512 if (s == R_NilValue)
513 error(_("'nthcdr' list shorter than %d"), n);
514 s = CDR(s);
515 }
516 return s;
517 }
518 else error(_("'nthcdr' needs a list to CDR down"));
519 return R_NilValue;/* for -Wall */
520 }
521
522 /* Destructively removes R_NilValue ('NULL') elements from a pairlist. */
R_listCompact(SEXP s,Rboolean keep_initial)523 SEXP R_listCompact(SEXP s, Rboolean keep_initial) {
524 if(!keep_initial)
525 // skip initial NULL values
526 while (s != R_NilValue && CAR(s) == R_NilValue)
527 s = CDR(s);
528
529 SEXP val = s;
530 SEXP prev = s;
531 while (s != R_NilValue) {
532 s = CDR(s);
533 if (CAR(s) == R_NilValue) // skip it
534 SETCDR(prev, CDR(s));
535 else
536 prev = s;
537 }
538 return val;
539 }
540
541
542 /* This is a primitive (with no arguments) */
do_nargs(SEXP call,SEXP op,SEXP args,SEXP rho)543 SEXP attribute_hidden do_nargs(SEXP call, SEXP op, SEXP args, SEXP rho)
544 {
545 RCNTXT *cptr;
546 int nargs = NA_INTEGER;
547
548 checkArity(op, args);
549 for (cptr = R_GlobalContext; cptr != NULL; cptr = cptr->nextcontext) {
550 if ((cptr->callflag & CTXT_FUNCTION) && cptr->cloenv == rho) {
551 nargs = length(cptr->promargs);
552 break;
553 }
554 }
555 return ScalarInteger(nargs);
556 }
557
558
559 /* formerly used in subscript.c, in Utils.h */
setIVector(int * vec,int len,int val)560 void attribute_hidden setIVector(int * vec, int len, int val)
561 {
562 for (int i = 0; i < len; i++) vec[i] = val;
563 }
564
565
566 /* unused in R, in Utils.h, may have been used in Rcpp at some point,
567 but not any more (as per Nov. 2018) */
setRVector(double * vec,int len,double val)568 void attribute_hidden setRVector(double * vec, int len, double val)
569 {
570 for (int i = 0; i < len; i++) vec[i] = val;
571 }
572
573 /* unused in R, in Rinternals.h */
setSVector(SEXP * vec,int len,SEXP val)574 void setSVector(SEXP * vec, int len, SEXP val)
575 {
576 for (int i = 0; i < len; i++) vec[i] = val;
577 }
578
579
isFree(SEXP val)580 Rboolean isFree(SEXP val)
581 {
582 SEXP t;
583 for (t = R_FreeSEXP; t != R_NilValue; t = CAR(t))
584 if (val == t)
585 return TRUE;
586 return FALSE;
587 }
588
589
590 /* Debugging functions (hence the d-prefix). */
591 /* These are intended to be called interactively from */
592 /* a debugger such as gdb, so you don't have to remember */
593 /* the names of the data structure components. */
594
dtype(SEXP q)595 int dtype(SEXP q)
596 {
597 return((int)TYPEOF(q));
598 }
599
600
dcar(SEXP l)601 SEXP dcar(SEXP l)
602 {
603 return(CAR(l));
604 }
605
606
dcdr(SEXP l)607 SEXP dcdr(SEXP l)
608 {
609 return(CDR(l));
610 }
611
612
isort_with_index(int * x,int * indx,int n)613 static void isort_with_index(int *x, int *indx, int n)
614 {
615 int i, j, h, iv, v;
616
617 for (h = 1; h <= n / 9; h = 3 * h + 1);
618 for (; h > 0; h /= 3)
619 for (i = h; i < n; i++) {
620 v = x[i]; iv = indx[i];
621 j = i;
622 while (j >= h && x[j - h] > v)
623 { x[j] = x[j - h]; indx[j] = indx[j-h]; j -= h; }
624 x[j] = v; indx[j] = iv;
625 }
626 }
627
628
629 // body(x) without attributes "srcref", "srcfile", "wholeSrcref" :
630 // NOTE: Callers typically need PROTECT(R_body_no_src(.))
R_body_no_src(SEXP x)631 SEXP R_body_no_src(SEXP x) {
632 SEXP b = PROTECT(duplicate(BODY_EXPR(x)));
633 /* R's removeSource() works *recursively* on the body()
634 in ../library/utils/R/sourceutils.R but that seems unneeded (?) */
635 setAttrib(b, R_SrcrefSymbol, R_NilValue);
636 setAttrib(b, R_SrcfileSymbol, R_NilValue);
637 setAttrib(b, R_WholeSrcrefSymbol, R_NilValue);
638 UNPROTECT(1);
639 return b;
640 }
641
642 /* merge(xinds, yinds, all.x, all.y) */
643 /* xinds, yinds are along x and y rows matching into the (numeric)
644 common indices, with 0 for non-matches.
645
646 all.x and all.y are boolean.
647
648 The return value is a list with 4 elements (xi, yi, x.alone, y.alone),
649 which are index vectors for rows of x or y.
650 */
do_merge(SEXP call,SEXP op,SEXP args,SEXP rho)651 SEXP attribute_hidden do_merge(SEXP call, SEXP op, SEXP args, SEXP rho)
652 {
653 SEXP xi, yi, ansx, ansy, ans;
654 int nx = 0, ny = 0, i, j, k, nx_lone = 0, ny_lone = 0;
655 int all_x = 0, all_y = 0, ll = 0/* "= 0" : for -Wall */;
656 int nnx, nny;
657
658 checkArity(op, args);
659 xi = CAR(args);
660 // NB: long vectors are not supported for input
661 if ( !isInteger(xi) || !(nx = LENGTH(xi)) )
662 error(_("invalid '%s' argument"), "xinds");
663 yi = CADR(args);
664 if ( !isInteger(yi) || !(ny = LENGTH(yi)) )
665 error(_("invalid '%s' argument"), "yinds");
666 if(!LENGTH(ans = CADDR(args)) || NA_LOGICAL == (all_x = asLogical(ans)))
667 error(_("'all.x' must be TRUE or FALSE"));
668 if(!LENGTH(ans = CADDDR(args))|| NA_LOGICAL == (all_y = asLogical(ans)))
669 error(_("'all.y' must be TRUE or FALSE"));
670
671 /* 0. sort the indices */
672 int *ix = (int *) R_alloc((size_t) nx, sizeof(int));
673 int *iy = (int *) R_alloc((size_t) ny, sizeof(int));
674 for(i = 0; i < nx; i++) ix[i] = i+1;
675 for(i = 0; i < ny; i++) iy[i] = i+1;
676 isort_with_index(INTEGER(xi), ix, nx);
677 isort_with_index(INTEGER(yi), iy, ny);
678
679 /* 1. determine result sizes */
680 for (i = 0; i < nx; i++)
681 if (INTEGER(xi)[i] > 0) break;
682 nx_lone = i;
683 for (i = 0; i < ny; i++)
684 if (INTEGER(yi)[i] > 0) break;
685 ny_lone = i;
686 double dnans = 0;
687 for (i = nx_lone, j = ny_lone; i < nx; i = nnx, j = nny) {
688 int tmp = INTEGER(xi)[i];
689 for(nnx = i; nnx < nx; nnx++) if(INTEGER(xi)[nnx] != tmp) break;
690 /* the next is not in theory necessary,
691 since we have the common values only */
692 for(; j < ny; j++) if(INTEGER(yi)[j] >= tmp) break;
693 for(nny = j; nny < ny; nny++) if(INTEGER(yi)[nny] != tmp) break;
694 /* printf("i %d nnx %d j %d nny %d\n", i, nnx, j, nny); */
695 dnans += ((double)(nnx-i))*(nny-j);
696 }
697 if (dnans > R_XLEN_T_MAX)
698 error(_("number of rows in the result exceeds maximum vector length"));
699 R_xlen_t nans = (int) dnans;
700
701
702 /* 2. allocate and store result components */
703
704 const char *nms[] = {"xi", "yi", "x.alone", "y.alone", ""};
705 ans = PROTECT(mkNamed(VECSXP, nms));
706 ansx = allocVector(INTSXP, nans); SET_VECTOR_ELT(ans, 0, ansx);
707 ansy = allocVector(INTSXP, nans); SET_VECTOR_ELT(ans, 1, ansy);
708
709 if(all_x) {
710 SEXP x_lone = allocVector(INTSXP, nx_lone);
711 SET_VECTOR_ELT(ans, 2, x_lone);
712 for (i = 0, ll = 0; i < nx_lone; i++)
713 INTEGER(x_lone)[ll++] = ix[i];
714 }
715
716 if(all_y) {
717 SEXP y_lone = allocVector(INTSXP, ny_lone);
718 SET_VECTOR_ELT(ans, 3, y_lone);
719 for (i = 0, ll = 0; i < ny_lone; i++)
720 INTEGER(y_lone)[ll++] = iy[i];
721 }
722
723 for (i = nx_lone, j = ny_lone, k = 0; i < nx; i = nnx, j = nny) {
724 int tmp = INTEGER(xi)[i];
725 for(nnx = i; nnx < nx; nnx++) if(INTEGER(xi)[nnx] != tmp) break;
726 for(; j < ny; j++) if(INTEGER(yi)[j] >= tmp) break;
727 for(nny = j; nny < ny; nny++) if(INTEGER(yi)[nny] != tmp) break;
728 for(int i0 = i; i0 < nnx; i0++)
729 for(int j0 = j; j0 < nny; j0++) {
730 INTEGER(ansx)[k] = ix[i0];
731 INTEGER(ansy)[k++] = iy[j0];
732 }
733 }
734
735 UNPROTECT(1);
736 return ans;
737 }
738
739
740 /* Functions for getting and setting the working directory. */
741 #ifdef Win32
742 # define WIN32_LEAN_AND_MEAN 1
743 # include <windows.h>
744 #endif
745
intern_getwd(void)746 SEXP static intern_getwd(void)
747 {
748 SEXP rval = R_NilValue;
749 char buf[4*PATH_MAX+1];
750
751 #ifdef Win32
752 {
753 wchar_t wbuf[PATH_MAX+1];
754 int res = GetCurrentDirectoryW(PATH_MAX, wbuf);
755 if(res > 0) {
756 wcstoutf8(buf, wbuf, sizeof(buf));
757 R_UTF8fixslash(buf);
758 PROTECT(rval = allocVector(STRSXP, 1));
759 SET_STRING_ELT(rval, 0, mkCharCE(buf, CE_UTF8));
760 UNPROTECT(1);
761 }
762 }
763 #else
764 char *res = getcwd(buf, PATH_MAX); /* can return NULL */
765 if(res) rval = mkString(buf);
766 #endif
767 return(rval);
768 }
769
do_getwd(SEXP call,SEXP op,SEXP args,SEXP rho)770 SEXP attribute_hidden do_getwd(SEXP call, SEXP op, SEXP args, SEXP rho)
771 {
772 checkArity(op, args);
773
774 return(intern_getwd());
775 }
776
777
778 #if defined(Win32) && defined(_MSC_VER)
779 # include <direct.h> /* for chdir, via io.h */
780 #endif
781
do_setwd(SEXP call,SEXP op,SEXP args,SEXP rho)782 SEXP attribute_hidden do_setwd(SEXP call, SEXP op, SEXP args, SEXP rho)
783 {
784 SEXP s = R_NilValue, wd = R_NilValue; /* -Wall */
785
786 checkArity(op, args);
787 if (!isPairList(args) || !isValidString(s = CAR(args)))
788 error(_("character argument expected"));
789 if (STRING_ELT(s, 0) == NA_STRING)
790 error(_("missing value is invalid"));
791
792 /* get current directory to return */
793 PROTECT(wd = intern_getwd());
794
795 #ifdef Win32
796 {
797 const wchar_t *path = filenameToWchar(STRING_ELT(s, 0), TRUE);
798 if(_wchdir(path) < 0)
799 error(_("cannot change working directory"));
800 }
801 #else
802 {
803 const char *path
804 = R_ExpandFileName(translateCharFP(STRING_ELT(s, 0)));
805 if(chdir(path) < 0)
806 error(_("cannot change working directory"));
807 }
808 #endif
809 UNPROTECT(1); /* wd */
810 return(wd);
811 }
812
813 /* remove portion of path before file separator if one exists */
814
815 #ifdef Win32
do_basename(SEXP call,SEXP op,SEXP args,SEXP rho)816 SEXP attribute_hidden do_basename(SEXP call, SEXP op, SEXP args, SEXP rho)
817 {
818 SEXP ans, s = R_NilValue; /* -Wall */
819 char sp[4*PATH_MAX+1];
820 wchar_t buf[PATH_MAX], *p;
821 const wchar_t *pp;
822 int i, n;
823
824 checkArity(op, args);
825 if (TYPEOF(s = CAR(args)) != STRSXP)
826 error(_("a character vector argument expected"));
827 PROTECT(ans = allocVector(STRSXP, n = LENGTH(s)));
828 for(i = 0; i < n; i++) {
829 if (STRING_ELT(s, i) == NA_STRING)
830 SET_STRING_ELT(ans, i, NA_STRING);
831 else {
832 pp = filenameToWchar(STRING_ELT(s, i), TRUE);
833 if (wcslen(pp) > PATH_MAX - 1) error(_("path too long"));
834 wcscpy(buf, pp);
835 R_wfixslash(buf);
836 /* remove trailing file separator(s) */
837 if (*buf) {
838 p = buf + wcslen(buf) - 1;
839 while (p >= buf && *p == L'/') *(p--) = L'\0';
840 }
841 if ((p = wcsrchr(buf, L'/'))) p++; else p = buf;
842 wcstoutf8(sp, p, sizeof(sp));
843 SET_STRING_ELT(ans, i, mkCharCE(sp, CE_UTF8));
844 }
845 }
846 UNPROTECT(1);
847 return(ans);
848 }
849 #else
do_basename(SEXP call,SEXP op,SEXP args,SEXP rho)850 SEXP attribute_hidden do_basename(SEXP call, SEXP op, SEXP args, SEXP rho)
851 {
852 SEXP ans, s = R_NilValue; /* -Wall */
853 char buf[PATH_MAX], *p, fsp = FILESEP[0];
854 const char *pp;
855 int i, n;
856
857 checkArity(op, args);
858 if (TYPEOF(s = CAR(args)) != STRSXP)
859 error(_("a character vector argument expected"));
860 PROTECT(ans = allocVector(STRSXP, n = LENGTH(s)));
861 for(i = 0; i < n; i++) {
862 if (STRING_ELT(s, i) == NA_STRING)
863 SET_STRING_ELT(ans, i, NA_STRING);
864 else {
865 pp = R_ExpandFileName(translateCharFP(STRING_ELT(s, i)));
866 if (strlen(pp) > PATH_MAX - 1)
867 error(_("path too long"));
868 strcpy (buf, pp);
869 if (*buf) {
870 p = buf + strlen(buf) - 1;
871 while (p >= buf && *p == fsp) *(p--) = '\0';
872 }
873 if ((p = Rf_strrchr(buf, fsp)))
874 p++;
875 else
876 p = buf;
877 SET_STRING_ELT(ans, i, mkChar(p));
878 }
879 }
880 UNPROTECT(1);
881 return(ans);
882 }
883 #endif
884
885 /* remove portion of path after last file separator if one exists, else
886 return "."
887 */
888
889 #ifdef Win32
do_dirname(SEXP call,SEXP op,SEXP args,SEXP rho)890 SEXP attribute_hidden do_dirname(SEXP call, SEXP op, SEXP args, SEXP rho)
891 {
892 SEXP ans, s = R_NilValue; /* -Wall */
893 wchar_t buf[PATH_MAX], *p;
894 const wchar_t *pp;
895 char sp[4*PATH_MAX+1];
896 int i, n;
897
898 checkArity(op, args);
899 if (TYPEOF(s = CAR(args)) != STRSXP)
900 error(_("a character vector argument expected"));
901 PROTECT(ans = allocVector(STRSXP, n = LENGTH(s)));
902 for(i = 0; i < n; i++) {
903 if (STRING_ELT(s, i) == NA_STRING)
904 SET_STRING_ELT(ans, i, NA_STRING);
905 else {
906 memset(sp, 0, 4*PATH_MAX);
907 pp = filenameToWchar(STRING_ELT(s, i), TRUE);
908 if (wcslen(pp) > PATH_MAX - 1)
909 error(_("path too long"));
910 if (wcslen(pp)) {
911 wcscpy (buf, pp);
912 R_wfixslash(buf);
913 /* remove trailing file separator(s) */
914 p = buf + wcslen(buf) - 1;
915 while (p > buf && *p == L'/'
916 && (p > buf+2 || *(p-1) != L':')) *p-- = L'\0';
917 p = wcsrchr(buf, L'/');
918 if(p == NULL) wcscpy(buf, L".");
919 else {
920 while(p > buf && *p == L'/'
921 /* this covers both drives and network shares */
922 && (p > buf+2 || *(p-1) != L':')) --p;
923 p[1] = L'\0';
924 }
925 wcstoutf8(sp, buf, sizeof(sp));
926 }
927 SET_STRING_ELT(ans, i, mkCharCE(sp, CE_UTF8));
928 }
929 }
930 UNPROTECT(1);
931 return(ans);
932 }
933 #else
do_dirname(SEXP call,SEXP op,SEXP args,SEXP rho)934 SEXP attribute_hidden do_dirname(SEXP call, SEXP op, SEXP args, SEXP rho)
935 {
936 SEXP ans, s = R_NilValue; /* -Wall */
937 char buf[PATH_MAX], *p, fsp = FILESEP[0];
938 const char *pp;
939 int i, n;
940
941 checkArity(op, args);
942 if (TYPEOF(s = CAR(args)) != STRSXP)
943 error(_("a character vector argument expected"));
944 PROTECT(ans = allocVector(STRSXP, n = LENGTH(s)));
945 for(i = 0; i < n; i++) {
946 if (STRING_ELT(s, i) == NA_STRING)
947 SET_STRING_ELT(ans, i, NA_STRING);
948 else {
949 pp = R_ExpandFileName(translateCharFP(STRING_ELT(s, i)));
950 if (strlen(pp) > PATH_MAX - 1)
951 error(_("path too long"));
952 size_t ll = strlen(pp);
953 if (ll) { // svMisc calls this with ""
954 strcpy (buf, pp);
955 /* remove trailing file separator(s) */
956 while ( *(p = buf + ll - 1) == fsp && p > buf) *p = '\0';
957 p = Rf_strrchr(buf, fsp);
958 if(p == NULL)
959 strcpy(buf, ".");
960 else {
961 while(p > buf && *p == fsp) --p;
962 p[1] = '\0';
963 }
964 } else buf[0] = '\0';
965 SET_STRING_ELT(ans, i, mkChar(buf));
966 }
967 }
968 UNPROTECT(1);
969 return(ans);
970 }
971 #endif
972
973
974 #ifndef Win32 /* Windows version is in src/gnuwin32/extra.c */
975 #ifndef HAVE_DECL_REALPATH
976 extern char *realpath(const char *path, char *resolved_path);
977 #endif
978
do_normalizepath(SEXP call,SEXP op,SEXP args,SEXP rho)979 SEXP attribute_hidden do_normalizepath(SEXP call, SEXP op, SEXP args, SEXP rho)
980 {
981 SEXP ans, paths = CAR(args), elp;
982 int i, n = LENGTH(paths);
983 const char *path;
984 char abspath[PATH_MAX+1];
985
986 checkArity(op, args);
987 if (!isString(paths))
988 error(_("'path' must be a character vector"));
989
990 int mustWork = asLogical(CADDR(args)); /* 1, NA_LOGICAL or 0 */
991
992 /* Does any platform not have this? */
993 #ifdef HAVE_REALPATH
994 PROTECT(ans = allocVector(STRSXP, n));
995 for (i = 0; i < n; i++) {
996 elp = STRING_ELT(paths, i);
997 if (elp == NA_STRING) {
998 SET_STRING_ELT(ans, i, NA_STRING);
999 if (mustWork == 1)
1000 error("path[%d]=NA", i+1);
1001 else if (mustWork == NA_LOGICAL)
1002 warning("path[%d]=NA", i+1);
1003 continue;
1004 }
1005 path = translateCharFP2(elp);
1006 if (path) {
1007 char *res = realpath(path, abspath);
1008 if (res)
1009 SET_STRING_ELT(ans, i, mkChar(abspath));
1010 else {
1011 SET_STRING_ELT(ans, i, elp);
1012 /* and report the problem */
1013 if (mustWork == 1)
1014 error("path[%d]=\"%s\": %s", i+1, path, strerror(errno));
1015 else if (mustWork == NA_LOGICAL)
1016 warning("path[%d]=\"%s\": %s", i+1, path, strerror(errno));
1017 }
1018 }
1019 else if (mustWork == 1) error("fatal translation error");
1020 else SET_STRING_ELT(ans, i, elp);
1021 }
1022 #else
1023 Rboolean OK;
1024 warning("this platform does not have realpath so the results may not be canonical");
1025 PROTECT(ans = allocVector(STRSXP, n));
1026 for (i = 0; i < n; i++) {
1027 elp = STRING_ELT(paths, i);
1028 if (elp == NA_STRING) {
1029 SET_STRING_ELT(ans, i, NA_STRING);
1030 if (mustWork == 1)
1031 error("path[%d]=NA", i+1);
1032 else if (mustWork == NA_LOGICAL)
1033 warning("path[%d]=NA", i+1);
1034 continue;
1035 }
1036 path = translateCharFP(elp);
1037 OK = strlen(path) <= PATH_MAX;
1038 if (OK) {
1039 if (path[0] == '/') strncpy(abspath, path, PATH_MAX);
1040 else {
1041 OK = getcwd(abspath, PATH_MAX) != NULL;
1042 OK = OK && (strlen(path) + strlen(abspath) + 1 <= PATH_MAX);
1043 if (OK) {strcat(abspath, "/"); strcat(abspath, path);}
1044 }
1045 }
1046 /* we need to check that this exists */
1047 if (OK) OK = (access(abspath, 0 /* F_OK */) == 0);
1048 if (OK) SET_STRING_ELT(ans, i, mkChar(abspath));
1049 else {
1050 SET_STRING_ELT(ans, i, elp);
1051 /* and report the problem */
1052 if (mustWork == 1)
1053 error("path[%d]=\"%s\": %s", i+1, path, strerror(errno));
1054 else if (mustWork == NA_LOGICAL)
1055 warning("path[%d]=\"%s\": %s", i+1, path, strerror(errno));
1056 }
1057 }
1058 #endif
1059 UNPROTECT(1);
1060 return ans;
1061 }
1062
1063 #ifdef USE_INTERNAL_MKTIME
getTZinfo(void)1064 const char *getTZinfo(void)
1065 {
1066 static char def_tz[PATH_MAX+1] = "";
1067 if (def_tz[0]) return def_tz;
1068
1069 // call Sys.timezone()
1070 SEXP expr = PROTECT(install("Sys.timezone"));
1071 SEXP call = PROTECT(lang1(expr));
1072 SEXP ans = PROTECT(eval(call, R_GlobalEnv));
1073 if(TYPEOF(ans) == STRSXP && LENGTH(ans) == 1) {
1074 SEXP el = STRING_ELT(ans, 0);
1075 if (el != NA_STRING) {
1076 strcpy(def_tz, CHAR(el));
1077 // printf("tz is %s\n", CHAR(el));
1078 UNPROTECT(3);
1079 return def_tz;
1080 }
1081 }
1082 UNPROTECT(3);
1083 warning("system timezone name is unknown: set environment variable TZ");
1084 strcpy(def_tz, "unknown"); // code will then use TZDEFAULT, which is "UTC"
1085 return def_tz;
1086 }
1087 #endif
1088
1089 #endif // not Win32
1090
1091
1092 #ifdef Win32
encode_cleanup(void * data)1093 static void encode_cleanup(void *data)
1094 {
1095 WinUTF8out = TRUE;
1096 }
1097 #endif
1098
1099 /* encodeString(x, w, quote, justify) */
do_encodeString(SEXP call,SEXP op,SEXP args,SEXP rho)1100 SEXP attribute_hidden do_encodeString(SEXP call, SEXP op, SEXP args, SEXP rho)
1101 {
1102 SEXP ans, x, s;
1103 R_xlen_t i, len;
1104 int w, quote = 0, justify, na;
1105 const char *cs;
1106 Rboolean findWidth;
1107
1108 checkArity(op, args);
1109 if (TYPEOF(x = CAR(args)) != STRSXP)
1110 error(_("a character vector argument expected"));
1111 if(isNull(CADR(args))) w = NA_INTEGER;
1112 else {
1113 w = asInteger(CADR(args));
1114 if(w != NA_INTEGER && w < 0)
1115 error(_("invalid '%s' value"), "width");
1116 }
1117 findWidth = (w == NA_INTEGER);
1118 s = CADDR(args);
1119 if(LENGTH(s) != 1 || TYPEOF(s) != STRSXP)
1120 error(_("invalid '%s' value"), "quote");
1121 cs = translateChar(STRING_ELT(s, 0));
1122 if(strlen(cs) > 0) quote = cs[0];
1123 if(strlen(cs) > 1)
1124 warning(_("only the first character of 'quote' will be used"));
1125 justify = asInteger(CADDDR(args));
1126 if(justify == NA_INTEGER || justify < 0 || justify > 3)
1127 error(_("invalid '%s' value"), "justify");
1128 if(justify == 3) w = 0;
1129 na = asLogical(CAD4R(args));
1130 if(na == NA_LOGICAL) error(_("invalid '%s' value"), "na.encode");
1131
1132 len = XLENGTH(x);
1133 if(findWidth && justify < 3) {
1134 w = 0;
1135 for(i = 0; i < len; i++) {
1136 s = STRING_ELT(x, i);
1137 if(na || s != NA_STRING)
1138 w = R_imax2(w, Rstrlen(s, quote));
1139 }
1140 if(quote) w +=2; /* for surrounding quotes */
1141 }
1142 PROTECT(ans = duplicate(x));
1143 #ifdef Win32
1144 RCNTXT cntxt;
1145 Rboolean havecontext = FALSE;
1146 /* do_encodeString is not printing, but returning a string, it therefore
1147 must not produce Rgui escapes (do_encodeString may get called as part
1148 of print dispatch with WinUTF8out being already set to TRUE). */
1149 if (WinUTF8out) {
1150 begincontext(&cntxt, CTXT_CCODE, R_NilValue, R_BaseEnv, R_BaseEnv,
1151 R_NilValue, R_NilValue);
1152 cntxt.cend = &encode_cleanup;
1153 havecontext = TRUE;
1154 WinUTF8out = FALSE;
1155 }
1156 #endif
1157 for(i = 0; i < len; i++) {
1158 s = STRING_ELT(x, i);
1159 if(na || s != NA_STRING) {
1160 cetype_t ienc = getCharCE(s);
1161 if(ienc == CE_UTF8) {
1162 const char *ss = EncodeString(s, w-1000000, quote,
1163 (Rprt_adj) justify);
1164 SET_STRING_ELT(ans, i, mkCharCE(ss, ienc));
1165 } else {
1166 const char *ss = EncodeString(s, w, quote, (Rprt_adj) justify);
1167 SET_STRING_ELT(ans, i, mkChar(ss));
1168 }
1169 }
1170 }
1171 #ifdef Win32
1172 if (havecontext) {
1173 encode_cleanup(NULL);
1174 endcontext(&cntxt);
1175 }
1176 #endif
1177 UNPROTECT(1);
1178 return ans;
1179 }
1180
do_encoding(SEXP call,SEXP op,SEXP args,SEXP rho)1181 SEXP attribute_hidden do_encoding(SEXP call, SEXP op, SEXP args, SEXP rho)
1182 {
1183 SEXP ans, x;
1184 R_xlen_t i, n;
1185 char *tmp;
1186
1187 checkArity(op, args);
1188 if (TYPEOF(x = CAR(args)) != STRSXP)
1189 error(_("a character vector argument expected"));
1190 n = XLENGTH(x);
1191 PROTECT(ans = allocVector(STRSXP, n));
1192 for (i = 0; i < n; i++) {
1193 if(IS_BYTES(STRING_ELT(x, i))) tmp = "bytes";
1194 else if(IS_LATIN1(STRING_ELT(x, i))) tmp = "latin1";
1195 else if(IS_UTF8(STRING_ELT(x, i))) tmp = "UTF-8";
1196 else tmp = "unknown";
1197 SET_STRING_ELT(ans, i, mkChar(tmp));
1198 }
1199 UNPROTECT(1);
1200 return ans;
1201 }
1202
1203 #define IS_NATIVE(tmp) \
1204 (! IS_LATIN1(tmp) && ! IS_UTF8(tmp) && ! IS_BYTES(tmp))
1205
do_setencoding(SEXP call,SEXP op,SEXP args,SEXP rho)1206 SEXP attribute_hidden do_setencoding(SEXP call, SEXP op, SEXP args, SEXP rho)
1207 {
1208 SEXP x, enc, tmp;
1209 int m;
1210 R_xlen_t i, n;
1211 const char *this;
1212
1213 checkArity(op, args);
1214 if (TYPEOF(x = CAR(args)) != STRSXP)
1215 error(_("a character vector argument expected"));
1216 if (TYPEOF(enc = CADR(args)) != STRSXP)
1217 error(_("a character vector 'value' expected"));
1218 m = LENGTH(enc);
1219 if(m == 0)
1220 error(_("'value' must be of positive length"));
1221 if(MAYBE_REFERENCED(x)) x = duplicate(x);
1222 PROTECT(x);
1223 n = XLENGTH(x);
1224 for(i = 0; i < n; i++) {
1225 cetype_t ienc = CE_NATIVE;
1226 this = CHAR(STRING_ELT(enc, i % m)); /* ASCII */
1227 if(streql(this, "latin1")) ienc = CE_LATIN1;
1228 else if(streql(this, "UTF-8")) ienc = CE_UTF8;
1229 else if(streql(this, "bytes")) ienc = CE_BYTES;
1230 tmp = STRING_ELT(x, i);
1231 if(tmp == NA_STRING) continue;
1232 if (! ((ienc == CE_LATIN1 && IS_LATIN1(tmp)) ||
1233 (ienc == CE_UTF8 && IS_UTF8(tmp)) ||
1234 (ienc == CE_BYTES && IS_BYTES(tmp)) ||
1235 (ienc == CE_NATIVE && IS_NATIVE(tmp))))
1236 SET_STRING_ELT(x, i, mkCharLenCE(CHAR(tmp), LENGTH(tmp), ienc));
1237 }
1238 UNPROTECT(1);
1239 return x;
1240 }
1241
markKnown(const char * s,SEXP ref)1242 SEXP attribute_hidden markKnown(const char *s, SEXP ref)
1243 {
1244 int ienc = 0;
1245 if(ENC_KNOWN(ref)) {
1246 if(known_to_be_latin1) ienc = CE_LATIN1;
1247 if(known_to_be_utf8) ienc = CE_UTF8;
1248 }
1249 return mkCharCE(s, ienc);
1250 }
1251
strIsASCII(const char * str)1252 Rboolean strIsASCII(const char *str)
1253 {
1254 const char *p;
1255 for(p = str; *p; p++)
1256 if((unsigned int)*p > 0x7F) return FALSE;
1257 return TRUE;
1258 }
1259
1260 /* Number of additional bytes */
1261 static const unsigned char utf8_table4[] = {
1262 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1263 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1264 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
1265 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
1266
utf8clen(char c)1267 int attribute_hidden utf8clen(char c)
1268 {
1269 /* This allows through 8-bit chars 10xxxxxx, which are invalid */
1270 if ((c & 0xc0) != 0xc0) return 1;
1271 return 1 + utf8_table4[c & 0x3f];
1272 }
1273
1274 /* These are misnamed: they convert a single char */
1275 static R_wchar_t
utf16toucs(wchar_t high,wchar_t low)1276 utf16toucs(wchar_t high, wchar_t low)
1277 {
1278 return 0x10000 + ((int) (high & 0x3FF) << 10 ) + (int) (low & 0x3FF);
1279 }
1280
1281 /* Return the low UTF-16 surrogate from a UTF-8 string; assumes all testing has been done. */
1282 static wchar_t
utf8toutf16low(const char * s)1283 utf8toutf16low(const char *s)
1284 {
1285 return (unsigned int) LOW_SURROGATE_START | ((s[2] & 0x0F) << 6) | (s[3] & 0x3F);
1286 }
1287
1288 R_wchar_t attribute_hidden
utf8toucs32(wchar_t high,const char * s)1289 utf8toucs32(wchar_t high, const char *s)
1290 {
1291 return utf16toucs(high, utf8toutf16low(s));
1292 }
1293
1294 /* These return the result in wchar_t. If wchar_t is 16 bit (e.g. UTF-16LE on Windows)
1295 only the high surrogate is returned; call utf8toutf16low next. */
1296 size_t attribute_hidden
utf8toucs(wchar_t * wc,const char * s)1297 utf8toucs(wchar_t *wc, const char *s)
1298 {
1299 unsigned int byte;
1300 wchar_t local, *w;
1301 byte = *((unsigned char *)s);
1302 w = wc ? wc: &local;
1303
1304 if (byte == 0) {
1305 *w = (wchar_t) 0;
1306 return 0;
1307 } else if (byte < 0xC0) {
1308 *w = (wchar_t) byte;
1309 return 1;
1310 } else if (byte < 0xE0) {
1311 if(strlen(s) < 2) return (size_t)-2;
1312 if ((s[1] & 0xC0) == 0x80) {
1313 *w = (wchar_t) (((byte & 0x1F) << 6) | (s[1] & 0x3F));
1314 return 2;
1315 } else return (size_t)-1;
1316 } else if (byte < 0xF0) {
1317 if(strlen(s) < 3) return (size_t)-2;
1318 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
1319 *w = (wchar_t) (((byte & 0x0F) << 12)
1320 | (unsigned int) ((s[1] & 0x3F) << 6)
1321 | (s[2] & 0x3F));
1322 byte = (unsigned int) *w;
1323 /* Surrogates range */
1324 if(byte >= 0xD800 && byte <= 0xDFFF) return (size_t)-1;
1325 if(byte == 0xFFFE || byte == 0xFFFF) return (size_t)-1;
1326 return 3;
1327 } else return (size_t)-1;
1328
1329 } else if (byte < 0xf8) {
1330 if(strlen(s) < 4) return (size_t)-2;
1331 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80) && ((s[3] & 0xC0) == 0x80)) {
1332 unsigned int cvalue = (((byte & 0x0F) << 18)
1333 | (unsigned int) ((s[1] & 0x3F) << 12)
1334 | (unsigned int) ((s[2] & 0x3F) << 6)
1335 | (s[3] & 0x3F));
1336 if(sizeof(wchar_t) < 4) /* Assume UTF-16 and return high surrogate. Users need to call utf8toutf16low next. */
1337 *w = (wchar_t) ((cvalue - 0x10000) >> 10) | 0xD800;
1338 else
1339 *w = (wchar_t) cvalue;
1340 return 4;
1341 } else return (size_t)-1;
1342 }
1343 if(sizeof(wchar_t) < 4) return (size_t)-2;
1344 /* So now handle 5.6 byte sequences with no testing */
1345 if (byte < 0xFC) {
1346 if(strlen(s) < 5) return (size_t)-2;
1347 *w = (wchar_t) (((byte & 0x0F) << 24)
1348 | (unsigned int) ((s[1] & 0x3F) << 12)
1349 | (unsigned int) ((s[2] & 0x3F) << 12)
1350 | (unsigned int) ((s[3] & 0x3F) << 6)
1351 | (s[4] & 0x3F));
1352 return 5;
1353 } else {
1354 if(strlen(s) < 6) return (size_t)-2;
1355 *w = (wchar_t) (((byte & 0x0F) << 30)
1356 | (unsigned int) ((s[1] & 0x3F) << 24)
1357 | (unsigned int) ((s[2] & 0x3F) << 18)
1358 | (unsigned int) ((s[3] & 0x3F) << 12)
1359 | (unsigned int) ((s[4] & 0x3F) << 6)
1360 | (s[5] & 0x3F));
1361 return 6;
1362 }
1363 }
1364
1365 /* despite its name this translates to UTF-16 if there are (invalid)
1366 * UTF-8 codings for surrogates in the input */
1367 size_t
utf8towcs(wchar_t * wc,const char * s,size_t n)1368 utf8towcs(wchar_t *wc, const char *s, size_t n)
1369 {
1370 ssize_t m, res = 0;
1371 const char *t;
1372 wchar_t *p;
1373 wchar_t local;
1374
1375 if(wc)
1376 for(p = wc, t = s; ; p++, t += m) {
1377 m = (ssize_t) utf8toucs(p, t);
1378 if (m < 0) error(_("invalid input '%s' in 'utf8towcs'"), s);
1379 if (m == 0) break;
1380 res ++;
1381 if (res >= n) break;
1382 if (IS_HIGH_SURROGATE(*p)) {
1383 *(++p) = utf8toutf16low(t);
1384 res ++;
1385 if (res >= n) break;
1386 }
1387 }
1388 else
1389 for(t = s; ; t += m) {
1390 m = (ssize_t) utf8toucs(&local, t);
1391 if (m < 0) error(_("invalid input '%s' in 'utf8towcs'"), s);
1392 if (m == 0) break;
1393 res ++;
1394 if (IS_HIGH_SURROGATE(local))
1395 res ++;
1396 }
1397 return (size_t) res;
1398 }
1399
1400 size_t
utf8towcs4(R_wchar_t * wc,const char * s,size_t n)1401 utf8towcs4(R_wchar_t *wc, const char *s, size_t n)
1402 {
1403 ssize_t m, res = 0;
1404 const char *t;
1405 R_wchar_t *p;
1406
1407 if(wc)
1408 for(p = wc, t = s; ; p++, t += m) {
1409 // FIXME this gives a warning on Windows.
1410 m = (ssize_t) utf8toucs(p, t);
1411 if (m < 0) error(_("invalid input '%s' in 'utf8towcs32'"), s);
1412 if (m == 0) break;
1413 if (IS_HIGH_SURROGATE(*p)) *p = utf8toucs32(*p, s);
1414 res ++;
1415 if (res >= n) break;
1416 }
1417 else
1418 for(t = s; ; t += m) {
1419 wchar_t local;
1420 m = (ssize_t) utf8toucs(&local, t);
1421 if (m < 0) error(_("invalid input '%s' in 'utf8towcs32'"), s);
1422 if (m == 0) break;
1423 res ++;
1424 }
1425 return (size_t) res;
1426 }
1427
1428 /* based on pcre.c */
1429 static const unsigned int utf8_table1[] =
1430 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
1431 static const unsigned int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
1432
1433 /* s is NULL, or it contains at least n bytes. Just write a
1434 terminator if it's not big enough.
1435
1436 Strangely named: converts from UCS-4 to UTF-8.
1437 */
1438
Rwcrtomb32(char * s,R_wchar_t cvalue,size_t n)1439 static size_t Rwcrtomb32(char *s, R_wchar_t cvalue, size_t n)
1440 {
1441 register size_t i, j;
1442 if (!n) return 0;
1443 if (s) *s = 0; /* Simplifies exit later */
1444 if(cvalue == 0) return 0;
1445 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
1446 if (cvalue <= utf8_table1[i]) break;
1447 if (i >= n - 1) return 0; /* need space for terminal null */
1448 if (s) {
1449 s += i;
1450 for (j = i; j > 0; j--) {
1451 *s-- = (char) (0x80 | (cvalue & 0x3f));
1452 cvalue >>= 6;
1453 }
1454 *s = (char) (utf8_table2[i] | cvalue);
1455 }
1456 return i + 1;
1457 }
1458
1459 /* On input, wc is a wide string encoded in UTF-16 or UCS-2 or UCS-4.
1460
1461 s can be a buffer of size n >= 0 chars, or NULL. If n = 0 or s =
1462 NULL, nothing is written.
1463
1464 The return value is the number of chars including the terminating
1465 null. If the buffer is not big enough, the result is truncated but
1466 still null-terminated
1467 */
1468 attribute_hidden // but used in windlgs
wcstoutf8(char * s,const wchar_t * wc,size_t n)1469 size_t wcstoutf8(char *s, const wchar_t *wc, size_t n)
1470 {
1471 size_t m, res = 0;
1472 char *t;
1473 const wchar_t *p;
1474 if (!n) return 0;
1475 for(p = wc, t = s; ; p++) {
1476 if (IS_SURROGATE_PAIR(*p, *(p+1))) {
1477 R_wchar_t cvalue = ((*p & 0x3FF) << 10) + (*(p+1) & 0x3FF) + 0x010000;
1478 m = Rwcrtomb32(t, cvalue, n - res);
1479 p++;
1480 } else {
1481 if (IS_HIGH_SURROGATE(*p) || IS_LOW_SURROGATE(*p))
1482 warning("unpaired surrogate Unicode point %x", *p);
1483 m = Rwcrtomb32(t, (R_wchar_t)(*p), n - res);
1484 }
1485 if (!m) break;
1486 res += m;
1487 if (t)
1488 t += m;
1489 }
1490 return res + 1;
1491 }
1492
1493 /* convert from R_wchar_t * (UCS-4) */
1494 attribute_hidden
wcs4toutf8(char * s,const R_wchar_t * wc,size_t n)1495 size_t wcs4toutf8(char *s, const R_wchar_t *wc, size_t n)
1496 {
1497 size_t m, res=0;
1498 char *t;
1499 const R_wchar_t *p;
1500 if (!n) return 0;
1501 for(p = wc, t = s; ; p++) {
1502 m = Rwcrtomb32(t, (*p), n - res);
1503 if (!m) break;
1504 res += m;
1505 if (t)
1506 t += m;
1507 }
1508 return res + 1;
1509 }
1510
1511 /* A version that reports failure as an error */
Mbrtowc(wchar_t * wc,const char * s,size_t n,mbstate_t * ps)1512 size_t Mbrtowc(wchar_t *wc, const char *s, size_t n, mbstate_t *ps)
1513 {
1514 size_t used;
1515
1516 if(n <= 0 || !*s) return (size_t)0;
1517 used = mbrtowc(wc, s, n, ps);
1518 if((int) used < 0) {
1519 /* This gets called from the menu setup in RGui */
1520 if (!R_Is_Running) return (size_t)-1;
1521 /* let's try to print out a readable version */
1522 R_CheckStack2(4*strlen(s) + 10);
1523 char err[4*strlen(s) + 1], *q;
1524 const char *p;
1525 for(p = s, q = err; *p; ) {
1526 /* don't do the first to keep ps state straight */
1527 if(p > s) used = mbrtowc(NULL, p, n, ps);
1528 if(used == 0) break;
1529 else if((int) used > 0) {
1530 memcpy(q, p, used);
1531 p += used;
1532 q += used;
1533 n -= used;
1534 } else {
1535 sprintf(q, "<%02x>", (unsigned char) *p++);
1536 q += 4;
1537 n--;
1538 }
1539 }
1540 *q = '\0';
1541 error(_("invalid multibyte string at '%s'"), err);
1542 }
1543 return used;
1544 }
1545
1546 /* Truncate a string in place (in native encoding) so that it only contains
1547 valid multi-byte characters. Has no effect in non-mbcs locales.
1548
1549 This function may be invoked by the error handler via
1550 REvprintf->Rvsnprintf_mbcs. Do not change it unless you are SURE that
1551 your changes are compatible with the error handling mechanism.
1552
1553 REvprintf is also used in R_Suicide on Unix.
1554 */
1555 attribute_hidden
mbcsTruncateToValid(char * s)1556 char* mbcsTruncateToValid(char *s)
1557 {
1558 if (!mbcslocale || *s == '\0')
1559 return s;
1560
1561 mbstate_t mb_st;
1562 size_t slen = strlen(s); /* at least 1 */
1563 size_t goodlen = 0;
1564
1565 mbs_init(&mb_st);
1566
1567 if (utf8locale) {
1568 /* UTF-8 is self-synchronizing so we can look back from the end
1569 for the first non-continuation byte */
1570 goodlen = slen - 1; /* at least 0 */
1571 /* for char == signed char we assume 2's complement representation */
1572 while (goodlen && ((s[goodlen] & '\xC0') == '\x80'))
1573 --goodlen;
1574 }
1575 while(goodlen < slen) {
1576 size_t res;
1577 res = mbrtowc(NULL, s + goodlen, slen - goodlen, &mb_st);
1578 if (res == (size_t) -1 || res == (size_t) -2) {
1579 /* strip off all remaining characters */
1580 for(;goodlen < slen; goodlen++)
1581 s[goodlen] = '\0';
1582 return s;
1583 }
1584 goodlen += res;
1585 }
1586 return s;
1587 }
1588
1589 attribute_hidden
mbcsValid(const char * str)1590 Rboolean mbcsValid(const char *str)
1591 {
1592 return ((int)mbstowcs(NULL, str, 0) >= 0);
1593 }
1594
1595
1596 /* used in src/library/grDevices/src/cairo/cairoFns.c */
1597 #include "valid_utf8.h"
utf8Valid(const char * str)1598 Rboolean utf8Valid(const char *str)
1599 {
1600 return valid_utf8(str, strlen(str)) == 0;
1601 }
1602
do_validUTF8(SEXP call,SEXP op,SEXP args,SEXP rho)1603 SEXP attribute_hidden do_validUTF8(SEXP call, SEXP op, SEXP args, SEXP rho)
1604 {
1605 checkArity(op, args);
1606 SEXP x = CAR(args);
1607 if (!isString(x))
1608 error(_("invalid '%s' argument"), "x");
1609 R_xlen_t n = XLENGTH(x);
1610 SEXP ans = allocVector(LGLSXP, n); // no allocation below
1611 int *lans = LOGICAL(ans);
1612 for (R_xlen_t i = 0; i < n; i++)
1613 lans[i] = utf8Valid(CHAR(STRING_ELT(x, i)));
1614 return ans;
1615 }
1616
do_validEnc(SEXP call,SEXP op,SEXP args,SEXP rho)1617 SEXP attribute_hidden do_validEnc(SEXP call, SEXP op, SEXP args, SEXP rho)
1618 {
1619 checkArity(op, args);
1620 SEXP x = CAR(args);
1621 if (!isString(x))
1622 error(_("invalid '%s' argument"), "x");
1623 R_xlen_t n = XLENGTH(x);
1624 SEXP ans = allocVector(LGLSXP, n); // no allocation below
1625 int *lans = LOGICAL(ans);
1626 for (R_xlen_t i = 0; i < n; i++) {
1627 SEXP p = STRING_ELT(x, i);
1628 if (IS_BYTES(p) || IS_LATIN1(p)) lans[i] = 1;
1629 else if (IS_UTF8(p) || utf8locale) lans[i] = utf8Valid(CHAR(p));
1630 else if(mbcslocale) lans[i] = mbcsValid(CHAR(p));
1631 else lans[i] = 1;
1632 }
1633 return ans;
1634 }
1635
1636
1637 /* MBCS-aware versions of common comparisons. Only used for ASCII c */
Rf_strchr(const char * s,int c)1638 char *Rf_strchr(const char *s, int c)
1639 {
1640 char *p = (char *)s;
1641 mbstate_t mb_st;
1642 size_t used;
1643
1644 if(!mbcslocale || utf8locale) return strchr(s, c);
1645 mbs_init(&mb_st);
1646 while( (used = Mbrtowc(NULL, p, R_MB_CUR_MAX, &mb_st)) ) {
1647 if(*p == c) return p;
1648 p += used;
1649 }
1650 return (char *)NULL;
1651 }
1652
Rf_strrchr(const char * s,int c)1653 char *Rf_strrchr(const char *s, int c)
1654 {
1655 char *p = (char *)s, *plast = NULL;
1656 mbstate_t mb_st;
1657 size_t used;
1658
1659 if(!mbcslocale || utf8locale) return strrchr(s, c);
1660 mbs_init(&mb_st);
1661 while( (used = Mbrtowc(NULL, p, R_MB_CUR_MAX, &mb_st)) ) {
1662 if(*p == c) plast = p;
1663 p += used;
1664 }
1665 return plast;
1666 }
1667
1668 #ifdef Win32
R_fixslash(char * s)1669 void R_fixslash(char *s)
1670 {
1671 char *p = s;
1672
1673 if(mbcslocale) {
1674 mbstate_t mb_st; int used;
1675 mbs_init(&mb_st);
1676 while((used = Mbrtowc(NULL, p, R_MB_CUR_MAX, &mb_st))) {
1677 if(*p == '\\') *p = '/';
1678 p += used;
1679 }
1680 } else
1681 for (; *p; p++) if (*p == '\\') *p = '/';
1682 /* preserve network shares */
1683 if(s[0] == '/' && s[1] == '/') s[0] = s[1] = '\\';
1684 }
1685
R_UTF8fixslash(char * s)1686 void R_UTF8fixslash(char *s)
1687 {
1688 char *p = s;
1689
1690 for (; *p; p++) if (*p == '\\') *p = '/';
1691 /* preserve network shares */
1692 if(s[0] == '/' && s[1] == '/') s[0] = s[1] = '\\';
1693 }
1694
R_wfixslash(wchar_t * s)1695 static void R_wfixslash(wchar_t *s)
1696 {
1697 wchar_t *p = s;
1698
1699 for (; *p; p++) if (*p == L'\\') *p = L'/';
1700 /* preserve network shares */
1701 if(s[0] == L'/' && s[1] == L'/') s[0] = s[1] = L'\\';
1702 }
1703
1704
R_fixbackslash(char * s)1705 void R_fixbackslash(char *s)
1706 {
1707 char *p = s;
1708
1709 if(mbcslocale) {
1710 mbstate_t mb_st; int used;
1711 mbs_init(&mb_st);
1712 while((used = Mbrtowc(NULL, p, R_MB_CUR_MAX, &mb_st))) {
1713 if(*p == '/') *p = '\\';
1714 p += used;
1715 }
1716 } else
1717 for (; *p; p++) if (*p == '/') *p = '\\';
1718 }
1719 #endif
1720
1721 #if defined FC_LEN_T
F77_SYMBOL(rexitc)1722 void NORET F77_SYMBOL(rexitc)(char *msg, int *nchar, FC_LEN_T msg_len)
1723 #else
1724 void NORET F77_SYMBOL(rexitc)(char *msg, int *nchar)
1725 #endif
1726 {
1727 int nc = *nchar;
1728 char buf[256];
1729 if(nc > 255) {
1730 warning(_("error message truncated to 255 chars"));
1731 nc = 255;
1732 }
1733 strncpy(buf, msg, (size_t) nc);
1734 buf[nc] = '\0';
1735 mbcsTruncateToValid(buf);
1736 error("%s", buf);
1737 }
1738
1739 #if defined FC_LEN_T
F77_SYMBOL(rwarnc)1740 void F77_SYMBOL(rwarnc)(char *msg, int *nchar, FC_LEN_T msg_len)
1741 #else
1742 void F77_SYMBOL(rwarnc)(char *msg, int *nchar)
1743 #endif
1744 {
1745 int nc = *nchar;
1746 char buf[256];
1747 if(nc > 255) {
1748 warning(_("warning message truncated to 255 chars"));
1749 nc = 255;
1750 }
1751 strncpy(buf, msg, (size_t) nc);
1752 buf[nc] = '\0';
1753 mbcsTruncateToValid(buf);
1754 warning("%s", buf);
1755 }
1756
F77_SYMBOL(rchkusr)1757 void F77_SYMBOL(rchkusr)(void)
1758 {
1759 R_CheckUserInterrupt();
1760 }
1761
1762 /* Return a copy of a string using memory from R_alloc.
1763 NB: caller has to manage R_alloc stack. Used in platform.c
1764 */
acopy_string(const char * in)1765 char *acopy_string(const char *in)
1766 {
1767 char *out;
1768 size_t len = strlen(in);
1769 if (len > 0) {
1770 out = (char *) R_alloc(1 + len, sizeof(char));
1771 strcpy(out, in);
1772 } else
1773 out = "";
1774 return out;
1775 }
1776
1777
1778
1779
1780 /* Table from
1781 http://unicode.org/Public/MAPPINGS/VENDORS/ADOBE/symbol.txt
1782 */
1783
1784 /* Conversion table that DOES use Private Usage Area
1785 * (should work better with specialised "symbol" fonts)
1786 */
1787 static int s2u[224] = {
1788 0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D,
1789 0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,
1790 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
1791 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
1792 0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,
1793 0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,
1794 0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,
1795 0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,
1796 0xF8E5, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,
1797 0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,
1798 0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,
1799 0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x0020,
1800 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1801 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1802 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1803 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1804 0x20AC, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663,
1805 0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
1806 0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x2022,
1807 0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0xF8E6, 0xF8E7, 0x21B5,
1808 0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
1809 0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
1810 0x2220, 0x2207, 0xF6DA, 0xF6D9, 0xF6DB, 0x220F, 0x221A, 0x22C5,
1811 0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,
1812 0x25CA, 0x2329, 0xF8E8, 0xF8E9, 0xF8EA, 0x2211, 0xF8EB, 0xF8EC,
1813 0xF8ED, 0xF8EE, 0xF8EF, 0xF8F0, 0xF8F1, 0xF8F2, 0xF8F3, 0xF8F4,
1814 0x0020, 0x232A, 0x222B, 0x2320, 0xF8F5, 0x2321, 0xF8F6, 0xF8F7,
1815 0xF8F8, 0xF8F9, 0xF8FA, 0xF8FB, 0xF8FC, 0xF8FD, 0xF8FE, 0x0020
1816 };
1817
1818 /* Conversion table that does NOT use Private Usage Area (0xF8*)
1819 * (should work better with fonts that have good Unicode coverage)
1820 *
1821 * NOTE that ...
1822 * 23D0 VERTICAL LINE EXTENTION is used for VERTICAL ARROW EXTENDER
1823 * 23AF HORIZONTAL LINE EXTENSION is used for HORIZONTAL ARROW EXTENDER
1824 * ... neither of which may be very good AND ...
1825 * 23AF HORIZONTAL LINE EXTENSION is also used for RADICAL EXTENDER
1826 * ... and that is unlikely to be right for BOTH this use AND
1827 * HORIZONTAL ARROW EXTENDER (if either)
1828 */
1829 static int s2unicode[224] = {
1830 0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D,
1831 0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,
1832 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
1833 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
1834 0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,
1835 0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,
1836 0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,
1837 0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,
1838 0x23AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,
1839 0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,
1840 0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,
1841 0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x0020,
1842 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1843 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1844 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1845 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1846 0x20AC, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663,
1847 0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
1848 0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x2022,
1849 0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x23D0, 0x23AF, 0x21B5,
1850 0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
1851 0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
1852 0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5,
1853 0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,
1854 0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x239B, 0x239C,
1855 0x239D, 0x23A1, 0x23A2, 0x23A3, 0x23A7, 0x23A8, 0x23A9, 0x23AA,
1856 0x0020, 0x232A, 0x222B, 0x2320, 0x23AE, 0x2321, 0x239E, 0x239F,
1857 0x23A0, 0x23A4, 0x23A5, 0x23A6, 0x23AB, 0x23AC, 0x23AD, 0x0020
1858 };
1859
Rf_AdobeSymbol2utf8(char * work,const char * c0,size_t nwork,Rboolean usePUA)1860 void *Rf_AdobeSymbol2utf8(char *work, const char *c0, size_t nwork,
1861 Rboolean usePUA)
1862 {
1863 const unsigned char *c = (unsigned char *) c0;
1864 unsigned char *t = (unsigned char *) work;
1865 while (*c) {
1866 if (*c < 32) *t++ = ' ';
1867 else {
1868 unsigned int u;
1869 if (usePUA) {
1870 u = (unsigned int) s2u[*c - 32];
1871 } else {
1872 u = (unsigned int) s2unicode[*c - 32];
1873 }
1874 if (u < 128) *t++ = (unsigned char) u;
1875 else if (u < 0x800) {
1876 *t++ = (unsigned char) (0xc0 | (u >> 6));
1877 *t++ = (unsigned char) (0x80 | (u & 0x3f));
1878 } else {
1879 *t++ = (unsigned char) (0xe0 | (u >> 12));
1880 *t++ = (unsigned char) (0x80 | ((u >> 6) & 0x3f));
1881 *t++ = (unsigned char) (0x80 | (u & 0x3f));
1882 }
1883 }
1884 if (t+6 > (unsigned char *)(work + nwork)) break;
1885 c++;
1886 }
1887 *t = '\0';
1888 return (char*) work;
1889 }
1890
1891 /* Convert UTF8 symbol back to single-byte symbol
1892 * ASSUME fontface == 5 and 'str' is UTF8, i.e., we are dealing with
1893 * a UTF8 string that has been through Rf_AdobeSymbol2utf8(usePUA=TRUE)
1894 * (or through Rf_AdobeSymbol2ucs2() then Rf_ucstoutf8())
1895 * i.e., we are dealing with CE_UTF8 string that has come from CE_SYMBOL string.
1896 */
Rf_utf8toAdobeSymbol(char * out,const char * in)1897 int Rf_utf8toAdobeSymbol(char *out, const char *in) {
1898 int i, j, k, used, tmp, nc = 0, found;
1899 int *symbolint;
1900 const char *s = in;
1901 const char *p = in;
1902 for ( ; *p; p += utf8clen(*p)) nc++;
1903 symbolint = (int *) R_alloc(nc, sizeof(int));
1904 for (i = 0, j = 0; i < nc; i++, j++) {
1905 /* Convert UTF8 to int */
1906 used = mbrtoint(&tmp, s);
1907 if (used < 0)
1908 error(_("invalid UTF-8 string"));
1909 symbolint[j] = tmp;
1910 found = 0;
1911 /* Convert int to CE_SYMBOL char */
1912 for (k = 0; k < 224; k++) {
1913 if (symbolint[j] == s2u[k]) {
1914 out[j] = (char)(k + 32);
1915 found = 1;
1916 }
1917 if (found) break;
1918 }
1919 if (!found)
1920 error(_("Conversion failed"));
1921 s += used;
1922 }
1923 out[nc] = '\0';
1924 return nc;
1925 }
1926
Rf_utf8Toutf8NoPUA(const char * in)1927 const char* Rf_utf8Toutf8NoPUA(const char *in)
1928 {
1929 int i, j, used, tmp;
1930 /* At least enough because assumes each incoming char only one byte */
1931 int nChar = 3*(int)strlen(in) + 1;
1932 char *result = R_alloc(nChar, sizeof(char));
1933 const char *s = in;
1934 char *p = result;
1935 for (i = 0; i < nChar; i++) {
1936 /* Convert UTF8 char to int */
1937 used = mbrtoint(&tmp, s);
1938 /* Only re-encode if necessary
1939 * This is more efficient AND protects against input that is
1940 * NOT from Rf_AdobeSymbol2utf8(), e.g., plotmath on Windows
1941 * (which is from reEnc(CE_LATIN1, CE_UTF8))
1942 */
1943 if (tmp > 0xF600) {
1944 char inChar[4], symbolChar[2], utf8Char[4];
1945 char *q;
1946 for (j = 0; j < used; j++) {
1947 inChar[j] = *s++;
1948 }
1949 inChar[used] = '\0';
1950 Rf_utf8toAdobeSymbol(symbolChar, inChar);
1951 Rf_AdobeSymbol2utf8(utf8Char, symbolChar, 4, FALSE);
1952 q = utf8Char;
1953 while (*q) {
1954 *p++ = *q++;
1955 }
1956 } else {
1957 for (j = 0; j < used; j++) {
1958 *p++ = *s++;
1959 }
1960 }
1961 }
1962 *p = '\0';
1963 return result;
1964 }
1965
Rf_utf8ToLatin1AdobeSymbol2utf8(const char * in,Rboolean usePUA)1966 const char* Rf_utf8ToLatin1AdobeSymbol2utf8(const char *in, Rboolean usePUA)
1967 {
1968 const char *latinStr;
1969 char *utf8str;
1970 latinStr = reEnc(in, CE_UTF8, CE_LATIN1, 2);
1971 int nc = 3*(int)strlen(latinStr) + 1;
1972 utf8str = R_alloc(nc, sizeof(char));
1973 Rf_AdobeSymbol2utf8(utf8str, latinStr, nc, usePUA);
1974 return utf8str;
1975 }
1976
Rf_AdobeSymbol2ucs2(int n)1977 int attribute_hidden Rf_AdobeSymbol2ucs2(int n)
1978 {
1979 if(n >= 32 && n < 256) return s2u[n-32];
1980 else return 0;
1981 }
1982
R_strtod5(const char * str,char ** endptr,char dec,Rboolean NA,int exact)1983 double R_strtod5(const char *str, char **endptr, char dec,
1984 Rboolean NA, int exact)
1985 {
1986 LDOUBLE ans = 0.0;
1987 int sign = 1;
1988 const char *p = str;
1989
1990 /* optional whitespace */
1991 while (isspace(*p)) p++;
1992
1993 if (NA && strncmp(p, "NA", 2) == 0) {
1994 ans = NA_REAL;
1995 p += 2;
1996 goto done;
1997 }
1998
1999 /* optional sign */
2000 switch (*p) {
2001 case '-': sign = -1;
2002 case '+': p++;
2003 default: ;
2004 }
2005
2006 if (strncasecmp(p, "NaN", 3) == 0) {
2007 ans = R_NaN;
2008 p += 3;
2009 goto done;
2010 /* C99 specifies this: must come first to avoid 'inf' match */
2011 } else if (strncasecmp(p, "infinity", 8) == 0) {
2012 ans = R_PosInf;
2013 p += 8;
2014 goto done;
2015 } else if (strncasecmp(p, "Inf", 3) == 0) {
2016 ans = R_PosInf;
2017 p += 3;
2018 goto done;
2019 }
2020
2021 int n, expn = 0;
2022 if(strlen(p) > 2 && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) { // Hexadecimal "0x....."
2023 int exph = -1;
2024
2025 /* This will overflow to Inf if appropriate */
2026 for(p += 2; p; p++) {
2027 if('0' <= *p && *p <= '9') ans = 16*ans + (*p -'0');
2028 else if('a' <= *p && *p <= 'f') ans = 16*ans + (*p -'a' + 10);
2029 else if('A' <= *p && *p <= 'F') ans = 16*ans + (*p -'A' + 10);
2030 else if(*p == dec) {exph = 0; continue;}
2031 else break;
2032 if (exph >= 0) exph += 4;
2033 }
2034 #define strtod_EXACT_CLAUSE \
2035 if(exact && ans > 0x1.fffffffffffffp52) { \
2036 if(exact == NA_LOGICAL) \
2037 warning(_( \
2038 "accuracy loss in conversion from \"%s\" to numeric"), \
2039 str); \
2040 else { \
2041 ans = NA_REAL; \
2042 p = str; /* back out */ \
2043 goto done; \
2044 } \
2045 }
2046 strtod_EXACT_CLAUSE;
2047 if (*p == 'p' || *p == 'P') {
2048 int expsign = 1;
2049 double p2 = 2.0;
2050 switch(*++p) {
2051 case '-': expsign = -1;
2052 case '+': p++;
2053 default: ;
2054 }
2055 /* The test for n is in response to PR#16358; it's not right if the exponent is
2056 very large, but the overflow or underflow below will handle it. */
2057 #define MAX_EXPONENT_PREFIX 9999
2058 for (n = 0; *p >= '0' && *p <= '9'; p++) n = (n < MAX_EXPONENT_PREFIX) ? n * 10 + (*p - '0') : n;
2059 if (ans != 0.0) { /* PR#15976: allow big exponents on 0 */
2060 LDOUBLE fac = 1.0;
2061 expn += expsign * n;
2062 if(exph > 0) {
2063 if (expn - exph < -122) { /* PR#17199: fac may overflow below if expn - exph is too small.
2064 2^-122 is a bit bigger than 1E-37, so should be fine on all systems */
2065 for (n = exph, fac = 1.0; n; n >>= 1, p2 *= p2)
2066 if (n & 1) fac *= p2;
2067 ans /= fac;
2068 p2 = 2.0;
2069 } else
2070 expn -= exph;
2071 }
2072 if (expn < 0) {
2073 for (n = -expn, fac = 1.0; n; n >>= 1, p2 *= p2)
2074 if (n & 1) fac *= p2;
2075 ans /= fac;
2076 } else {
2077 for (n = expn, fac = 1.0; n; n >>= 1, p2 *= p2)
2078 if (n & 1) fac *= p2;
2079 ans *= fac;
2080 }
2081 }
2082 }
2083 goto done;
2084 } // end {hexadecimal case}
2085
2086 int ndigits = 0;
2087 for ( ; *p >= '0' && *p <= '9'; p++, ndigits++) ans = 10*ans + (*p - '0');
2088 if (*p == dec)
2089 for (p++; *p >= '0' && *p <= '9'; p++, ndigits++, expn--)
2090 ans = 10*ans + (*p - '0');
2091 if (ndigits == 0) {
2092 ans = NA_REAL;
2093 p = str; /* back out */
2094 goto done;
2095 }
2096 strtod_EXACT_CLAUSE;
2097
2098 if (*p == 'e' || *p == 'E') {
2099 int expsign = 1;
2100 switch(*++p) {
2101 case '-': expsign = -1;
2102 case '+': p++;
2103 default: ;
2104 }
2105 for (n = 0; *p >= '0' && *p <= '9'; p++) n = (n < MAX_EXPONENT_PREFIX) ? n * 10 + (*p - '0') : n;
2106 expn += expsign * n;
2107 }
2108
2109 /* avoid unnecessary underflow for large negative exponents */
2110 if (expn + ndigits < -300) {
2111 for (n = 0; n < ndigits; n++) ans /= 10.0;
2112 expn += ndigits;
2113 }
2114 LDOUBLE p10 = 10., fac = 1.0;
2115 if (expn < -307) { /* use underflow, not overflow */
2116 for (n = -expn, fac = 1.0; n; n >>= 1, p10 *= p10)
2117 if (n & 1) fac /= p10;
2118 ans *= fac;
2119 } else if (expn < 0) { /* positive powers are exact */
2120 for (n = -expn, fac = 1.0; n; n >>= 1, p10 *= p10)
2121 if (n & 1) fac *= p10;
2122 ans /= fac;
2123 } else if (ans != 0.0) { /* PR#15976: allow big exponents on 0, e.g. 0E4933 */
2124 for (n = expn, fac = 1.0; n; n >>= 1, p10 *= p10)
2125 if (n & 1) fac *= p10;
2126 ans *= fac;
2127 }
2128
2129 /* explicit overflow to infinity */
2130 if (ans > DBL_MAX) {
2131 if (endptr) *endptr = (char *) p;
2132 return (sign > 0) ? R_PosInf : R_NegInf;
2133 }
2134
2135 done:
2136 if (endptr) *endptr = (char *) p;
2137 return sign * (double) ans;
2138 }
2139
2140
R_strtod4(const char * str,char ** endptr,char dec,Rboolean NA)2141 double R_strtod4(const char *str, char **endptr, char dec, Rboolean NA)
2142 {
2143 return R_strtod5(str, endptr, dec, NA, FALSE);
2144 }
2145
R_strtod(const char * str,char ** endptr)2146 double R_strtod(const char *str, char **endptr)
2147 {
2148 return R_strtod5(str, endptr, '.', FALSE, FALSE);
2149 }
2150
R_atof(const char * str)2151 double R_atof(const char *str)
2152 {
2153 return R_strtod5(str, NULL, '.', FALSE, FALSE);
2154 }
2155
2156 /* enc2native and enc2utf8, but they are the same in a UTF-8 locale */
2157 /* primitive */
do_enc2(SEXP call,SEXP op,SEXP args,SEXP env)2158 SEXP attribute_hidden do_enc2(SEXP call, SEXP op, SEXP args, SEXP env)
2159 {
2160 SEXP ans, el;
2161 R_xlen_t i;
2162 Rboolean duped = FALSE;
2163
2164 checkArity(op, args);
2165 check1arg(args, call, "x");
2166
2167 if (!isString(CAR(args)))
2168 errorcall(call, "argument is not a character vector");
2169 ans = CAR(args);
2170 for (i = 0; i < XLENGTH(ans); i++) {
2171 el = STRING_ELT(ans, i);
2172 if (el == NA_STRING) continue;
2173 if (PRIMVAL(op) || known_to_be_utf8) { /* enc2utf8 */
2174 if (IS_UTF8(el) || IS_ASCII(el) || IS_BYTES(el)) continue;
2175 if (!duped) { ans = PROTECT(duplicate(ans)); duped = TRUE; }
2176 SET_STRING_ELT(ans, i,
2177 mkCharCE(translateCharUTF8(el), CE_UTF8));
2178 } else if (ENC_KNOWN(el)) { /* enc2native */
2179 if (IS_ASCII(el) || IS_BYTES(el)) continue;
2180 if (known_to_be_latin1 && IS_LATIN1(el)) continue;
2181 if (!duped) { PROTECT(ans = duplicate(ans)); duped = TRUE; }
2182 if (known_to_be_latin1)
2183 SET_STRING_ELT(ans, i, mkCharCE(translateChar(el), CE_LATIN1));
2184 else
2185 SET_STRING_ELT(ans, i, mkChar(translateChar(el)));
2186 }
2187 }
2188 if(duped) UNPROTECT(1);
2189 return ans;
2190 }
2191
2192 #ifdef USE_ICU
2193 # include <locale.h>
2194 #ifdef USE_ICU_APPLE
2195 /* macOS is missing the headers */
2196 typedef int UErrorCode; /* really an enum these days */
2197 struct UCollator;
2198 typedef struct UCollator UCollator;
2199
2200 typedef enum {
2201 UCOL_EQUAL = 0,
2202 UCOL_GREATER = 1,
2203 UCOL_LESS = -1
2204 } UCollationResult ;
2205
2206 typedef enum {
2207 UCOL_DEFAULT = -1,
2208 UCOL_PRIMARY = 0,
2209 UCOL_SECONDARY = 1,
2210 UCOL_TERTIARY = 2,
2211 UCOL_DEFAULT_STRENGTH = UCOL_TERTIARY,
2212 UCOL_CE_STRENGTH_LIMIT,
2213 UCOL_QUATERNARY=3,
2214 UCOL_IDENTICAL=15,
2215 UCOL_STRENGTH_LIMIT,
2216 UCOL_OFF = 16,
2217 UCOL_ON = 17,
2218 UCOL_SHIFTED = 20,
2219 UCOL_NON_IGNORABLE = 21,
2220 UCOL_LOWER_FIRST = 24,
2221 UCOL_UPPER_FIRST = 25,
2222 UCOL_ATTRIBUTE_VALUE_COUNT
2223 } UColAttributeValue;
2224
2225 typedef UColAttributeValue UCollationStrength;
2226
2227 typedef enum {
2228 UCOL_FRENCH_COLLATION,
2229 UCOL_ALTERNATE_HANDLING,
2230 UCOL_CASE_FIRST,
2231 UCOL_CASE_LEVEL,
2232 UCOL_NORMALIZATION_MODE,
2233 UCOL_DECOMPOSITION_MODE = UCOL_NORMALIZATION_MODE,
2234 UCOL_STRENGTH,
2235 UCOL_HIRAGANA_QUATERNARY_MODE,
2236 UCOL_NUMERIC_COLLATION,
2237 UCOL_ATTRIBUTE_COUNT
2238 } UColAttribute;
2239
2240 /* UCharIterator struct has to be defined since we use its instances as
2241 local variables, but we don't actually use any of its members. */
2242 typedef struct UCharIterator {
2243 const void *context;
2244 int32_t length, start, index, limit, reservedField;
2245 void *fns[16]; /* we overshoot here (there is just 10 fns in ICU 3.6),
2246 but we have to make sure that enough stack space
2247 is allocated when used as a local var in future
2248 versions */
2249 } UCharIterator;
2250
2251 UCollator* ucol_open(const char *loc, UErrorCode *status);
2252 void ucol_close(UCollator *coll);
2253 void ucol_setAttribute(UCollator *coll, UColAttribute attr,
2254 UColAttributeValue value, UErrorCode *status);
2255 void ucol_setStrength(UCollator *coll, UCollationStrength strength);
2256 UCollationResult ucol_strcollIter(const UCollator *coll,
2257 UCharIterator *sIter,
2258 UCharIterator *tIter,
2259 UErrorCode *status);
2260 void uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length);
2261
2262 void uloc_setDefault(const char* localeID, UErrorCode* status);
2263
2264 typedef enum {
2265 ULOC_ACTUAL_LOCALE = 0,
2266 ULOC_VALID_LOCALE = 1,
2267 ULOC_DATA_LOCALE_TYPE_LIMIT = 3
2268 } ULocDataLocaleType ;
2269
2270
2271 const char* ucol_getLocaleByType(const UCollator *coll,
2272 ULocDataLocaleType type,
2273 UErrorCode *status);
2274
2275 #define U_ZERO_ERROR 0
2276 #define U_FAILURE(x) ((x)>U_ZERO_ERROR)
2277 #define ULOC_ACTUAL_LOCALE 0
2278
2279 #else
2280 #include <unicode/utypes.h>
2281 #include <unicode/ucol.h>
2282 #include <unicode/uloc.h>
2283 #include <unicode/uiter.h>
2284 #endif
2285
2286 static UCollator *collator = NULL;
2287 static int collationLocaleSet = 0;
2288
2289 /* called from platform.c */
resetICUcollator(Rboolean disable)2290 void attribute_hidden resetICUcollator(Rboolean disable)
2291 {
2292 if (collator) ucol_close(collator);
2293 collator = NULL;
2294 collationLocaleSet = disable ? 1 : 0;
2295 }
2296
2297 static const struct {
2298 const char * const str;
2299 int val;
2300 } ATtable[] = {
2301 { "case_first", UCOL_CASE_FIRST },
2302 { "upper", UCOL_UPPER_FIRST },
2303 { "lower", UCOL_LOWER_FIRST },
2304 { "default ", UCOL_DEFAULT },
2305 { "strength", 999 },
2306 { "primary ", UCOL_PRIMARY },
2307 { "secondary ", UCOL_SECONDARY },
2308 { "teritary ", UCOL_TERTIARY },
2309 { "guaternary ", UCOL_QUATERNARY },
2310 { "identical ", UCOL_IDENTICAL },
2311 { "french_collation", UCOL_FRENCH_COLLATION },
2312 { "on", UCOL_ON },
2313 { "off", UCOL_OFF },
2314 { "normalization", UCOL_NORMALIZATION_MODE },
2315 { "alternate_handling", UCOL_ALTERNATE_HANDLING },
2316 { "non_ignorable", UCOL_NON_IGNORABLE },
2317 { "shifted", UCOL_SHIFTED },
2318 { "case_level", UCOL_CASE_LEVEL },
2319 { "hiragana_quaternary", UCOL_HIRAGANA_QUATERNARY_MODE },
2320 { NULL, 0 }
2321 };
2322
2323 #ifdef Win32
2324 #define BUFFER_SIZE 512
2325 typedef int (WINAPI *PGSDLN)(LPWSTR, int);
2326
getLocale(void)2327 static const char *getLocale(void)
2328 {
2329 const char *p = getenv("R_ICU_LOCALE");
2330 if (p && p[0]) return p;
2331
2332 // This call is >= Vista/Server 2008
2333 // ICU should accept almost all of these, e.g. en-US and uz-Latn-UZ
2334 PGSDLN pGSDLN = (PGSDLN)
2335 GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
2336 "GetSystemDefaultLocaleName");
2337 if(pGSDLN) {
2338 WCHAR wcBuffer[BUFFER_SIZE];
2339 pGSDLN(wcBuffer, BUFFER_SIZE);
2340 static char locale[BUFFER_SIZE];
2341 WideCharToMultiByte(CP_ACP, 0, wcBuffer, -1,
2342 locale, BUFFER_SIZE, NULL, NULL);
2343 return locale;
2344 } else return "root";
2345 }
2346 #else
getLocale(void)2347 static const char *getLocale(void)
2348 {
2349 const char *p = getenv("R_ICU_LOCALE");
2350 return (p && p[0]) ? p : setlocale(LC_COLLATE, NULL);
2351 }
2352 #endif
2353
do_ICUset(SEXP call,SEXP op,SEXP args,SEXP rho)2354 SEXP attribute_hidden do_ICUset(SEXP call, SEXP op, SEXP args, SEXP rho)
2355 {
2356 SEXP x;
2357 UErrorCode status = U_ZERO_ERROR;
2358
2359 for (; args != R_NilValue; args = CDR(args)) {
2360 if (isNull(TAG(args))) error(_("all arguments must be named"));
2361 const char *this = CHAR(PRINTNAME(TAG(args)));
2362 const char *s;
2363
2364 x = CAR(args);
2365 if (!isString(x) || LENGTH(x) != 1)
2366 error(_("invalid '%s' argument"), this);
2367 s = CHAR(STRING_ELT(x, 0));
2368 if (streql(this, "locale")) {
2369 if (collator) {
2370 ucol_close(collator);
2371 collator = NULL;
2372 }
2373 if(streql(s, "ASCII")) {
2374 collationLocaleSet = 2;
2375 } else {
2376 if(strcmp(s, "none")) {
2377 if(streql(s, "default"))
2378 uloc_setDefault(getLocale(), &status);
2379 else uloc_setDefault(s, &status);
2380 if(U_FAILURE(status))
2381 error("failed to set ICU locale %s (%d)", s, status);
2382 collator = ucol_open(NULL, &status);
2383 if (U_FAILURE(status)) {
2384 collator = NULL;
2385 error("failed to open ICU collator (%d)", status);
2386 }
2387 }
2388 collationLocaleSet = 1;
2389 }
2390 } else {
2391 int i, at = -1, val = -1;
2392 for (i = 0; ATtable[i].str; i++)
2393 if (streql(this, ATtable[i].str)) {
2394 at = ATtable[i].val;
2395 break;
2396 }
2397 for (i = 0; ATtable[i].str; i++)
2398 if (streql(s, ATtable[i].str)) {
2399 val = ATtable[i].val;
2400 break;
2401 }
2402 if (collator && at == 999 && val >= 0) {
2403 ucol_setStrength(collator, val);
2404 } else if (collator && at >= 0 && val >= 0) {
2405 ucol_setAttribute(collator, at, val, &status);
2406 if (U_FAILURE(status))
2407 error("failed to set ICU collator attribute");
2408 }
2409 }
2410 }
2411
2412 return R_NilValue;
2413 }
2414
do_ICUget(SEXP call,SEXP op,SEXP args,SEXP rho)2415 SEXP attribute_hidden do_ICUget(SEXP call, SEXP op, SEXP args, SEXP rho)
2416 {
2417 const char *ans = "unknown", *res;
2418 checkArity(op, args);
2419
2420 if (collationLocaleSet == 2) {
2421 ans = "ASCII";
2422 } else if(collator) {
2423 UErrorCode status = U_ZERO_ERROR;
2424 int type = asInteger(CAR(args));
2425 if (type < 1 || type > 2)
2426 error(_("invalid '%s' value"), "type");
2427
2428 res = ucol_getLocaleByType(collator,
2429 type == 1 ? ULOC_ACTUAL_LOCALE : ULOC_VALID_LOCALE,
2430 &status);
2431 if(!U_FAILURE(status) && res) ans = res;
2432 } else ans = "ICU not in use";
2433 return mkString(ans);
2434 }
2435
2436 /* Caller has to manage the R_alloc stack */
2437 /* NB: strings can have equal collation weight without being identical */
2438 attribute_hidden
Scollate(SEXP a,SEXP b)2439 int Scollate(SEXP a, SEXP b)
2440 {
2441 if (!collationLocaleSet) {
2442 int errsv = errno; /* OSX may set errno in the operations below. */
2443 collationLocaleSet = 1;
2444
2445 /* A lot of code depends on that setting LC_ALL or LC_COLLATE to "C"
2446 via environment variables or Sys.setlocale ensures the "C" collation
2447 order. Originally, R_ICU_LOCALE always took precedence over LC_ALL
2448 and LC_COLLATE variables and over Sys.setlocale (except on Unix when
2449 R_ICU_LOCALE=C). This now adds an exception: when LC_ALL is set to "C"
2450 (or unset and LC_COLLATE is set to "C"), the "C" collation order will
2451 be used. */
2452 const char *envl = getenv("LC_ALL");
2453 if (!envl || !envl[0])
2454 envl = getenv("LC_COLLATE");
2455 int useC = envl && !strcmp(envl, "C");
2456
2457 #ifndef Win32
2458 if (!useC && strcmp("C", getLocale()) ) {
2459 #else
2460 /* On Windows, ICU is used for R_ICU_LOCALE=C, on Unix, it is not. */
2461 /* FIXME: as ICU does not support C as locale, could we use the Unix
2462 behavior on all systems? */
2463 const char *p = getenv("R_ICU_LOCALE");
2464 if(p && p[0] && (!useC || !strcmp(p, "C"))) {
2465 #endif
2466 UErrorCode status = U_ZERO_ERROR;
2467 uloc_setDefault(getLocale(), &status);
2468 if(U_FAILURE(status))
2469 error("failed to set ICU locale (%d)", status);
2470 collator = ucol_open(NULL, &status);
2471 if (U_FAILURE(status)) {
2472 collator = NULL;
2473 error("failed to open ICU collator (%d)", status);
2474 }
2475 }
2476 errno = errsv;
2477 }
2478 // translation may use escapes, but that is OK here
2479 if (collator == NULL)
2480 return collationLocaleSet == 2 ?
2481 strcmp(translateChar(a), translateChar(b)) :
2482 strcoll(translateChar(a), translateChar(b));
2483
2484 UCharIterator aIter, bIter;
2485 const char *as = translateCharUTF8(a), *bs = translateCharUTF8(b);
2486 int len1 = (int) strlen(as), len2 = (int) strlen(bs);
2487 uiter_setUTF8(&aIter, as, len1);
2488 uiter_setUTF8(&bIter, bs, len2);
2489 UErrorCode status = U_ZERO_ERROR;
2490 int result = ucol_strcollIter(collator, &aIter, &bIter, &status);
2491 if (U_FAILURE(status)) error("could not collate using ICU");
2492 return result;
2493 }
2494
2495 #else /* not USE_ICU */
2496
2497 SEXP attribute_hidden do_ICUset(SEXP call, SEXP op, SEXP args, SEXP rho)
2498 {
2499 warning(_("ICU is not supported on this build"));
2500 return R_NilValue;
2501 }
2502
2503 SEXP attribute_hidden do_ICUget(SEXP call, SEXP op, SEXP args, SEXP rho)
2504 {
2505 checkArity(op, args);
2506 return mkString("ICU not in use");
2507 }
2508
2509 void attribute_hidden resetICUcollator(Rboolean disable) {}
2510
2511 # ifdef Win32
2512
2513 static int Rstrcoll(const char *s1, const char *s2)
2514 {
2515 R_CheckStack2(sizeof(wchar_t) * (2 + strlen(s1) + strlen(s2)));
2516 wchar_t w1[strlen(s1)+1], w2[strlen(s2)+1];
2517 utf8towcs(w1, s1, strlen(s1));
2518 utf8towcs(w2, s2, strlen(s2));
2519 return wcscoll(w1, w2);
2520 }
2521
2522 int Scollate(SEXP a, SEXP b)
2523 {
2524 if(getCharCE(a) == CE_UTF8 || getCharCE(b) == CE_UTF8)
2525 return Rstrcoll(translateCharUTF8(a), translateCharUTF8(b));
2526 else
2527 return strcoll(translateChar(a), translateChar(b));
2528 }
2529
2530 # else
2531 attribute_hidden
2532 int Scollate(SEXP a, SEXP b)
2533 {
2534 return strcoll(translateChar(a), translateChar(b));
2535 }
2536
2537 # endif
2538 #endif
2539
2540 #include <lzma.h>
2541
2542 SEXP attribute_hidden do_crc64(SEXP call, SEXP op, SEXP args, SEXP rho)
2543 {
2544 checkArity(op, args);
2545 SEXP in = CAR(args);
2546 uint64_t crc = 0;
2547 char ans[17];
2548 if (!isString(in)) error("input must be a character string");
2549 const char *str = CHAR(STRING_ELT(in, 0));
2550
2551 /* Seems this is really 64-bit only on 64-bit platforms */
2552 crc = lzma_crc64((uint8_t *)str, strlen(str), crc);
2553 snprintf(ans, 17, "%lx", (long unsigned int) crc);
2554 return mkString(ans);
2555 }
2556
2557 static void
2558 bincode(double *x, R_xlen_t n, double *breaks, int nb,
2559 int *code, int right, int include_border)
2560 {
2561 int lo, hi, nb1 = nb - 1, new;
2562 int lft = !right;
2563
2564 /* This relies on breaks being sorted, so wise to check that */
2565 for(int i = 1; i < nb; i++)
2566 if(breaks[i-1] > breaks[i]) error(_("'breaks' is not sorted"));
2567
2568 for(R_xlen_t i = 0; i < n; i++) {
2569 code[i] = NA_INTEGER;
2570 if(!ISNAN(x[i])) {
2571 lo = 0;
2572 hi = nb1;
2573 if(x[i] < breaks[lo] || breaks[hi] < x[i] ||
2574 (x[i] == breaks[lft ? hi : lo] && ! include_border)) ;
2575 else {
2576 while(hi - lo >= 2) {
2577 new = (hi + lo)/2;
2578 if(x[i] > breaks[new] || (lft && x[i] == breaks[new]))
2579 lo = new;
2580 else
2581 hi = new;
2582 }
2583 code[i] = lo + 1;
2584 }
2585 }
2586 }
2587 }
2588
2589 /* 'breaks' cannot be a long vector as the return codes are integer. */
2590 SEXP attribute_hidden do_bincode(SEXP call, SEXP op, SEXP args, SEXP rho)
2591 {
2592 checkArity(op, args);
2593 SEXP x, breaks, right, lowest;
2594 x = CAR(args); args = CDR(args);
2595 breaks = CAR(args); args = CDR(args);
2596 right = CAR(args); args = CDR(args);
2597 lowest = CAR(args);
2598 #ifdef LONG_VECTOR_SUPPORT
2599 if (IS_LONG_VEC(breaks))
2600 error(_("long vector '%s' is not supported"), "breaks");
2601 #endif
2602 PROTECT(x = coerceVector(x, REALSXP));
2603 PROTECT(breaks = coerceVector(breaks, REALSXP));
2604 R_xlen_t n = XLENGTH(x);
2605 int nB = LENGTH(breaks), sr = asLogical(right), sl = asLogical(lowest);
2606 if (nB == NA_INTEGER) error(_("invalid '%s' argument"), "breaks");
2607 if (sr == NA_INTEGER) error(_("invalid '%s' argument"), "right");
2608 if (sl == NA_INTEGER) error(_("invalid '%s' argument"), "include.lowest");
2609 SEXP codes;
2610 PROTECT(codes = allocVector(INTSXP, n));
2611 bincode(REAL(x), n, REAL(breaks), nB, INTEGER(codes), sr, sl);
2612 UNPROTECT(3);
2613 return codes;
2614 }
2615
2616 SEXP attribute_hidden do_tabulate(SEXP call, SEXP op, SEXP args, SEXP rho)
2617 {
2618 checkArity(op, args);
2619 SEXP in = CAR(args), nbin = CADR(args);
2620 if (TYPEOF(in) != INTSXP) error("invalid input");
2621 R_xlen_t n = XLENGTH(in);
2622 int nb = asInteger(nbin);
2623 if (nb == NA_INTEGER || nb < 0)
2624 error(_("invalid '%s' argument"), "nbin");
2625 int *x = INTEGER(in);
2626 SEXP ans;
2627 #ifdef LONG_VECTOR_SUPPORT
2628 if (n > INT_MAX) {
2629 ans = allocVector(REALSXP, nb);
2630 double *y = REAL(ans);
2631 if (nb) memset(y, 0, nb * sizeof(double));
2632 for(R_xlen_t i = 0 ; i < n ; i++)
2633 if (x[i] != NA_INTEGER && x[i] > 0 && x[i] <= nb) y[x[i] - 1]++;
2634 } else
2635 #endif
2636 {
2637 ans = allocVector(INTSXP, nb);
2638 int *y = INTEGER(ans);
2639 if (nb) memset(y, 0, nb * sizeof(int));
2640 for(R_xlen_t i = 0 ; i < n ; i++)
2641 if (x[i] != NA_INTEGER && x[i] > 0 && x[i] <= nb) y[x[i] - 1]++;
2642 }
2643 return ans;
2644 }
2645
2646 /* Note: R's findInterval( x , vec, ...) has first two arguments swapped !
2647 * .Internal(findInterval(vec, x, rightmost.closed, all.inside, left.open))
2648 * xt x right inside leftOp
2649 * x can be a long vector but xt cannot since the result is integer
2650 */
2651 SEXP attribute_hidden do_findinterval(SEXP call, SEXP op, SEXP args, SEXP rho)
2652 {
2653 checkArity(op, args);
2654 SEXP xt, x, right, inside, leftOp;
2655 xt = CAR(args); args = CDR(args);
2656 x = CAR(args); args = CDR(args);
2657 right = CAR(args); args = CDR(args);
2658 inside = CAR(args);args = CDR(args);
2659 leftOp = CAR(args);
2660 if(TYPEOF(xt) != REALSXP || TYPEOF(x) != REALSXP) error("invalid input");
2661 #ifdef LONG_VECTOR_SUPPORT
2662 if (IS_LONG_VEC(xt))
2663 error(_("long vector '%s' is not supported"), "vec");
2664 #endif
2665 int n = LENGTH(xt);
2666 if (n == NA_INTEGER) error(_("invalid '%s' argument"), "vec");
2667 R_xlen_t nx = XLENGTH(x);
2668 int sr = asLogical(right), si = asLogical(inside), lO = asLogical(leftOp);
2669 if (sr == NA_INTEGER)
2670 error(_("invalid '%s' argument"), "rightmost.closed");
2671 if (si == NA_INTEGER)
2672 error(_("invalid '%s' argument"), "all.inside");
2673 SEXP ans = allocVector(INTSXP, nx);
2674 double *rxt = REAL(xt), *rx = REAL(x);
2675 int ii = 1;
2676 for(int i = 0; i < nx; i++) {
2677 if (ISNAN(rx[i]))
2678 ii = NA_INTEGER;
2679 else {
2680 int mfl;
2681 ii = findInterval2(rxt, n, rx[i], sr, si, lO, ii, &mfl); // -> ../appl/interv.c
2682 }
2683 INTEGER(ans)[i] = ii;
2684 }
2685 return ans;
2686 }
2687
2688 #ifdef Win32
2689 // this includes RS.h
2690 # undef ERROR
2691 #endif
2692 #include <R_ext/Applic.h>
2693 SEXP attribute_hidden do_pretty(SEXP call, SEXP op, SEXP args, SEXP rho)
2694 {
2695 checkArity(op, args);
2696 SEXP ans, nm, hi;
2697 double l = asReal(CAR(args)); args = CDR(args);
2698 if (!R_FINITE(l)) error(_("invalid '%s' argument"), "l");
2699 double u = asReal(CAR(args)); args = CDR(args);
2700 if (!R_FINITE(u)) error(_("invalid '%s' argument"), "u");
2701 int n = asInteger(CAR(args)); args = CDR(args);
2702 if (n == NA_INTEGER || n < 0) error(_("invalid '%s' argument"), "n");
2703 int min_n = asInteger(CAR(args)); args = CDR(args);
2704 if (min_n == NA_INTEGER || min_n < 0 || min_n > n)
2705 error(_("invalid '%s' argument"), "min.n");
2706 double shrink = asReal(CAR(args)); args = CDR(args);
2707 if (!R_FINITE(shrink) || shrink <= 0.)
2708 error(_("invalid '%s' argument"), "shrink.sml");
2709 PROTECT(hi = coerceVector(CAR(args), REALSXP)); args = CDR(args);
2710 double z;
2711 if (!R_FINITE(z = REAL(hi)[0]) || z < 0.)
2712 error(_("invalid '%s' argument"), "high.u.bias");
2713 if (!R_FINITE(z = REAL(hi)[1]) || z < 0.)
2714 error(_("invalid '%s' argument"), "u5.bias");
2715 int eps = asInteger(CAR(args)); /* eps.correct */
2716 if (eps == NA_INTEGER || eps < 0 || eps > 2)
2717 error(_("'eps.correct' must be 0, 1, or 2"));
2718 R_pretty(&l, &u, &n, min_n, shrink, REAL(hi), eps, 1);
2719 //------ (returns 'unit' which we do not need)
2720 PROTECT(ans = allocVector(VECSXP, 3));
2721 SET_VECTOR_ELT(ans, 0, ScalarReal(l));
2722 SET_VECTOR_ELT(ans, 1, ScalarReal(u));
2723 SET_VECTOR_ELT(ans, 2, ScalarInteger(n));
2724 nm = allocVector(STRSXP, 3);
2725 setAttrib(ans, R_NamesSymbol, nm);
2726 SET_STRING_ELT(nm, 0, mkChar("l"));
2727 SET_STRING_ELT(nm, 1, mkChar("u"));
2728 SET_STRING_ELT(nm, 2, mkChar("n"));
2729 UNPROTECT(2);
2730 return ans;
2731 }
2732
2733 /*
2734 r <- .Internal(formatC(x, as.character(mode), width, digits,
2735 as.character(format), as.character(flag), i.strlen))
2736 */
2737
2738 static void
2739 str_signif_sexp(SEXP x, const char *type, int width, int digits,
2740 const char *format, const char *flag, char **result);
2741
2742 SEXP attribute_hidden do_formatC(SEXP call, SEXP op, SEXP args, SEXP rho)
2743 {
2744 checkArity(op, args);
2745 SEXP x = CAR(args); args = CDR(args);
2746 if (!isVector(x)) error(_("'x' must be a vector"));
2747 R_xlen_t n = XLENGTH(x);
2748 const char *type = CHAR(STRING_ELT(CAR(args), 0)); args = CDR(args);
2749 int width = asInteger(CAR(args)); args = CDR(args);
2750 int digits = asInteger(CAR(args)); args = CDR(args);
2751 const char *fmt = CHAR(STRING_ELT(CAR(args), 0)); args = CDR(args);
2752 const char *flag = CHAR(STRING_ELT(CAR(args), 0)); args = CDR(args);
2753 SEXP i_strlen = PROTECT(coerceVector(CAR(args), INTSXP));
2754 char **cptr = (char **) R_alloc(n, sizeof(char*));
2755 for (R_xlen_t i = 0; i < n; i++) {
2756 int ix = INTEGER(i_strlen)[i] + 2;
2757 cptr[i] = (char *) R_alloc(ix + 1, sizeof(char));
2758 memset(cptr[i], ' ', ix);
2759 cptr[i][ix] = 0;
2760 }
2761 str_signif_sexp(x, type, width, digits, fmt, flag, cptr);
2762 SEXP ans = PROTECT(allocVector(STRSXP, n));
2763 for (R_xlen_t i = 0; i < n; i++) SET_STRING_ELT(ans, i, mkChar(cptr[i]));
2764 UNPROTECT(2);
2765 return ans;
2766 }
2767
2768 /* Former src/appl/strsignif.c
2769 *
2770 * Copyright (C) Martin Maechler, 1994, 1998
2771 * Copyright (C) 2001-2013 the R Core Team
2772 *
2773 * I want you to preserve the copyright of the original author(s),
2774 * and encourage you to send me any improvements by e-mail. (MM).
2775 *
2776 * Originally from Bill Dunlap
2777 * bill@stat.washington.edu
2778 * Wed Feb 21, 1990
2779 *
2780 * Much improved by Martin Maechler, including the "fg" format.
2781 *
2782 * Patched by Friedrich.Leisch@ci.tuwien.ac.at
2783 * Fri Nov 22, 1996
2784 *
2785 * Some fixes by Ross Ihaka
2786 * ihaka@stat.auckland.ac.nz
2787 * Sat Dec 21, 1996
2788 * Integer arguments changed from "long" to "int"
2789 * Bus error due to non-writable strings fixed
2790 *
2791 * BDR 2001-10-30 use R_alloc not Calloc as memory was not
2792 * reclaimed on error (and there are many error exits).
2793 *
2794 * type "double" or "integer" (R - numeric 'mode').
2795 *
2796 * width The total field width; width < 0 means to left justify
2797 * the number in this field (equivalent to flag = "-").
2798 * It is possible that the result will be longer than this,
2799 * but that should only happen in reasonable cases.
2800 *
2801 * digits The desired number of digits after the decimal point.
2802 * digits < 0 uses the default for C, namely 6 digits.
2803 *
2804 * format "d" (for integers) or "f", "e","E", "g", "G" (for 'real')
2805 * "f" gives numbers in the usual "xxx.xxx" format;
2806 * "e" and "E" give n.ddde<nn> or n.dddE<nn> (scientific format);
2807 * "g" and "G" puts them into scientific format if it saves
2808 * space to do so.
2809 * NEW: "fg" gives numbers in "xxx.xxx" format as "f",
2810 * ~~ however, digits are *significant* digits and,
2811 * if digits > 0, no trailing zeros are produced, as in "g".
2812 *
2813 * flag Format modifier as in K&R "C", 2nd ed., p.243;
2814 * e.g., "0" pads leading zeros; "-" does left adjustment
2815 * the other possible flags are "+", " ", and "#".
2816 * New (Feb.98): if flag has more than one character, all are passed..
2817 *
2818 * Gabe Becker (2019-05-21): Added str_signif_sexp which wraps
2819 * original DATAPTR based str_signif to support ALTREPs.
2820 *
2821 * Any future calls to str_signif on SEXP data should be via
2822 * str_signif_sexp to ensure ALTREP support.
2823 *
2824 */
2825
2826 /* <UTF8> char here is either ASCII or handled as a whole */
2827
2828 #ifdef Win32
2829 /* avoid latest MinGW's redefinition in stdio.h */
2830 #include <trioremap.h>
2831 #endif
2832 #include <Rmath.h> /* fround */
2833
2834 static
2835 void str_signif(void *x, R_xlen_t n, const char *type, int width, int digits,
2836 const char *format, const char *flag, char **result)
2837 {
2838 int dig = abs(digits);
2839 Rboolean rm_trailing_0 = digits >= 0;
2840 Rboolean do_fg = !strcmp("fg", format); /* TRUE iff format == "fg" */
2841 double xx;
2842 int iex;
2843 size_t j, len_flag = strlen(flag);
2844 const void *vmax = vmaxget();
2845
2846 char *f0 = R_alloc((size_t) do_fg ? 1+1+len_flag+3 : 1, sizeof(char));
2847 char *form = R_alloc((size_t) 1+1+len_flag+3 + strlen(format),
2848 sizeof(char));
2849
2850 if (width == 0)
2851 error("width cannot be zero");
2852
2853 if (strcmp("d", format) == 0) {
2854 if (len_flag == 0)
2855 strcpy(form, "%*d");
2856 else {
2857 strcpy(form, "%");
2858 strcat(form, flag);
2859 strcat(form, "*d");
2860 }
2861 if (strcmp("integer", type) == 0)
2862 for (R_xlen_t i = 0; i < n; i++)
2863 snprintf(result[i], strlen(result[i]) + 1,
2864 form, width, ((int *)x)[i]);
2865 else
2866 error("'type' must be \"integer\" for \"d\"-format");
2867 }
2868 else { /* --- floating point --- */
2869 if (len_flag == 0)
2870 strcpy(form, "%*.*");
2871 else {
2872 strcpy(form, "%");
2873 strcat(form, flag);
2874 strcat(form, "*.*");
2875 }
2876
2877 if(do_fg) {
2878 strcpy(f0, "%");
2879 strcat(f0, flag);
2880 strcat(f0, ".*f");
2881 strcat(form, "g");
2882 }
2883 else
2884 strcat(form, format);
2885 #ifdef DEBUG
2886 fprintf(stderr, "strsignif.c: form='%s', width=%d, dig=%d\n",
2887 form, width, dig);
2888 if(do_fg) fprintf(stderr, "\t\"fg\": f0='%s'.", f0);
2889 #endif
2890 if (strcmp("double", type) == 0) {
2891 if(do_fg) /* do smart "f" : */
2892 for (R_xlen_t i = 0; i < n; i++) {
2893 xx = ((double *)x)[i];
2894 if(xx == 0.)
2895 strcpy(result[i], "0");
2896 else {
2897 /* This was iex= (int)floor(log10(fabs(xx)))
2898 That's wrong, as xx might get rounded up,
2899 and we do need some fuzz or 99.5 is correct.
2900 */
2901 double xxx = fabs(xx), X;
2902 iex = (int)floor(log10(xxx) + 1e-12);
2903 X = fround(xxx/Rexp10((double)iex) + 1e-12,
2904 (double)(dig-1));
2905 if(iex > 0 && X >= 10) {
2906 xx = X * Rexp10((double)iex);
2907 iex++;
2908 }
2909 if(iex == -4 && fabs(xx)< 1e-4) {/* VERY rare case */
2910 iex = -5;
2911 }
2912 if(iex < -4) {
2913 /* "g" would result in 'e-' representation:*/
2914 snprintf(result[i], strlen(result[i]) + 1,
2915 f0, dig-1 + -iex, xx);
2916 #ifdef DEBUG
2917 fprintf(stderr, " x[%d]=%g, iex=%d\n", i, xx, iex);
2918 fprintf(stderr, "\tres. = '%s'; ", result[i]);
2919 #endif
2920 /* Remove trailing "0"s __ IFF flag has no '#': */
2921 if(rm_trailing_0) {
2922 j = strlen(result[i])-1;
2923 #ifdef DEBUG
2924 int jL = j;
2925 #endif
2926 while(result[i][j] == '0') j--;
2927 result[i][j+1] = '\0';
2928 #ifdef DEBUG
2929 fprintf(stderr, "\t>>> jL=%d, j=%d; new res= '%s'\n",
2930 jL, j, result[i]);
2931 #endif
2932 }
2933
2934 } else { /* iex >= -4: NOT "e-" */
2935 /* if iex >= dig, would have "e+" representation */
2936 #ifdef DEBUG
2937 fprintf(stderr, "\t iex >= -4; using %d for 'dig'\n",
2938 (iex >= dig) ? (iex+1) : dig);
2939 #endif
2940 snprintf(result[i], strlen(result[i]) + 1,
2941 form, width, (iex >= dig) ? (iex+1) : dig, xx);
2942 }
2943 } /* xx != 0 */
2944 } /* if(do_fg) for(i..) */
2945 else
2946 for (R_xlen_t i = 0; i < n; i++)
2947 snprintf(result[i], strlen(result[i]) + 1,
2948 form, width, dig, ((double *)x)[i]);
2949 } else
2950 error("'type' must be \"real\" for this format");
2951 }
2952 vmaxset(vmax);
2953 }
2954
2955
2956 /* wrap original DATAPTR based str_signif in ITERATE_BY_REGION calls to
2957 support ALTREPs
2958
2959 We still accept type because it is part of the defined API and only defaults
2960 to matching the SEXP type.
2961 */
2962 static
2963 void str_signif_sexp(SEXP x, const char *type, int width, int digits,
2964 const char *format, const char *flag, char **result)
2965 {
2966 /* result + idx is the overall position of the chunk we're populating */
2967 if(TYPEOF(x) == INTSXP) {
2968 ITERATE_BY_REGION(x, px, idx, nb, int, INTEGER,
2969 {
2970 str_signif((void *) px, nb, type, width, digits,
2971 format, flag, result + idx);
2972 });
2973 } else if (TYPEOF(x) == REALSXP) {
2974 ITERATE_BY_REGION(x, px, idx, nb, double, REAL,
2975 {
2976 str_signif((void *) px, nb, type, width, digits,
2977 format, flag, result + idx);
2978 });
2979 } else {
2980 error("unsupported type ");
2981 }
2982 }
2983
2984 /* added in R 4.1.0.
2985 This checks if it succeeds.
2986 FIXME: is this worth inlining?
2987 */
2988 char *Rstrdup(const char *s)
2989 {
2990 size_t nb = strlen(s) + 1;
2991 void *cpy = malloc(nb);
2992 if (cpy == NULL) error("allocation error in Rstrdup");
2993 memcpy (cpy, s, nb);
2994 return (char *) cpy;
2995 }
2996