1 /*
2  *  R : A Computer Language for Statistical Data Analysis
3  *  Copyright (C) 1997--2021  The R Core Team
4  *  Copyright (C) 1995, 1996  Robert Gentleman and Ross Ihaka
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, a copy is available at
18  *  https://www.R-project.org/Licenses/
19  */
20 
21 #ifdef HAVE_CONFIG_H
22 #include <config.h>
23 #endif
24 
25 #define R_USE_SIGNALS 1
26 #include <Defn.h>
27 #include <Internal.h>
28 #include <R_ext/Print.h>
29 #include <ctype.h>		/* for isspace */
30 #include <float.h>		/* for DBL_MAX */
31 #include <R_ext/Itermacros.h> /* for ITERATE_BY_REGION */
32 
33 #undef COMPILING_R
34 
35 #define R_imax2(x, y) ((x < y) ? y : x)
36 #include <Print.h>
37 
38 #ifdef HAVE_UNISTD_H
39 #include <unistd.h>
40 #endif
41 
42 #ifdef Win32
43 void R_UTF8fixslash(char *s);
44 static void R_wfixslash(wchar_t *s);
45 #endif
46 
47 #ifdef __cplusplus
48 #include "Clinkage.h"
49 
50 extern "C" {
51 #endif
52 
53 #if defined FC_LEN_T
54 # include <stddef.h>
55 void F77_SYMBOL(rwarnc)(char *msg, int *nchar, FC_LEN_T msg_len);
56 void NORET F77_SYMBOL(rexitc)(char *msg, int *nchar, FC_LEN_T msg_len);
57 #else
58 void F77_SYMBOL(rwarnc)(char *msg, int *nchar);
59 void NORET F77_SYMBOL(rexitc)(char *msg, int *nchar);
60 #endif
61 
62 #ifdef __cplusplus
63 }
64 #endif
65 
66 #include <rlocale.h>
67 
68 /* Many small functions are included from ../include/Rinlinedfuns.h */
69 
nrows(SEXP s)70 int nrows(SEXP s) // ~== NROW(.)  in R
71 {
72     SEXP t;
73     if (isVector(s) || isList(s)) {
74 	t = getAttrib(s, R_DimSymbol);
75 	if (t == R_NilValue) return LENGTH(s);
76 	return INTEGER(t)[0];
77     }
78     else if (isFrame(s)) {
79 	return nrows(CAR(s));
80     }
81     else error(_("object is not a matrix"));
82     return -1;
83 }
84 
85 
ncols(SEXP s)86 int ncols(SEXP s) // ~== NCOL(.)  in R
87 {
88     SEXP t;
89     if (isVector(s) || isList(s)) {
90 	t = getAttrib(s, R_DimSymbol);
91 	if (t == R_NilValue) return 1;
92 	if (LENGTH(t) >= 2) return INTEGER(t)[1];
93 	/* This is a 1D (or possibly 0D array) */
94 	return 1;
95     }
96     else if (isFrame(s)) {
97 	return length(s);
98     }
99     else error(_("object is not a matrix"));
100     return -1;/*NOTREACHED*/
101 }
102 
103 #ifdef UNUSED
104 const static char type_msg[] = "invalid type passed to internal function\n";
105 
internalTypeCheck(SEXP call,SEXP s,SEXPTYPE type)106 void internalTypeCheck(SEXP call, SEXP s, SEXPTYPE type)
107 {
108     if (TYPEOF(s) != type) {
109 	if (call)
110 	    errorcall(call, type_msg);
111 	else
112 	    error(type_msg);
113     }
114 }
115 #endif
116 
117 const static char * const truenames[] = {
118     "T",
119     "True",
120     "TRUE",
121     "true",
122     (char *) NULL,
123 };
124 
125 const static char * const falsenames[] = {
126     "F",
127     "False",
128     "FALSE",
129     "false",
130     (char *) NULL,
131 };
132 
asChar(SEXP x)133 SEXP asChar(SEXP x)
134 {
135 	if (isVectorAtomic(x) && XLENGTH(x) >= 1) {
136 	    int w, d, e, wi, di, ei;
137 	    char buf[MAXELTSIZE];  /* Probably 100 would suffice */
138 
139 	    switch (TYPEOF(x)) {
140 	    case LGLSXP:
141 		if (LOGICAL(x)[0] == NA_LOGICAL)
142 		    return NA_STRING;
143 		if (LOGICAL(x)[0])
144 		    sprintf(buf, "TRUE");
145 		else
146 		    sprintf(buf, "FALSE");
147 		return mkChar(buf);
148 	    case INTSXP:
149 		if (INTEGER(x)[0] == NA_INTEGER)
150 		    return NA_STRING;
151 		snprintf(buf, MAXELTSIZE, "%d", INTEGER(x)[0]);
152 		return mkChar(buf);
153 	    case REALSXP:
154 		PrintDefaults();
155 		formatReal(REAL(x), 1, &w, &d, &e, 0);
156 		return mkChar(EncodeReal0(REAL(x)[0], w, d, e, OutDec));
157 	    case CPLXSXP:
158 		PrintDefaults();
159 		formatComplex(COMPLEX(x), 1, &w, &d, &e, &wi, &di, &ei, 0);
160 		return mkChar(EncodeComplex(COMPLEX(x)[0], w, d, e, wi, di, ei, OutDec));
161 	    case STRSXP:
162 		return STRING_ELT(x, 0);
163 	    default:
164 		return NA_STRING;
165 	    }
166 	} else if(TYPEOF(x) == CHARSXP) {
167 	    return x;
168 	} else if(TYPEOF(x) == SYMSXP)
169 	    return PRINTNAME(x);
170     return NA_STRING;
171 }
172 
isUnordered(SEXP s)173 Rboolean isUnordered(SEXP s)
174 {
175     return (TYPEOF(s) == INTSXP
176 	    && inherits(s, "factor")
177 	    && !inherits(s, "ordered"));
178 }
179 
isOrdered(SEXP s)180 Rboolean isOrdered(SEXP s)
181 {
182     return (TYPEOF(s) == INTSXP
183 	    && inherits(s, "factor")
184 	    && inherits(s, "ordered"));
185 }
186 
187 
188 const static struct {
189     const char * const str;
190     const int type;
191 }
192 TypeTable[] = {
193     { "NULL",		NILSXP	   },  /* real types */
194     { "symbol",		SYMSXP	   },
195     { "pairlist",	LISTSXP	   },
196     { "closure",	CLOSXP	   },
197     { "environment",	ENVSXP	   },
198     { "promise",	PROMSXP	   },
199     { "language",	LANGSXP	   },
200     { "special",	SPECIALSXP },
201     { "builtin",	BUILTINSXP },
202     { "char",		CHARSXP	   },
203     { "logical",	LGLSXP	   },
204     { "integer",	INTSXP	   },
205     { "double",		REALSXP	   }, /*-  "real", for R <= 0.61.x */
206     { "complex",	CPLXSXP	   },
207     { "character",	STRSXP	   },
208     { "...",		DOTSXP	   },
209     { "any",		ANYSXP	   },
210     { "expression",	EXPRSXP	   },
211     { "list",		VECSXP	   },
212     { "externalptr",	EXTPTRSXP  },
213     { "bytecode",	BCODESXP   },
214     { "weakref",	WEAKREFSXP },
215     { "raw",		RAWSXP },
216     { "S4",		S4SXP },
217     /* aliases : */
218     { "numeric",	REALSXP	   },
219     { "name",		SYMSXP	   },
220 
221     { (char *)NULL,	-1	   }
222 };
223 
224 
str2type(const char * s)225 SEXPTYPE str2type(const char *s)
226 {
227     int i;
228     for (i = 0; TypeTable[i].str; i++) {
229 	if (!strcmp(s, TypeTable[i].str))
230 	    return (SEXPTYPE) TypeTable[i].type;
231     }
232     /* SEXPTYPE is an unsigned int, so the compiler warns us w/o the cast. */
233     return (SEXPTYPE) -1;
234 }
235 
236 static struct {
237     const char *cstrName;
238     SEXP rcharName;
239     SEXP rstrName;
240     SEXP rsymName;
241 } Type2Table[MAX_NUM_SEXPTYPE];
242 
243 
findTypeInTypeTable(SEXPTYPE t)244 static int findTypeInTypeTable(SEXPTYPE t)
245  {
246     for (int i = 0; TypeTable[i].str; i++)
247 	if (TypeTable[i].type == t) return i;
248 
249     return -1;
250 }
251 
252 // called from main.c
253 attribute_hidden
InitTypeTables(void)254 void InitTypeTables(void) {
255 
256     /* Type2Table */
257     for (int type = 0; type < MAX_NUM_SEXPTYPE; type++) {
258 	int j = findTypeInTypeTable(type);
259 
260 	if (j != -1) {
261 	    const char *cstr = TypeTable[j].str;
262 	    SEXP rchar = PROTECT(mkChar(cstr));
263 	    SEXP rstr = ScalarString(rchar);
264 	    MARK_NOT_MUTABLE(rstr);
265 	    R_PreserveObject(rstr);
266 	    SEXP rsym = install(cstr);
267 
268 	    Type2Table[type].cstrName = cstr;
269 	    Type2Table[type].rcharName = rchar;
270 	    Type2Table[type].rstrName = rstr;
271 	    Type2Table[type].rsymName = rsym;
272 	    UNPROTECT(1); /* rchar */
273 	} else {
274 	    Type2Table[type].cstrName = NULL;
275 	    Type2Table[type].rcharName = NULL;
276 	    Type2Table[type].rstrName = NULL;
277 	    Type2Table[type].rsymName = NULL;
278 	}
279     }
280 }
281 
type2str_nowarn(SEXPTYPE t)282 SEXP type2str_nowarn(SEXPTYPE t) /* returns a CHARSXP */
283 {
284     // if (t >= 0 && t < MAX_NUM_SEXPTYPE) { /* branch not really needed */
285 	SEXP res = Type2Table[t].rcharName;
286 	if (res != NULL) return res;
287     // }
288     return R_NilValue;
289 }
290 
type2str(SEXPTYPE t)291 SEXP type2str(SEXPTYPE t) /* returns a CHARSXP */
292 {
293     SEXP s = type2str_nowarn(t);
294     if (s != R_NilValue) {
295 	return s;
296     }
297     warning(_("type %d is unimplemented in '%s'"), t, "type2str");
298     char buf[50];
299     snprintf(buf, 50, "unknown type #%d", t);
300     return mkChar(buf);
301 }
302 
type2rstr(SEXPTYPE t)303 SEXP type2rstr(SEXPTYPE t) /* returns a STRSXP */
304 {
305     // if (t < MAX_NUM_SEXPTYPE) {
306 	SEXP res = Type2Table[t].rstrName;
307 	if (res != NULL) return res;
308     // }
309     error(_("type %d is unimplemented in '%s'"), t,
310 	  "type2ImmutableScalarString");
311     return R_NilValue; /* for -Wall */
312 }
313 
type2char(SEXPTYPE t)314 const char *type2char(SEXPTYPE t) /* returns a char* */
315 {
316     // if (t >=0 && t < MAX_NUM_SEXPTYPE) { /* branch not really needed */
317 	const char * res = Type2Table[t].cstrName;
318 	if (res != NULL) return res;
319     // }
320     warning(_("type %d is unimplemented in '%s'"), t, "type2char");
321     static char buf[50];
322     snprintf(buf, 50, "unknown type #%d", t);
323     return buf;
324 }
325 
326 #ifdef UNUSED
type2symbol(SEXPTYPE t)327 SEXP NORET type2symbol(SEXPTYPE t)
328 {
329     // if (t >= 0 && t < MAX_NUM_SEXPTYPE) { /* branch not really needed */
330 	SEXP res = Type2Table[t].rsymName;
331 	if (res != NULL) return res;
332     // }
333     error(_("type %d is unimplemented in '%s'"), t, "type2symbol");
334 }
335 #endif
336 
337 attribute_hidden
UNIMPLEMENTED_TYPEt(const char * s,SEXPTYPE t)338 void NORET UNIMPLEMENTED_TYPEt(const char *s, SEXPTYPE t)
339 {
340     int i;
341 
342     for (i = 0; TypeTable[i].str; i++) {
343 	if (TypeTable[i].type == t)
344 	    error(_("unimplemented type '%s' in '%s'\n"), TypeTable[i].str, s);
345     }
346     error(_("unimplemented type (%d) in '%s'\n"), t, s);
347 }
348 
UNIMPLEMENTED_TYPE(const char * s,SEXP x)349 void NORET UNIMPLEMENTED_TYPE(const char *s, SEXP x)
350 {
351     UNIMPLEMENTED_TYPEt(s, TYPEOF(x));
352 }
353 
354 # include <R_ext/Riconv.h>
355 # include <sys/param.h>
356 # include <errno.h>
357 
358 
359 /* Previous versions of R (< 2.3.0) assumed wchar_t was in Unicode
360    (and it commonly is).  These functions do not. */
361 # ifdef WORDS_BIGENDIAN
362 static const char UCS2ENC[] = "UCS-2BE";
363 # else
364 static const char UCS2ENC[] = "UCS-2LE";
365 # endif
366 
367 
368 /*
369  * out=NULL returns the number of the MBCS chars
370  */
371 /* Note: this does not terminate out, as all current uses are to look
372  * at 'out' a wchar at a time, and sometimes just one char.
373  */
mbcsToUcs2(const char * in,R_ucs2_t * out,int nout,int enc)374 size_t mbcsToUcs2(const char *in, R_ucs2_t *out, int nout, int enc)
375 {
376     void   *cd = NULL ;
377     const char *i_buf;
378     char *o_buf;
379     size_t  i_len, o_len, status, wc_len;
380     /* out length */
381     wc_len = (enc == CE_UTF8)? utf8towcs(NULL, in, 0) : mbstowcs(NULL, in, 0);
382     if (out == NULL || (int)wc_len < 0) return wc_len;
383 
384     if ((void*)-1 == (cd = Riconv_open(UCS2ENC, (enc == CE_UTF8) ? "UTF-8": "")))
385 	return (size_t) -1;
386 
387     i_buf = (char *)in;
388     i_len = strlen(in); /* not including terminator */
389     o_buf = (char *)out;
390     o_len = ((size_t) nout) * sizeof(R_ucs2_t);
391     status = Riconv(cd, &i_buf, (size_t *)&i_len, &o_buf, (size_t *)&o_len);
392     int serrno = errno;
393     Riconv_close(cd);
394     if (status == (size_t)-1) {
395 	switch(serrno){
396 	case EINVAL:
397 	    return (size_t) -2;
398 	case EILSEQ:
399 	    return (size_t) -1;
400 	case E2BIG:
401 	    break;
402 	default:
403 	    errno = EILSEQ;
404 	    return (size_t) -1;
405 	}
406     }
407     return wc_len; /* status would be better? */
408 }
409 
410 
411 #include <wctype.h>
412 
413 /* This one is not in Rinternals.h, but is used in internet module */
isBlankString(const char * s)414 Rboolean isBlankString(const char *s)
415 {
416     if(mbcslocale) {
417 	wchar_t wc; size_t used; mbstate_t mb_st;
418 	mbs_init(&mb_st);
419 	// This does not allow for surrogate pairs, but all blanks are in BMP
420 	while( (used = Mbrtowc(&wc, s, R_MB_CUR_MAX, &mb_st)) ) {
421 	    if(!iswspace((wint_t) wc)) return FALSE;
422 	    s += used;
423 	}
424     } else
425 	while (*s)
426 	    if (!isspace((int)*s++)) return FALSE;
427     return TRUE;
428 }
429 
StringBlank(SEXP x)430 Rboolean StringBlank(SEXP x)
431 {
432     if (x == R_NilValue) return TRUE;
433     else return CHAR(x)[0] == '\0';
434 }
435 
436 /* Function to test whether a string is a true value */
437 
StringTrue(const char * name)438 Rboolean StringTrue(const char *name)
439 {
440     int i;
441     for (i = 0; truenames[i]; i++)
442 	if (!strcmp(name, truenames[i]))
443 	    return TRUE;
444     return FALSE;
445 }
446 
StringFalse(const char * name)447 Rboolean StringFalse(const char *name)
448 {
449     int i;
450     for (i = 0; falsenames[i]; i++)
451 	if (!strcmp(name, falsenames[i]))
452 	    return TRUE;
453     return FALSE;
454 }
455 
456 /* used in bind.c and options.c */
EnsureString(SEXP s)457 SEXP attribute_hidden EnsureString(SEXP s)
458 {
459     switch(TYPEOF(s)) {
460     case SYMSXP:
461 	s = PRINTNAME(s);
462 	break;
463     case STRSXP:
464 	s = STRING_ELT(s, 0);
465 	break;
466     case CHARSXP:
467 	break;
468     case NILSXP:
469 	s = R_BlankString;
470 	break;
471     default:
472 	error(_("invalid tag in name extraction"));
473     }
474     return s;
475 }
476 
477 // NB: have  checkArity(a,b) :=  Rf_checkArityCall(a,b,call)
Rf_checkArityCall(SEXP op,SEXP args,SEXP call)478 void Rf_checkArityCall(SEXP op, SEXP args, SEXP call)
479 {
480     if (PRIMARITY(op) >= 0 && PRIMARITY(op) != length(args)) {
481 	/* FIXME: ngettext reguires unsigned long, but %u would seem appropriate */
482 	if (PRIMINTERNAL(op))
483 	    error(ngettext("%d argument passed to .Internal(%s) which requires %d",
484 		     "%d arguments passed to .Internal(%s) which requires %d",
485 			   (unsigned long) length(args)),
486 		  length(args), PRIMNAME(op), PRIMARITY(op));
487 	else
488 	    errorcall(call,
489 		      ngettext("%d argument passed to '%s' which requires %d",
490 			       "%d arguments passed to '%s' which requires %d",
491 			       (unsigned long) length(args)),
492 		      length(args), PRIMNAME(op), PRIMARITY(op));
493     }
494 }
495 
Rf_check1arg(SEXP arg,SEXP call,const char * formal)496 void attribute_hidden Rf_check1arg(SEXP arg, SEXP call, const char *formal)
497 {
498     SEXP tag = TAG(arg);
499     if (tag == R_NilValue) return;
500     const char *supplied = CHAR(PRINTNAME(tag));
501     size_t ns = strlen(supplied);
502     if (ns > strlen(formal) || strncmp(supplied, formal, ns))
503 	errorcall(call, _("supplied argument name '%s' does not match '%s'"),
504 		  supplied, formal);
505 }
506 
507 
nthcdr(SEXP s,int n)508 SEXP nthcdr(SEXP s, int n)
509 {
510     if (isList(s) || isLanguage(s) || isFrame(s) || TYPEOF(s) == DOTSXP ) {
511 	while( n-- > 0 ) {
512 	    if (s == R_NilValue)
513 		error(_("'nthcdr' list shorter than %d"), n);
514 	    s = CDR(s);
515 	}
516 	return s;
517     }
518     else error(_("'nthcdr' needs a list to CDR down"));
519     return R_NilValue;/* for -Wall */
520 }
521 
522 /* Destructively removes R_NilValue ('NULL') elements from a pairlist. */
R_listCompact(SEXP s,Rboolean keep_initial)523 SEXP R_listCompact(SEXP s, Rboolean keep_initial) {
524     if(!keep_initial)
525     // skip initial NULL values
526 	while (s != R_NilValue && CAR(s) == R_NilValue)
527 	    s = CDR(s);
528 
529     SEXP val = s;
530     SEXP prev = s;
531     while (s != R_NilValue) {
532 	s = CDR(s);
533 	if (CAR(s) == R_NilValue) // skip it
534 	    SETCDR(prev, CDR(s));
535 	else
536 	    prev = s;
537     }
538     return val;
539 }
540 
541 
542 /* This is a primitive (with no arguments) */
do_nargs(SEXP call,SEXP op,SEXP args,SEXP rho)543 SEXP attribute_hidden do_nargs(SEXP call, SEXP op, SEXP args, SEXP rho)
544 {
545     RCNTXT *cptr;
546     int nargs = NA_INTEGER;
547 
548     checkArity(op, args);
549     for (cptr = R_GlobalContext; cptr != NULL; cptr = cptr->nextcontext) {
550 	if ((cptr->callflag & CTXT_FUNCTION) && cptr->cloenv == rho) {
551 	    nargs = length(cptr->promargs);
552 	    break;
553 	}
554     }
555     return ScalarInteger(nargs);
556 }
557 
558 
559 /* formerly used in subscript.c, in Utils.h */
setIVector(int * vec,int len,int val)560 void attribute_hidden setIVector(int * vec, int len, int val)
561 {
562     for (int i = 0; i < len; i++) vec[i] = val;
563 }
564 
565 
566 /* unused in R, in Utils.h, may have been used in Rcpp at some point,
567       but not any more (as per Nov. 2018)  */
setRVector(double * vec,int len,double val)568 void attribute_hidden setRVector(double * vec, int len, double val)
569 {
570     for (int i = 0; i < len; i++) vec[i] = val;
571 }
572 
573 /* unused in R, in Rinternals.h */
setSVector(SEXP * vec,int len,SEXP val)574 void setSVector(SEXP * vec, int len, SEXP val)
575 {
576     for (int i = 0; i < len; i++) vec[i] = val;
577 }
578 
579 
isFree(SEXP val)580 Rboolean isFree(SEXP val)
581 {
582     SEXP t;
583     for (t = R_FreeSEXP; t != R_NilValue; t = CAR(t))
584 	if (val == t)
585 	    return TRUE;
586     return FALSE;
587 }
588 
589 
590 /* Debugging functions (hence the d-prefix). */
591 /* These are intended to be called interactively from */
592 /* a debugger such as gdb, so you don't have to remember */
593 /* the names of the data structure components. */
594 
dtype(SEXP q)595 int dtype(SEXP q)
596 {
597     return((int)TYPEOF(q));
598 }
599 
600 
dcar(SEXP l)601 SEXP dcar(SEXP l)
602 {
603     return(CAR(l));
604 }
605 
606 
dcdr(SEXP l)607 SEXP dcdr(SEXP l)
608 {
609     return(CDR(l));
610 }
611 
612 
isort_with_index(int * x,int * indx,int n)613 static void isort_with_index(int *x, int *indx, int n)
614 {
615     int i, j, h, iv, v;
616 
617     for (h = 1; h <= n / 9; h = 3 * h + 1);
618     for (; h > 0; h /= 3)
619 	for (i = h; i < n; i++) {
620 	    v = x[i]; iv = indx[i];
621 	    j = i;
622 	    while (j >= h && x[j - h] > v)
623 		 { x[j] = x[j - h]; indx[j] = indx[j-h]; j -= h; }
624 	    x[j] = v; indx[j] = iv;
625 	}
626 }
627 
628 
629 // body(x) without attributes "srcref", "srcfile", "wholeSrcref" :
630 // NOTE: Callers typically need  PROTECT(R_body_no_src(.))
R_body_no_src(SEXP x)631 SEXP R_body_no_src(SEXP x) {
632     SEXP b = PROTECT(duplicate(BODY_EXPR(x)));
633     /* R's removeSource() works *recursively* on the body()
634        in  ../library/utils/R/sourceutils.R  but that seems unneeded (?) */
635     setAttrib(b, R_SrcrefSymbol, R_NilValue);
636     setAttrib(b, R_SrcfileSymbol, R_NilValue);
637     setAttrib(b, R_WholeSrcrefSymbol, R_NilValue);
638     UNPROTECT(1);
639     return b;
640 }
641 
642 /* merge(xinds, yinds, all.x, all.y) */
643 /* xinds, yinds are along x and y rows matching into the (numeric)
644    common indices, with 0 for non-matches.
645 
646    all.x and all.y are boolean.
647 
648    The return value is a list with 4 elements (xi, yi, x.alone, y.alone),
649    which are index vectors for rows of x or y.
650 */
do_merge(SEXP call,SEXP op,SEXP args,SEXP rho)651 SEXP attribute_hidden do_merge(SEXP call, SEXP op, SEXP args, SEXP rho)
652 {
653     SEXP xi, yi, ansx, ansy, ans;
654     int nx = 0, ny = 0, i, j, k, nx_lone = 0, ny_lone = 0;
655     int all_x = 0, all_y = 0, ll = 0/* "= 0" : for -Wall */;
656     int nnx, nny;
657 
658     checkArity(op, args);
659     xi = CAR(args);
660     // NB: long vectors are not supported for input
661     if ( !isInteger(xi) || !(nx = LENGTH(xi)) )
662 	error(_("invalid '%s' argument"), "xinds");
663     yi = CADR(args);
664     if ( !isInteger(yi) || !(ny = LENGTH(yi)) )
665 	error(_("invalid '%s' argument"), "yinds");
666     if(!LENGTH(ans = CADDR(args)) || NA_LOGICAL == (all_x = asLogical(ans)))
667 	error(_("'all.x' must be TRUE or FALSE"));
668     if(!LENGTH(ans = CADDDR(args))|| NA_LOGICAL == (all_y = asLogical(ans)))
669 	error(_("'all.y' must be TRUE or FALSE"));
670 
671     /* 0. sort the indices */
672     int *ix = (int *) R_alloc((size_t) nx, sizeof(int));
673     int *iy = (int *) R_alloc((size_t) ny, sizeof(int));
674     for(i = 0; i < nx; i++) ix[i] = i+1;
675     for(i = 0; i < ny; i++) iy[i] = i+1;
676     isort_with_index(INTEGER(xi), ix, nx);
677     isort_with_index(INTEGER(yi), iy, ny);
678 
679     /* 1. determine result sizes */
680     for (i = 0; i < nx; i++)
681 	if (INTEGER(xi)[i] > 0) break;
682     nx_lone = i;
683     for (i = 0; i < ny; i++)
684 	if (INTEGER(yi)[i] > 0) break;
685     ny_lone = i;
686     double dnans = 0;
687     for (i = nx_lone, j = ny_lone; i < nx; i = nnx, j = nny) {
688 	int tmp = INTEGER(xi)[i];
689 	for(nnx = i; nnx < nx; nnx++) if(INTEGER(xi)[nnx] != tmp) break;
690 	/* the next is not in theory necessary,
691 	   since we have the common values only */
692 	for(; j < ny; j++) if(INTEGER(yi)[j] >= tmp) break;
693 	for(nny = j; nny < ny; nny++) if(INTEGER(yi)[nny] != tmp) break;
694 	/* printf("i %d nnx %d j %d nny %d\n", i, nnx, j, nny); */
695 	dnans += ((double)(nnx-i))*(nny-j);
696     }
697     if (dnans > R_XLEN_T_MAX)
698 	error(_("number of rows in the result exceeds maximum vector length"));
699     R_xlen_t nans = (int) dnans;
700 
701 
702     /* 2. allocate and store result components */
703 
704     const char *nms[] = {"xi", "yi", "x.alone", "y.alone", ""};
705     ans = PROTECT(mkNamed(VECSXP, nms));
706     ansx = allocVector(INTSXP, nans);    SET_VECTOR_ELT(ans, 0, ansx);
707     ansy = allocVector(INTSXP, nans);    SET_VECTOR_ELT(ans, 1, ansy);
708 
709     if(all_x) {
710 	SEXP x_lone = allocVector(INTSXP, nx_lone);
711 	SET_VECTOR_ELT(ans, 2, x_lone);
712 	for (i = 0, ll = 0; i < nx_lone; i++)
713 	    INTEGER(x_lone)[ll++] = ix[i];
714     }
715 
716     if(all_y) {
717 	SEXP y_lone = allocVector(INTSXP, ny_lone);
718 	SET_VECTOR_ELT(ans, 3, y_lone);
719 	for (i = 0, ll = 0; i < ny_lone; i++)
720 	    INTEGER(y_lone)[ll++] = iy[i];
721     }
722 
723     for (i = nx_lone, j = ny_lone, k = 0; i < nx; i = nnx, j = nny) {
724 	int tmp = INTEGER(xi)[i];
725 	for(nnx = i; nnx < nx; nnx++) if(INTEGER(xi)[nnx] != tmp) break;
726 	for(; j < ny; j++) if(INTEGER(yi)[j] >= tmp) break;
727 	for(nny = j; nny < ny; nny++) if(INTEGER(yi)[nny] != tmp) break;
728 	for(int i0 = i; i0 < nnx; i0++)
729 	    for(int j0 = j; j0 < nny; j0++) {
730 		INTEGER(ansx)[k]   = ix[i0];
731 		INTEGER(ansy)[k++] = iy[j0];
732 	    }
733     }
734 
735     UNPROTECT(1);
736     return ans;
737 }
738 
739 
740 /* Functions for getting and setting the working directory. */
741 #ifdef Win32
742 # define WIN32_LEAN_AND_MEAN 1
743 # include <windows.h>
744 #endif
745 
intern_getwd(void)746 SEXP static intern_getwd(void)
747 {
748     SEXP rval = R_NilValue;
749     char buf[4*PATH_MAX+1];
750 
751 #ifdef Win32
752     {
753 	wchar_t wbuf[PATH_MAX+1];
754 	int res = GetCurrentDirectoryW(PATH_MAX, wbuf);
755 	if(res > 0) {
756 	    wcstoutf8(buf, wbuf, sizeof(buf));
757 	    R_UTF8fixslash(buf);
758 	    PROTECT(rval = allocVector(STRSXP, 1));
759 	    SET_STRING_ELT(rval, 0, mkCharCE(buf, CE_UTF8));
760 	    UNPROTECT(1);
761 	}
762     }
763 #else
764     char *res = getcwd(buf, PATH_MAX); /* can return NULL */
765     if(res) rval = mkString(buf);
766 #endif
767     return(rval);
768 }
769 
do_getwd(SEXP call,SEXP op,SEXP args,SEXP rho)770 SEXP attribute_hidden do_getwd(SEXP call, SEXP op, SEXP args, SEXP rho)
771 {
772     checkArity(op, args);
773 
774     return(intern_getwd());
775 }
776 
777 
778 #if defined(Win32) && defined(_MSC_VER)
779 # include <direct.h> /* for chdir, via io.h */
780 #endif
781 
do_setwd(SEXP call,SEXP op,SEXP args,SEXP rho)782 SEXP attribute_hidden do_setwd(SEXP call, SEXP op, SEXP args, SEXP rho)
783 {
784     SEXP s = R_NilValue, wd = R_NilValue;	/* -Wall */
785 
786     checkArity(op, args);
787     if (!isPairList(args) || !isValidString(s = CAR(args)))
788 	error(_("character argument expected"));
789     if (STRING_ELT(s, 0) == NA_STRING)
790 	error(_("missing value is invalid"));
791 
792     /* get current directory to return */
793     PROTECT(wd = intern_getwd());
794 
795 #ifdef Win32
796     {
797 	const wchar_t *path = filenameToWchar(STRING_ELT(s, 0), TRUE);
798 	if(_wchdir(path) < 0)
799 	    error(_("cannot change working directory"));
800     }
801 #else
802     {
803 	const char *path
804 	    = R_ExpandFileName(translateCharFP(STRING_ELT(s, 0)));
805     if(chdir(path) < 0)
806 	error(_("cannot change working directory"));
807     }
808 #endif
809     UNPROTECT(1); /* wd */
810     return(wd);
811 }
812 
813 /* remove portion of path before file separator if one exists */
814 
815 #ifdef Win32
do_basename(SEXP call,SEXP op,SEXP args,SEXP rho)816 SEXP attribute_hidden do_basename(SEXP call, SEXP op, SEXP args, SEXP rho)
817 {
818     SEXP ans, s = R_NilValue;	/* -Wall */
819     char sp[4*PATH_MAX+1];
820     wchar_t  buf[PATH_MAX], *p;
821     const wchar_t *pp;
822     int i, n;
823 
824     checkArity(op, args);
825     if (TYPEOF(s = CAR(args)) != STRSXP)
826 	error(_("a character vector argument expected"));
827     PROTECT(ans = allocVector(STRSXP, n = LENGTH(s)));
828     for(i = 0; i < n; i++) {
829 	if (STRING_ELT(s, i) == NA_STRING)
830 	    SET_STRING_ELT(ans, i, NA_STRING);
831 	else {
832 	    pp = filenameToWchar(STRING_ELT(s, i), TRUE);
833 	    if (wcslen(pp) > PATH_MAX - 1) error(_("path too long"));
834 	    wcscpy(buf, pp);
835 	    R_wfixslash(buf);
836 	    /* remove trailing file separator(s) */
837 	    if (*buf) {
838 		p = buf + wcslen(buf) - 1;
839 		while (p >= buf && *p == L'/') *(p--) = L'\0';
840 	    }
841 	    if ((p = wcsrchr(buf, L'/'))) p++; else p = buf;
842 	    wcstoutf8(sp, p, sizeof(sp));
843 	    SET_STRING_ELT(ans, i, mkCharCE(sp, CE_UTF8));
844 	}
845     }
846     UNPROTECT(1);
847     return(ans);
848 }
849 #else
do_basename(SEXP call,SEXP op,SEXP args,SEXP rho)850 SEXP attribute_hidden do_basename(SEXP call, SEXP op, SEXP args, SEXP rho)
851 {
852     SEXP ans, s = R_NilValue;	/* -Wall */
853     char  buf[PATH_MAX], *p, fsp = FILESEP[0];
854     const char *pp;
855     int i, n;
856 
857     checkArity(op, args);
858     if (TYPEOF(s = CAR(args)) != STRSXP)
859 	error(_("a character vector argument expected"));
860     PROTECT(ans = allocVector(STRSXP, n = LENGTH(s)));
861     for(i = 0; i < n; i++) {
862 	if (STRING_ELT(s, i) == NA_STRING)
863 	    SET_STRING_ELT(ans, i, NA_STRING);
864 	else {
865 	    pp = R_ExpandFileName(translateCharFP(STRING_ELT(s, i)));
866 	    if (strlen(pp) > PATH_MAX - 1)
867 		error(_("path too long"));
868 	    strcpy (buf, pp);
869 	    if (*buf) {
870 		p = buf + strlen(buf) - 1;
871 		while (p >= buf && *p == fsp) *(p--) = '\0';
872 	    }
873 	    if ((p = Rf_strrchr(buf, fsp)))
874 		p++;
875 	    else
876 		p = buf;
877 	    SET_STRING_ELT(ans, i, mkChar(p));
878 	}
879     }
880     UNPROTECT(1);
881     return(ans);
882 }
883 #endif
884 
885 /* remove portion of path after last file separator if one exists, else
886    return "."
887    */
888 
889 #ifdef Win32
do_dirname(SEXP call,SEXP op,SEXP args,SEXP rho)890 SEXP attribute_hidden do_dirname(SEXP call, SEXP op, SEXP args, SEXP rho)
891 {
892     SEXP ans, s = R_NilValue;	/* -Wall */
893     wchar_t buf[PATH_MAX], *p;
894     const wchar_t *pp;
895     char sp[4*PATH_MAX+1];
896     int i, n;
897 
898     checkArity(op, args);
899     if (TYPEOF(s = CAR(args)) != STRSXP)
900 	error(_("a character vector argument expected"));
901     PROTECT(ans = allocVector(STRSXP, n = LENGTH(s)));
902     for(i = 0; i < n; i++) {
903 	if (STRING_ELT(s, i) == NA_STRING)
904 	    SET_STRING_ELT(ans, i, NA_STRING);
905 	else {
906 	    memset(sp, 0, 4*PATH_MAX);
907 	    pp = filenameToWchar(STRING_ELT(s, i), TRUE);
908 	    if (wcslen(pp) > PATH_MAX - 1)
909 		error(_("path too long"));
910 	    if (wcslen(pp)) {
911 		wcscpy (buf, pp);
912 		R_wfixslash(buf);
913 		/* remove trailing file separator(s) */
914 		p = buf + wcslen(buf) - 1;
915 		while (p > buf && *p == L'/'
916 		       && (p > buf+2 || *(p-1) != L':')) *p-- = L'\0';
917 		p = wcsrchr(buf, L'/');
918 		if(p == NULL) wcscpy(buf, L".");
919 		else {
920 		    while(p > buf && *p == L'/'
921 			  /* this covers both drives and network shares */
922 			  && (p > buf+2 || *(p-1) != L':')) --p;
923 		    p[1] = L'\0';
924 		}
925 		wcstoutf8(sp, buf, sizeof(sp));
926 	    }
927 	    SET_STRING_ELT(ans, i, mkCharCE(sp, CE_UTF8));
928 	}
929     }
930     UNPROTECT(1);
931     return(ans);
932 }
933 #else
do_dirname(SEXP call,SEXP op,SEXP args,SEXP rho)934 SEXP attribute_hidden do_dirname(SEXP call, SEXP op, SEXP args, SEXP rho)
935 {
936     SEXP ans, s = R_NilValue;	/* -Wall */
937     char buf[PATH_MAX], *p, fsp = FILESEP[0];
938     const char *pp;
939     int i, n;
940 
941     checkArity(op, args);
942     if (TYPEOF(s = CAR(args)) != STRSXP)
943 	error(_("a character vector argument expected"));
944     PROTECT(ans = allocVector(STRSXP, n = LENGTH(s)));
945     for(i = 0; i < n; i++) {
946 	if (STRING_ELT(s, i) == NA_STRING)
947 	    SET_STRING_ELT(ans, i, NA_STRING);
948 	else {
949 	    pp = R_ExpandFileName(translateCharFP(STRING_ELT(s, i)));
950 	    if (strlen(pp) > PATH_MAX - 1)
951 		error(_("path too long"));
952 	    size_t ll = strlen(pp);
953 	    if (ll) { // svMisc calls this with ""
954 		strcpy (buf, pp);
955 		/* remove trailing file separator(s) */
956 		while ( *(p = buf + ll - 1) == fsp  && p > buf) *p = '\0';
957 		p = Rf_strrchr(buf, fsp);
958 		if(p == NULL)
959 		    strcpy(buf, ".");
960 		else {
961 		    while(p > buf && *p == fsp) --p;
962 		    p[1] = '\0';
963 		}
964 	    } else buf[0] = '\0';
965 	    SET_STRING_ELT(ans, i, mkChar(buf));
966 	}
967     }
968     UNPROTECT(1);
969     return(ans);
970 }
971 #endif
972 
973 
974 #ifndef Win32 /* Windows version is in src/gnuwin32/extra.c */
975 #ifndef HAVE_DECL_REALPATH
976 extern char *realpath(const char *path, char *resolved_path);
977 #endif
978 
do_normalizepath(SEXP call,SEXP op,SEXP args,SEXP rho)979 SEXP attribute_hidden do_normalizepath(SEXP call, SEXP op, SEXP args, SEXP rho)
980 {
981     SEXP ans, paths = CAR(args), elp;
982     int i, n = LENGTH(paths);
983     const char *path;
984     char abspath[PATH_MAX+1];
985 
986     checkArity(op, args);
987     if (!isString(paths))
988 	error(_("'path' must be a character vector"));
989 
990     int mustWork = asLogical(CADDR(args)); /* 1, NA_LOGICAL or 0 */
991 
992 /* Does any platform not have this? */
993 #ifdef HAVE_REALPATH
994     PROTECT(ans = allocVector(STRSXP, n));
995     for (i = 0; i < n; i++) {
996 	elp = STRING_ELT(paths, i);
997 	if (elp == NA_STRING) {
998 	    SET_STRING_ELT(ans, i, NA_STRING);
999 	    if (mustWork == 1)
1000 		error("path[%d]=NA", i+1);
1001 	    else if (mustWork == NA_LOGICAL)
1002 		warning("path[%d]=NA", i+1);
1003 	    continue;
1004 	}
1005 	path = translateCharFP2(elp);
1006 	if (path) {
1007 	    char *res = realpath(path, abspath);
1008 	    if (res)
1009 		SET_STRING_ELT(ans, i, mkChar(abspath));
1010 	    else {
1011 		SET_STRING_ELT(ans, i, elp);
1012 		/* and report the problem */
1013 		if (mustWork == 1)
1014 		    error("path[%d]=\"%s\": %s", i+1, path, strerror(errno));
1015 		else if (mustWork == NA_LOGICAL)
1016 		    warning("path[%d]=\"%s\": %s", i+1, path, strerror(errno));
1017 	    }
1018 	}
1019 	else if (mustWork == 1) error("fatal translation error");
1020 	else SET_STRING_ELT(ans, i, elp);
1021     }
1022 #else
1023     Rboolean OK;
1024     warning("this platform does not have realpath so the results may not be canonical");
1025     PROTECT(ans = allocVector(STRSXP, n));
1026     for (i = 0; i < n; i++) {
1027 	elp = STRING_ELT(paths, i);
1028 	if (elp == NA_STRING) {
1029 	    SET_STRING_ELT(ans, i, NA_STRING);
1030 	    if (mustWork == 1)
1031 		error("path[%d]=NA", i+1);
1032 	    else if (mustWork == NA_LOGICAL)
1033 		warning("path[%d]=NA", i+1);
1034 	    continue;
1035 	}
1036 	path = translateCharFP(elp);
1037 	OK = strlen(path) <= PATH_MAX;
1038 	if (OK) {
1039 	    if (path[0] == '/') strncpy(abspath, path, PATH_MAX);
1040 	    else {
1041 		OK = getcwd(abspath, PATH_MAX) != NULL;
1042 		OK = OK && (strlen(path) + strlen(abspath) + 1 <= PATH_MAX);
1043 		if (OK) {strcat(abspath, "/"); strcat(abspath, path);}
1044 	    }
1045 	}
1046 	/* we need to check that this exists */
1047 	if (OK) OK = (access(abspath, 0 /* F_OK */) == 0);
1048 	if (OK) SET_STRING_ELT(ans, i, mkChar(abspath));
1049 	else {
1050 	    SET_STRING_ELT(ans, i, elp);
1051 	    /* and report the problem */
1052 	    if (mustWork == 1)
1053 		error("path[%d]=\"%s\": %s", i+1, path, strerror(errno));
1054 	    else if (mustWork == NA_LOGICAL)
1055 		warning("path[%d]=\"%s\": %s", i+1, path, strerror(errno));
1056 	}
1057     }
1058 #endif
1059     UNPROTECT(1);
1060     return ans;
1061 }
1062 
1063 #ifdef USE_INTERNAL_MKTIME
getTZinfo(void)1064 const char *getTZinfo(void)
1065 {
1066     static char def_tz[PATH_MAX+1] = "";
1067     if (def_tz[0]) return def_tz;
1068 
1069     // call Sys.timezone()
1070     SEXP expr = PROTECT(install("Sys.timezone"));
1071     SEXP call = PROTECT(lang1(expr));
1072     SEXP ans = PROTECT(eval(call, R_GlobalEnv));
1073     if(TYPEOF(ans) == STRSXP && LENGTH(ans) == 1) {
1074 	SEXP el = STRING_ELT(ans, 0);
1075 	if (el != NA_STRING) {
1076 	    strcpy(def_tz, CHAR(el));
1077 	    // printf("tz is %s\n", CHAR(el));
1078 	    UNPROTECT(3);
1079 	    return def_tz;
1080 	}
1081     }
1082     UNPROTECT(3);
1083     warning("system timezone name is unknown: set environment variable TZ");
1084     strcpy(def_tz, "unknown");  // code will then use TZDEFAULT, which is "UTC"
1085     return def_tz;
1086 }
1087 #endif
1088 
1089 #endif // not Win32
1090 
1091 
1092 #ifdef Win32
encode_cleanup(void * data)1093 static void encode_cleanup(void *data)
1094 {
1095     WinUTF8out = TRUE;
1096 }
1097 #endif
1098 
1099 /* encodeString(x, w, quote, justify) */
do_encodeString(SEXP call,SEXP op,SEXP args,SEXP rho)1100 SEXP attribute_hidden do_encodeString(SEXP call, SEXP op, SEXP args, SEXP rho)
1101 {
1102     SEXP ans, x, s;
1103     R_xlen_t i, len;
1104     int w, quote = 0, justify, na;
1105     const char *cs;
1106     Rboolean findWidth;
1107 
1108     checkArity(op, args);
1109     if (TYPEOF(x = CAR(args)) != STRSXP)
1110 	error(_("a character vector argument expected"));
1111     if(isNull(CADR(args))) w = NA_INTEGER;
1112     else {
1113 	w = asInteger(CADR(args));
1114 	if(w != NA_INTEGER && w < 0)
1115 	    error(_("invalid '%s' value"), "width");
1116     }
1117     findWidth = (w == NA_INTEGER);
1118     s = CADDR(args);
1119     if(LENGTH(s) != 1 || TYPEOF(s) != STRSXP)
1120 	error(_("invalid '%s' value"), "quote");
1121     cs = translateChar(STRING_ELT(s, 0));
1122     if(strlen(cs) > 0) quote = cs[0];
1123     if(strlen(cs) > 1)
1124 	warning(_("only the first character of 'quote' will be used"));
1125     justify = asInteger(CADDDR(args));
1126     if(justify == NA_INTEGER || justify < 0 || justify > 3)
1127 	error(_("invalid '%s' value"), "justify");
1128     if(justify == 3) w = 0;
1129     na = asLogical(CAD4R(args));
1130     if(na == NA_LOGICAL) error(_("invalid '%s' value"), "na.encode");
1131 
1132     len = XLENGTH(x);
1133     if(findWidth && justify < 3) {
1134 	w  = 0;
1135 	for(i = 0; i < len; i++) {
1136 	    s = STRING_ELT(x, i);
1137 	    if(na || s != NA_STRING)
1138 		w = R_imax2(w, Rstrlen(s, quote));
1139 	}
1140 	if(quote) w +=2; /* for surrounding quotes */
1141     }
1142     PROTECT(ans = duplicate(x));
1143 #ifdef Win32
1144     RCNTXT cntxt;
1145     Rboolean havecontext = FALSE;
1146     /* do_encodeString is not printing, but returning a string, it therefore
1147        must not produce Rgui escapes (do_encodeString may get called as part
1148        of print dispatch with WinUTF8out being already set to TRUE). */
1149     if (WinUTF8out) {
1150 	begincontext(&cntxt, CTXT_CCODE, R_NilValue, R_BaseEnv, R_BaseEnv,
1151 		     R_NilValue, R_NilValue);
1152 	cntxt.cend = &encode_cleanup;
1153 	havecontext = TRUE;
1154 	WinUTF8out = FALSE;
1155     }
1156 #endif
1157     for(i = 0; i < len; i++) {
1158 	s = STRING_ELT(x, i);
1159 	if(na || s != NA_STRING) {
1160 	    cetype_t ienc = getCharCE(s);
1161 	    if(ienc == CE_UTF8) {
1162 		const char *ss = EncodeString(s, w-1000000, quote,
1163 					      (Rprt_adj) justify);
1164 		SET_STRING_ELT(ans, i, mkCharCE(ss, ienc));
1165 	    } else {
1166 		const char *ss = EncodeString(s, w, quote, (Rprt_adj) justify);
1167 		SET_STRING_ELT(ans, i, mkChar(ss));
1168 	    }
1169 	}
1170     }
1171 #ifdef Win32
1172     if (havecontext) {
1173 	encode_cleanup(NULL);
1174 	endcontext(&cntxt);
1175     }
1176 #endif
1177     UNPROTECT(1);
1178     return ans;
1179 }
1180 
do_encoding(SEXP call,SEXP op,SEXP args,SEXP rho)1181 SEXP attribute_hidden do_encoding(SEXP call, SEXP op, SEXP args, SEXP rho)
1182 {
1183     SEXP ans, x;
1184     R_xlen_t i, n;
1185     char *tmp;
1186 
1187     checkArity(op, args);
1188     if (TYPEOF(x = CAR(args)) != STRSXP)
1189 	error(_("a character vector argument expected"));
1190     n = XLENGTH(x);
1191     PROTECT(ans = allocVector(STRSXP, n));
1192     for (i = 0; i < n; i++) {
1193 	if(IS_BYTES(STRING_ELT(x, i))) tmp = "bytes";
1194 	else if(IS_LATIN1(STRING_ELT(x, i))) tmp = "latin1";
1195 	else if(IS_UTF8(STRING_ELT(x, i))) tmp = "UTF-8";
1196 	else tmp = "unknown";
1197 	SET_STRING_ELT(ans, i, mkChar(tmp));
1198     }
1199     UNPROTECT(1);
1200     return ans;
1201 }
1202 
1203 #define IS_NATIVE(tmp) \
1204     (! IS_LATIN1(tmp) && ! IS_UTF8(tmp) && ! IS_BYTES(tmp))
1205 
do_setencoding(SEXP call,SEXP op,SEXP args,SEXP rho)1206 SEXP attribute_hidden do_setencoding(SEXP call, SEXP op, SEXP args, SEXP rho)
1207 {
1208     SEXP x, enc, tmp;
1209     int m;
1210     R_xlen_t i, n;
1211     const char *this;
1212 
1213     checkArity(op, args);
1214     if (TYPEOF(x = CAR(args)) != STRSXP)
1215 	error(_("a character vector argument expected"));
1216     if (TYPEOF(enc = CADR(args)) != STRSXP)
1217 	error(_("a character vector 'value' expected"));
1218     m = LENGTH(enc);
1219     if(m == 0)
1220 	error(_("'value' must be of positive length"));
1221     if(MAYBE_REFERENCED(x)) x = duplicate(x);
1222     PROTECT(x);
1223     n = XLENGTH(x);
1224     for(i = 0; i < n; i++) {
1225 	cetype_t ienc = CE_NATIVE;
1226 	this = CHAR(STRING_ELT(enc, i % m)); /* ASCII */
1227 	if(streql(this, "latin1")) ienc = CE_LATIN1;
1228 	else if(streql(this, "UTF-8")) ienc = CE_UTF8;
1229 	else if(streql(this, "bytes")) ienc = CE_BYTES;
1230 	tmp = STRING_ELT(x, i);
1231 	if(tmp == NA_STRING) continue;
1232 	if (! ((ienc == CE_LATIN1 && IS_LATIN1(tmp)) ||
1233 	       (ienc == CE_UTF8   && IS_UTF8(tmp))   ||
1234 	       (ienc == CE_BYTES  && IS_BYTES(tmp))  ||
1235 	       (ienc == CE_NATIVE && IS_NATIVE(tmp))))
1236 	    SET_STRING_ELT(x, i, mkCharLenCE(CHAR(tmp), LENGTH(tmp), ienc));
1237     }
1238     UNPROTECT(1);
1239     return x;
1240 }
1241 
markKnown(const char * s,SEXP ref)1242 SEXP attribute_hidden markKnown(const char *s, SEXP ref)
1243 {
1244     int ienc = 0;
1245     if(ENC_KNOWN(ref)) {
1246 	if(known_to_be_latin1) ienc = CE_LATIN1;
1247 	if(known_to_be_utf8) ienc = CE_UTF8;
1248     }
1249     return mkCharCE(s, ienc);
1250 }
1251 
strIsASCII(const char * str)1252 Rboolean strIsASCII(const char *str)
1253 {
1254     const char *p;
1255     for(p = str; *p; p++)
1256 	if((unsigned int)*p > 0x7F) return FALSE;
1257     return TRUE;
1258 }
1259 
1260 /* Number of additional bytes */
1261 static const unsigned char utf8_table4[] = {
1262   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1263   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1264   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
1265   3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
1266 
utf8clen(char c)1267 int attribute_hidden utf8clen(char c)
1268 {
1269     /* This allows through 8-bit chars 10xxxxxx, which are invalid */
1270     if ((c & 0xc0) != 0xc0) return 1;
1271     return 1 + utf8_table4[c & 0x3f];
1272 }
1273 
1274 /* These are misnamed: they convert a single char */
1275 static R_wchar_t
utf16toucs(wchar_t high,wchar_t low)1276 utf16toucs(wchar_t high, wchar_t low)
1277 {
1278     return 0x10000 + ((int) (high & 0x3FF) << 10 ) + (int) (low & 0x3FF);
1279 }
1280 
1281 /* Return the low UTF-16 surrogate from a UTF-8 string; assumes all testing has been done. */
1282 static wchar_t
utf8toutf16low(const char * s)1283 utf8toutf16low(const char *s)
1284 {
1285     return (unsigned int) LOW_SURROGATE_START | ((s[2] & 0x0F) << 6) | (s[3] & 0x3F);
1286 }
1287 
1288 R_wchar_t attribute_hidden
utf8toucs32(wchar_t high,const char * s)1289 utf8toucs32(wchar_t high, const char *s)
1290 {
1291     return utf16toucs(high, utf8toutf16low(s));
1292 }
1293 
1294 /* These return the result in wchar_t.  If wchar_t is 16 bit (e.g. UTF-16LE on Windows)
1295    only the high surrogate is returned; call utf8toutf16low next. */
1296 size_t attribute_hidden
utf8toucs(wchar_t * wc,const char * s)1297 utf8toucs(wchar_t *wc, const char *s)
1298 {
1299     unsigned int byte;
1300     wchar_t local, *w;
1301     byte = *((unsigned char *)s);
1302     w = wc ? wc: &local;
1303 
1304     if (byte == 0) {
1305 	*w = (wchar_t) 0;
1306 	return 0;
1307     } else if (byte < 0xC0) {
1308 	*w = (wchar_t) byte;
1309 	return 1;
1310     } else if (byte < 0xE0) {
1311 	if(strlen(s) < 2) return (size_t)-2;
1312 	if ((s[1] & 0xC0) == 0x80) {
1313 	    *w = (wchar_t) (((byte & 0x1F) << 6) | (s[1] & 0x3F));
1314 	    return 2;
1315 	} else return (size_t)-1;
1316     } else if (byte < 0xF0) {
1317 	if(strlen(s) < 3) return (size_t)-2;
1318 	if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
1319 	    *w = (wchar_t) (((byte & 0x0F) << 12)
1320 			    | (unsigned int) ((s[1] & 0x3F) << 6)
1321 			    | (s[2] & 0x3F));
1322 	    byte = (unsigned int) *w;
1323 	    /* Surrogates range */
1324 	    if(byte >= 0xD800 && byte <= 0xDFFF) return (size_t)-1;
1325 	    if(byte == 0xFFFE || byte == 0xFFFF) return (size_t)-1;
1326 	    return 3;
1327 	} else return (size_t)-1;
1328 
1329     } else if (byte < 0xf8) {
1330 	if(strlen(s) < 4) return (size_t)-2;
1331 	if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80) && ((s[3] & 0xC0) == 0x80)) {
1332 	    unsigned int cvalue = (((byte & 0x0F) << 18)
1333 			| (unsigned int) ((s[1] & 0x3F) << 12)
1334 			| (unsigned int) ((s[2] & 0x3F) << 6)
1335 			| (s[3] & 0x3F));
1336 	    if(sizeof(wchar_t) < 4) /* Assume UTF-16 and return high surrogate.  Users need to call utf8toutf16low next. */
1337 		*w = (wchar_t) ((cvalue - 0x10000) >> 10) | 0xD800;
1338 	    else
1339 		*w = (wchar_t) cvalue;
1340 	    return 4;
1341 	} else return (size_t)-1;
1342     }
1343     if(sizeof(wchar_t) < 4) return (size_t)-2;
1344     /* So now handle 5.6 byte sequences with no testing */
1345     if (byte < 0xFC) {
1346 	if(strlen(s) < 5) return (size_t)-2;
1347 	*w = (wchar_t) (((byte & 0x0F) << 24)
1348 			| (unsigned int) ((s[1] & 0x3F) << 12)
1349 			| (unsigned int) ((s[2] & 0x3F) << 12)
1350 			| (unsigned int) ((s[3] & 0x3F) << 6)
1351 			| (s[4] & 0x3F));
1352 	return 5;
1353     } else {
1354 	if(strlen(s) < 6) return (size_t)-2;
1355 	*w = (wchar_t) (((byte & 0x0F) << 30)
1356 			| (unsigned int) ((s[1] & 0x3F) << 24)
1357 			| (unsigned int) ((s[2] & 0x3F) << 18)
1358 			| (unsigned int) ((s[3] & 0x3F) << 12)
1359 			| (unsigned int) ((s[4] & 0x3F) << 6)
1360 			| (s[5] & 0x3F));
1361 	return 6;
1362     }
1363 }
1364 
1365 /* despite its name this translates to UTF-16 if there are (invalid)
1366  * UTF-8 codings for surrogates in the input */
1367 size_t
utf8towcs(wchar_t * wc,const char * s,size_t n)1368 utf8towcs(wchar_t *wc, const char *s, size_t n)
1369 {
1370     ssize_t m, res = 0;
1371     const char *t;
1372     wchar_t *p;
1373     wchar_t local;
1374 
1375     if(wc)
1376 	for(p = wc, t = s; ; p++, t += m) {
1377 	    m  = (ssize_t) utf8toucs(p, t);
1378 	    if (m < 0) error(_("invalid input '%s' in 'utf8towcs'"), s);
1379 	    if (m == 0) break;
1380 	    res ++;
1381 	    if (res >= n) break;
1382 	    if (IS_HIGH_SURROGATE(*p)) {
1383 		*(++p) = utf8toutf16low(t);
1384 		res ++;
1385 		if (res >= n) break;
1386 	    }
1387 	}
1388     else
1389 	for(t = s; ; t += m) {
1390 	    m  = (ssize_t) utf8toucs(&local, t);
1391 	    if (m < 0) error(_("invalid input '%s' in 'utf8towcs'"), s);
1392 	    if (m == 0) break;
1393 	    res ++;
1394 	    if (IS_HIGH_SURROGATE(local))
1395 		res ++;
1396 	}
1397     return (size_t) res;
1398 }
1399 
1400 size_t
utf8towcs4(R_wchar_t * wc,const char * s,size_t n)1401 utf8towcs4(R_wchar_t *wc, const char *s, size_t n)
1402 {
1403     ssize_t m, res = 0;
1404     const char *t;
1405     R_wchar_t *p;
1406 
1407     if(wc)
1408 	for(p = wc, t = s; ; p++, t += m) {
1409 	    // FIXME this gives a warning on Windows.
1410 	    m  = (ssize_t) utf8toucs(p, t);
1411 	    if (m < 0) error(_("invalid input '%s' in 'utf8towcs32'"), s);
1412 	    if (m == 0) break;
1413 	    if (IS_HIGH_SURROGATE(*p)) *p = utf8toucs32(*p, s);
1414 	    res ++;
1415 	    if (res >= n) break;
1416 	}
1417     else
1418 	for(t = s; ; t += m) {
1419 	    wchar_t local;
1420 	    m  = (ssize_t) utf8toucs(&local, t);
1421 	    if (m < 0) error(_("invalid input '%s' in 'utf8towcs32'"), s);
1422 	    if (m == 0) break;
1423 	    res ++;
1424 	}
1425     return (size_t) res;
1426 }
1427 
1428 /* based on pcre.c */
1429 static const unsigned int utf8_table1[] =
1430   { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
1431 static const unsigned int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
1432 
1433 /* s is NULL, or it contains at least n bytes.  Just write a
1434    terminator if it's not big enough.
1435 
1436    Strangely named: converts from UCS-4 to UTF-8.
1437 */
1438 
Rwcrtomb32(char * s,R_wchar_t cvalue,size_t n)1439 static size_t Rwcrtomb32(char *s, R_wchar_t cvalue, size_t n)
1440 {
1441     register size_t i, j;
1442     if (!n) return 0;
1443     if (s) *s = 0;    /* Simplifies exit later */
1444     if(cvalue == 0) return 0;
1445     for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
1446 	if (cvalue <= utf8_table1[i]) break;
1447     if (i >= n - 1) return 0;  /* need space for terminal null */
1448     if (s) {
1449 	s += i;
1450 	for (j = i; j > 0; j--) {
1451 	    *s-- = (char) (0x80 | (cvalue & 0x3f));
1452 	    cvalue >>= 6;
1453 	}
1454 	*s = (char) (utf8_table2[i] | cvalue);
1455     }
1456     return i + 1;
1457 }
1458 
1459 /* On input, wc is a wide string encoded in UTF-16 or UCS-2 or UCS-4.
1460 
1461    s can be a buffer of size n >= 0 chars, or NULL.  If n = 0 or s =
1462    NULL, nothing is written.
1463 
1464    The return value is the number of chars including the terminating
1465    null.  If the buffer is not big enough, the result is truncated but
1466    still null-terminated
1467 */
1468 attribute_hidden // but used in windlgs
wcstoutf8(char * s,const wchar_t * wc,size_t n)1469 size_t wcstoutf8(char *s, const wchar_t *wc, size_t n)
1470 {
1471     size_t m, res = 0;
1472     char *t;
1473     const wchar_t *p;
1474     if (!n) return 0;
1475     for(p = wc, t = s; ; p++) {
1476 	if (IS_SURROGATE_PAIR(*p, *(p+1))) {
1477 	    R_wchar_t cvalue =  ((*p & 0x3FF) << 10) + (*(p+1) & 0x3FF) + 0x010000;
1478 	    m = Rwcrtomb32(t, cvalue, n - res);
1479 	    p++;
1480 	} else {
1481 	    if (IS_HIGH_SURROGATE(*p) || IS_LOW_SURROGATE(*p))
1482 		warning("unpaired surrogate Unicode point %x", *p);
1483 	    m = Rwcrtomb32(t, (R_wchar_t)(*p), n - res);
1484 	}
1485 	if (!m) break;
1486 	res += m;
1487 	if (t)
1488 	    t += m;
1489     }
1490     return res + 1;
1491 }
1492 
1493 /* convert from R_wchar_t * (UCS-4) */
1494 attribute_hidden
wcs4toutf8(char * s,const R_wchar_t * wc,size_t n)1495 size_t wcs4toutf8(char *s, const R_wchar_t *wc, size_t n)
1496 {
1497     size_t m, res=0;
1498     char *t;
1499     const R_wchar_t *p;
1500     if (!n) return 0;
1501     for(p = wc, t = s; ; p++) {
1502 	m = Rwcrtomb32(t, (*p), n - res);
1503 	if (!m) break;
1504 	res += m;
1505 	if (t)
1506 	    t += m;
1507     }
1508     return res + 1;
1509 }
1510 
1511 /* A version that reports failure as an error */
Mbrtowc(wchar_t * wc,const char * s,size_t n,mbstate_t * ps)1512 size_t Mbrtowc(wchar_t *wc, const char *s, size_t n, mbstate_t *ps)
1513 {
1514     size_t used;
1515 
1516     if(n <= 0 || !*s) return (size_t)0;
1517     used = mbrtowc(wc, s, n, ps);
1518     if((int) used < 0) {
1519 	/* This gets called from the menu setup in RGui */
1520 	if (!R_Is_Running) return (size_t)-1;
1521 	/* let's try to print out a readable version */
1522 	R_CheckStack2(4*strlen(s) + 10);
1523 	char err[4*strlen(s) + 1], *q;
1524 	const char *p;
1525 	for(p = s, q = err; *p; ) {
1526 	    /* don't do the first to keep ps state straight */
1527 	    if(p > s) used = mbrtowc(NULL, p, n, ps);
1528 	    if(used == 0) break;
1529 	    else if((int) used > 0) {
1530 		memcpy(q, p, used);
1531 		p += used;
1532 		q += used;
1533 		n -= used;
1534 	    } else {
1535 		sprintf(q, "<%02x>", (unsigned char) *p++);
1536 		q += 4;
1537 		n--;
1538 	    }
1539 	}
1540 	*q = '\0';
1541 	error(_("invalid multibyte string at '%s'"), err);
1542     }
1543     return used;
1544 }
1545 
1546 /* Truncate a string in place (in native encoding) so that it only contains
1547    valid multi-byte characters. Has no effect in non-mbcs locales.
1548 
1549    This function may be invoked by the error handler via
1550    REvprintf->Rvsnprintf_mbcs.  Do not change it unless you are SURE that
1551    your changes are compatible with the error handling mechanism.
1552 
1553    REvprintf is also used in R_Suicide on Unix.
1554    */
1555 attribute_hidden
mbcsTruncateToValid(char * s)1556 char* mbcsTruncateToValid(char *s)
1557 {
1558     if (!mbcslocale || *s == '\0')
1559 	return s;
1560 
1561     mbstate_t mb_st;
1562     size_t slen = strlen(s); /* at least 1 */
1563     size_t goodlen = 0;
1564 
1565     mbs_init(&mb_st);
1566 
1567     if (utf8locale) {
1568 	/* UTF-8 is self-synchronizing so we can look back from the end
1569 	   for the first non-continuation byte */
1570 	goodlen = slen - 1; /* at least 0 */
1571 	/* for char == signed char we assume 2's complement representation */
1572 	while (goodlen && ((s[goodlen] & '\xC0') == '\x80'))
1573 	    --goodlen;
1574     }
1575     while(goodlen < slen) {
1576 	size_t res;
1577 	res = mbrtowc(NULL, s + goodlen, slen - goodlen, &mb_st);
1578 	if (res == (size_t) -1 || res == (size_t) -2) {
1579 	    /* strip off all remaining characters */
1580 	    for(;goodlen < slen; goodlen++)
1581 		s[goodlen] = '\0';
1582 	    return s;
1583 	}
1584 	goodlen += res;
1585     }
1586     return s;
1587 }
1588 
1589 attribute_hidden
mbcsValid(const char * str)1590 Rboolean mbcsValid(const char *str)
1591 {
1592     return  ((int)mbstowcs(NULL, str, 0) >= 0);
1593 }
1594 
1595 
1596 /* used in src/library/grDevices/src/cairo/cairoFns.c */
1597 #include "valid_utf8.h"
utf8Valid(const char * str)1598 Rboolean utf8Valid(const char *str)
1599 {
1600     return valid_utf8(str, strlen(str)) == 0;
1601 }
1602 
do_validUTF8(SEXP call,SEXP op,SEXP args,SEXP rho)1603 SEXP attribute_hidden do_validUTF8(SEXP call, SEXP op, SEXP args, SEXP rho)
1604 {
1605     checkArity(op, args);
1606     SEXP x = CAR(args);
1607     if (!isString(x))
1608 	error(_("invalid '%s' argument"), "x");
1609     R_xlen_t n = XLENGTH(x);
1610     SEXP ans = allocVector(LGLSXP, n); // no allocation below
1611     int *lans = LOGICAL(ans);
1612     for (R_xlen_t i = 0; i < n; i++)
1613 	lans[i] = utf8Valid(CHAR(STRING_ELT(x, i)));
1614     return ans;
1615 }
1616 
do_validEnc(SEXP call,SEXP op,SEXP args,SEXP rho)1617 SEXP attribute_hidden do_validEnc(SEXP call, SEXP op, SEXP args, SEXP rho)
1618 {
1619     checkArity(op, args);
1620     SEXP x = CAR(args);
1621     if (!isString(x))
1622 	error(_("invalid '%s' argument"), "x");
1623     R_xlen_t n = XLENGTH(x);
1624     SEXP ans = allocVector(LGLSXP, n); // no allocation below
1625     int *lans = LOGICAL(ans);
1626     for (R_xlen_t i = 0; i < n; i++) {
1627 	SEXP p = STRING_ELT(x, i);
1628 	if (IS_BYTES(p) || IS_LATIN1(p)) lans[i] = 1;
1629 	else if (IS_UTF8(p) || utf8locale) lans[i] = utf8Valid(CHAR(p));
1630 	else if(mbcslocale) lans[i] = mbcsValid(CHAR(p));
1631 	else lans[i] = 1;
1632     }
1633     return ans;
1634 }
1635 
1636 
1637 /* MBCS-aware versions of common comparisons.  Only used for ASCII c */
Rf_strchr(const char * s,int c)1638 char *Rf_strchr(const char *s, int c)
1639 {
1640     char *p = (char *)s;
1641     mbstate_t mb_st;
1642     size_t used;
1643 
1644     if(!mbcslocale || utf8locale) return strchr(s, c);
1645     mbs_init(&mb_st);
1646     while( (used = Mbrtowc(NULL, p, R_MB_CUR_MAX, &mb_st)) ) {
1647 	if(*p == c) return p;
1648 	p += used;
1649     }
1650     return (char *)NULL;
1651 }
1652 
Rf_strrchr(const char * s,int c)1653 char *Rf_strrchr(const char *s, int c)
1654 {
1655     char *p = (char *)s, *plast = NULL;
1656     mbstate_t mb_st;
1657     size_t used;
1658 
1659     if(!mbcslocale || utf8locale) return strrchr(s, c);
1660     mbs_init(&mb_st);
1661     while( (used = Mbrtowc(NULL, p, R_MB_CUR_MAX, &mb_st)) ) {
1662 	if(*p == c) plast = p;
1663 	p += used;
1664     }
1665     return plast;
1666 }
1667 
1668 #ifdef Win32
R_fixslash(char * s)1669 void R_fixslash(char *s)
1670 {
1671     char *p = s;
1672 
1673     if(mbcslocale) {
1674 	mbstate_t mb_st; int used;
1675 	mbs_init(&mb_st);
1676 	while((used = Mbrtowc(NULL, p, R_MB_CUR_MAX, &mb_st))) {
1677 	    if(*p == '\\') *p = '/';
1678 	    p += used;
1679 	}
1680     } else
1681 	for (; *p; p++) if (*p == '\\') *p = '/';
1682     /* preserve network shares */
1683     if(s[0] == '/' && s[1] == '/') s[0] = s[1] = '\\';
1684 }
1685 
R_UTF8fixslash(char * s)1686 void R_UTF8fixslash(char *s)
1687 {
1688     char *p = s;
1689 
1690 	for (; *p; p++) if (*p == '\\') *p = '/';
1691 	/* preserve network shares */
1692 	if(s[0] == '/' && s[1] == '/') s[0] = s[1] = '\\';
1693 }
1694 
R_wfixslash(wchar_t * s)1695 static void R_wfixslash(wchar_t *s)
1696 {
1697     wchar_t *p = s;
1698 
1699     for (; *p; p++) if (*p == L'\\') *p = L'/';
1700     /* preserve network shares */
1701     if(s[0] == L'/' && s[1] == L'/') s[0] = s[1] = L'\\';
1702 }
1703 
1704 
R_fixbackslash(char * s)1705 void R_fixbackslash(char *s)
1706 {
1707     char *p = s;
1708 
1709     if(mbcslocale) {
1710 	mbstate_t mb_st; int used;
1711 	mbs_init(&mb_st);
1712 	while((used = Mbrtowc(NULL, p, R_MB_CUR_MAX, &mb_st))) {
1713 	    if(*p == '/') *p = '\\';
1714 	    p += used;
1715 	}
1716     } else
1717 	for (; *p; p++) if (*p == '/') *p = '\\';
1718 }
1719 #endif
1720 
1721 #if defined FC_LEN_T
F77_SYMBOL(rexitc)1722 void NORET F77_SYMBOL(rexitc)(char *msg, int *nchar, FC_LEN_T msg_len)
1723 #else
1724 void NORET F77_SYMBOL(rexitc)(char *msg, int *nchar)
1725 #endif
1726 {
1727     int nc = *nchar;
1728     char buf[256];
1729     if(nc > 255) {
1730 	warning(_("error message truncated to 255 chars"));
1731 	nc = 255;
1732     }
1733     strncpy(buf, msg, (size_t) nc);
1734     buf[nc] = '\0';
1735     mbcsTruncateToValid(buf);
1736     error("%s", buf);
1737 }
1738 
1739 #if defined FC_LEN_T
F77_SYMBOL(rwarnc)1740 void F77_SYMBOL(rwarnc)(char *msg, int *nchar, FC_LEN_T msg_len)
1741 #else
1742 void F77_SYMBOL(rwarnc)(char *msg, int *nchar)
1743 #endif
1744 {
1745     int nc = *nchar;
1746     char buf[256];
1747     if(nc > 255) {
1748 	warning(_("warning message truncated to 255 chars"));
1749 	nc = 255;
1750     }
1751     strncpy(buf, msg, (size_t) nc);
1752     buf[nc] = '\0';
1753     mbcsTruncateToValid(buf);
1754     warning("%s", buf);
1755 }
1756 
F77_SYMBOL(rchkusr)1757 void F77_SYMBOL(rchkusr)(void)
1758 {
1759     R_CheckUserInterrupt();
1760 }
1761 
1762 /* Return a copy of a string using memory from R_alloc.
1763    NB: caller has to manage R_alloc stack.  Used in platform.c
1764 */
acopy_string(const char * in)1765 char *acopy_string(const char *in)
1766 {
1767     char *out;
1768     size_t len = strlen(in);
1769     if (len > 0) {
1770 	out = (char *) R_alloc(1 + len, sizeof(char));
1771 	strcpy(out, in);
1772     } else
1773 	out = "";
1774     return out;
1775 }
1776 
1777 
1778 
1779 
1780 /* Table from
1781 http://unicode.org/Public/MAPPINGS/VENDORS/ADOBE/symbol.txt
1782 */
1783 
1784 /* Conversion table that DOES use Private Usage Area
1785  * (should work better with specialised "symbol" fonts)
1786  */
1787 static int s2u[224] = {
1788     0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D,
1789     0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,
1790     0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
1791     0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
1792     0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,
1793     0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,
1794     0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,
1795     0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,
1796     0xF8E5, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,
1797     0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,
1798     0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,
1799     0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x0020,
1800     0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1801     0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1802     0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1803     0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1804     0x20AC, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663,
1805     0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
1806     0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x2022,
1807     0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0xF8E6, 0xF8E7, 0x21B5,
1808     0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
1809     0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
1810     0x2220, 0x2207, 0xF6DA, 0xF6D9, 0xF6DB, 0x220F, 0x221A, 0x22C5,
1811     0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,
1812     0x25CA, 0x2329, 0xF8E8, 0xF8E9, 0xF8EA, 0x2211, 0xF8EB, 0xF8EC,
1813     0xF8ED, 0xF8EE, 0xF8EF, 0xF8F0, 0xF8F1, 0xF8F2, 0xF8F3, 0xF8F4,
1814     0x0020, 0x232A, 0x222B, 0x2320, 0xF8F5, 0x2321, 0xF8F6, 0xF8F7,
1815     0xF8F8, 0xF8F9, 0xF8FA, 0xF8FB, 0xF8FC, 0xF8FD, 0xF8FE, 0x0020
1816 };
1817 
1818 /* Conversion table that does NOT use Private Usage Area (0xF8*)
1819  * (should work better with fonts that have good Unicode coverage)
1820  *
1821  * NOTE that ...
1822  *   23D0 VERTICAL LINE EXTENTION is used for VERTICAL ARROW EXTENDER
1823  *   23AF HORIZONTAL LINE EXTENSION is used for HORIZONTAL ARROW EXTENDER
1824  * ... neither of which may be very good AND ...
1825  *   23AF HORIZONTAL LINE EXTENSION is also used for RADICAL EXTENDER
1826  * ... and that is unlikely to be right for BOTH this use AND
1827  * HORIZONTAL ARROW EXTENDER (if either)
1828  */
1829 static int s2unicode[224] = {
1830     0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D,
1831     0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,
1832     0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
1833     0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
1834     0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,
1835     0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,
1836     0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,
1837     0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,
1838     0x23AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,
1839     0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,
1840     0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,
1841     0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x0020,
1842     0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1843     0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1844     0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1845     0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020,
1846     0x20AC, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663,
1847     0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
1848     0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x2022,
1849     0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x23D0, 0x23AF, 0x21B5,
1850     0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
1851     0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
1852     0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5,
1853     0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,
1854     0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x239B, 0x239C,
1855     0x239D, 0x23A1, 0x23A2, 0x23A3, 0x23A7, 0x23A8, 0x23A9, 0x23AA,
1856     0x0020, 0x232A, 0x222B, 0x2320, 0x23AE, 0x2321, 0x239E, 0x239F,
1857     0x23A0, 0x23A4, 0x23A5, 0x23A6, 0x23AB, 0x23AC, 0x23AD, 0x0020
1858 };
1859 
Rf_AdobeSymbol2utf8(char * work,const char * c0,size_t nwork,Rboolean usePUA)1860 void *Rf_AdobeSymbol2utf8(char *work, const char *c0, size_t nwork,
1861 			  Rboolean usePUA)
1862 {
1863     const unsigned char *c = (unsigned char *) c0;
1864     unsigned char *t = (unsigned char *) work;
1865     while (*c) {
1866 	if (*c < 32) *t++ = ' ';
1867 	else {
1868 	    unsigned int u;
1869 	    if (usePUA) {
1870 		u = (unsigned int) s2u[*c - 32];
1871 	    } else {
1872 		u = (unsigned int) s2unicode[*c - 32];
1873 	    }
1874 	    if (u < 128) *t++ = (unsigned char) u;
1875 	    else if (u < 0x800) {
1876 		*t++ = (unsigned char) (0xc0 | (u >> 6));
1877 		*t++ = (unsigned char) (0x80 | (u & 0x3f));
1878 	    } else {
1879 		*t++ = (unsigned char) (0xe0 | (u >> 12));
1880 		*t++ = (unsigned char) (0x80 | ((u >> 6) & 0x3f));
1881 		*t++ = (unsigned char) (0x80 | (u & 0x3f));
1882 	    }
1883 	}
1884 	if (t+6 > (unsigned char *)(work + nwork)) break;
1885 	c++;
1886     }
1887     *t = '\0';
1888     return (char*) work;
1889 }
1890 
1891 /* Convert UTF8 symbol back to single-byte symbol
1892  * ASSUME fontface == 5 and 'str' is UTF8, i.e., we are dealing with
1893  * a UTF8 string that has been through Rf_AdobeSymbol2utf8(usePUA=TRUE)
1894  * (or through Rf_AdobeSymbol2ucs2() then Rf_ucstoutf8())
1895  * i.e., we are dealing with CE_UTF8 string that has come from CE_SYMBOL string.
1896 */
Rf_utf8toAdobeSymbol(char * out,const char * in)1897 int Rf_utf8toAdobeSymbol(char *out, const char *in) {
1898     int i, j, k, used, tmp, nc = 0, found;
1899     int *symbolint;
1900     const char *s = in;
1901     const char *p = in;
1902     for ( ; *p; p += utf8clen(*p)) nc++;
1903     symbolint = (int *) R_alloc(nc, sizeof(int));
1904     for (i = 0, j = 0; i < nc; i++, j++) {
1905 	/* Convert UTF8 to int */
1906 	used = mbrtoint(&tmp, s);
1907 	if (used < 0)
1908 	    error(_("invalid UTF-8 string"));
1909 	symbolint[j] = tmp;
1910 	found = 0;
1911 	/* Convert int to CE_SYMBOL char */
1912 	for (k = 0; k < 224; k++) {
1913 	    if (symbolint[j] == s2u[k]) {
1914 		out[j] = (char)(k + 32);
1915 		found = 1;
1916 	    }
1917 	    if (found) break;
1918 	}
1919 	if (!found)
1920 	    error(_("Conversion failed"));
1921 	s += used;
1922     }
1923     out[nc] = '\0';
1924     return nc;
1925 }
1926 
Rf_utf8Toutf8NoPUA(const char * in)1927 const char* Rf_utf8Toutf8NoPUA(const char *in)
1928 {
1929     int i, j, used, tmp;
1930     /* At least enough because assumes each incoming char only one byte */
1931     int nChar = 3*(int)strlen(in) + 1;
1932     char *result = R_alloc(nChar, sizeof(char));
1933     const char *s = in;
1934     char *p = result;
1935     for (i = 0; i < nChar; i++) {
1936 	/* Convert UTF8 char to int */
1937 	used = mbrtoint(&tmp, s);
1938 	/* Only re-encode if necessary
1939 	 * This is more efficient AND protects against input that is
1940 	 * NOT from Rf_AdobeSymbol2utf8(), e.g., plotmath on Windows
1941 	 * (which is from reEnc(CE_LATIN1, CE_UTF8))
1942 	 */
1943 	if (tmp > 0xF600) {
1944 	    char inChar[4], symbolChar[2], utf8Char[4];
1945 	    char *q;
1946 	    for (j = 0; j < used; j++) {
1947 		inChar[j] = *s++;
1948 	    }
1949 	    inChar[used] = '\0';
1950 	    Rf_utf8toAdobeSymbol(symbolChar, inChar);
1951 	    Rf_AdobeSymbol2utf8(utf8Char, symbolChar, 4, FALSE);
1952 	    q = utf8Char;
1953 	    while (*q) {
1954 		*p++ = *q++;
1955 	    }
1956 	} else {
1957 	    for (j = 0; j < used; j++) {
1958 		*p++ = *s++;
1959 	    }
1960 	}
1961     }
1962     *p = '\0';
1963     return result;
1964 }
1965 
Rf_utf8ToLatin1AdobeSymbol2utf8(const char * in,Rboolean usePUA)1966 const char* Rf_utf8ToLatin1AdobeSymbol2utf8(const char *in, Rboolean usePUA)
1967 {
1968   const char *latinStr;
1969   char *utf8str;
1970   latinStr = reEnc(in, CE_UTF8, CE_LATIN1, 2);
1971   int nc = 3*(int)strlen(latinStr) + 1;
1972   utf8str = R_alloc(nc, sizeof(char));
1973   Rf_AdobeSymbol2utf8(utf8str, latinStr, nc, usePUA);
1974   return utf8str;
1975 }
1976 
Rf_AdobeSymbol2ucs2(int n)1977 int attribute_hidden Rf_AdobeSymbol2ucs2(int n)
1978 {
1979     if(n >= 32 && n < 256) return s2u[n-32];
1980     else return 0;
1981 }
1982 
R_strtod5(const char * str,char ** endptr,char dec,Rboolean NA,int exact)1983 double R_strtod5(const char *str, char **endptr, char dec,
1984 		 Rboolean NA, int exact)
1985 {
1986     LDOUBLE ans = 0.0;
1987     int sign = 1;
1988     const char *p = str;
1989 
1990     /* optional whitespace */
1991     while (isspace(*p)) p++;
1992 
1993     if (NA && strncmp(p, "NA", 2) == 0) {
1994 	ans = NA_REAL;
1995 	p += 2;
1996 	goto done;
1997     }
1998 
1999    /* optional sign */
2000     switch (*p) {
2001     case '-': sign = -1;
2002     case '+': p++;
2003     default: ;
2004     }
2005 
2006     if (strncasecmp(p, "NaN", 3) == 0) {
2007 	ans = R_NaN;
2008 	p += 3;
2009 	goto done;
2010     /* C99 specifies this: must come first to avoid 'inf' match */
2011     } else if (strncasecmp(p, "infinity", 8) == 0) {
2012 	ans = R_PosInf;
2013 	p += 8;
2014 	goto done;
2015     } else if (strncasecmp(p, "Inf", 3) == 0) {
2016 	ans = R_PosInf;
2017 	p += 3;
2018 	goto done;
2019     }
2020 
2021     int n, expn = 0;
2022     if(strlen(p) > 2 && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) { // Hexadecimal "0x....."
2023 	int exph = -1;
2024 
2025 	/* This will overflow to Inf if appropriate */
2026 	for(p += 2; p; p++) {
2027 	    if('0' <= *p && *p <= '9') ans = 16*ans + (*p -'0');
2028 	    else if('a' <= *p && *p <= 'f') ans = 16*ans + (*p -'a' + 10);
2029 	    else if('A' <= *p && *p <= 'F') ans = 16*ans + (*p -'A' + 10);
2030 	    else if(*p == dec) {exph = 0; continue;}
2031 	    else break;
2032 	    if (exph >= 0) exph += 4;
2033 	}
2034 #define strtod_EXACT_CLAUSE						\
2035 	if(exact && ans > 0x1.fffffffffffffp52) {			\
2036 	    if(exact == NA_LOGICAL)					\
2037 		warning(_(						\
2038 		"accuracy loss in conversion from \"%s\" to numeric"),	\
2039 			str);						\
2040 	    else {							\
2041 		ans = NA_REAL;						\
2042 		p = str; /* back out */					\
2043 		goto done;						\
2044 	    }								\
2045 	}
2046 	strtod_EXACT_CLAUSE;
2047 	if (*p == 'p' || *p == 'P') {
2048 	    int expsign = 1;
2049 	    double p2 = 2.0;
2050 	    switch(*++p) {
2051 	    case '-': expsign = -1;
2052 	    case '+': p++;
2053 	    default: ;
2054 	    }
2055 	    /* The test for n is in response to PR#16358; it's not right if the exponent is
2056 	       very large, but the overflow or underflow below will handle it. */
2057 #define MAX_EXPONENT_PREFIX 9999
2058 	    for (n = 0; *p >= '0' && *p <= '9'; p++) n = (n < MAX_EXPONENT_PREFIX) ? n * 10 + (*p - '0') : n;
2059 	    if (ans != 0.0) { /* PR#15976:  allow big exponents on 0 */
2060 		LDOUBLE fac = 1.0;
2061 		expn += expsign * n;
2062 		if(exph > 0) {
2063 		    if (expn - exph < -122) {	/* PR#17199:  fac may overflow below if expn - exph is too small.
2064 						   2^-122 is a bit bigger than 1E-37, so should be fine on all systems */
2065 			for (n = exph, fac = 1.0; n; n >>= 1, p2 *= p2)
2066 			    if (n & 1) fac *= p2;
2067 			ans /= fac;
2068 			p2 = 2.0;
2069 		    } else
2070 			expn -= exph;
2071 		}
2072 		if (expn < 0) {
2073 		    for (n = -expn, fac = 1.0; n; n >>= 1, p2 *= p2)
2074 			if (n & 1) fac *= p2;
2075 		    ans /= fac;
2076 		} else {
2077 		    for (n = expn, fac = 1.0; n; n >>= 1, p2 *= p2)
2078 			if (n & 1) fac *= p2;
2079 		    ans *= fac;
2080 		}
2081 	    }
2082 	}
2083 	goto done;
2084     } // end {hexadecimal case}
2085 
2086     int ndigits = 0;
2087     for ( ; *p >= '0' && *p <= '9'; p++, ndigits++) ans = 10*ans + (*p - '0');
2088     if (*p == dec)
2089 	for (p++; *p >= '0' && *p <= '9'; p++, ndigits++, expn--)
2090 	    ans = 10*ans + (*p - '0');
2091     if (ndigits == 0) {
2092 	ans = NA_REAL;
2093 	p = str; /* back out */
2094 	goto done;
2095     }
2096     strtod_EXACT_CLAUSE;
2097 
2098     if (*p == 'e' || *p == 'E') {
2099 	int expsign = 1;
2100 	switch(*++p) {
2101 	case '-': expsign = -1;
2102 	case '+': p++;
2103 	default: ;
2104 	}
2105 	for (n = 0; *p >= '0' && *p <= '9'; p++) n = (n < MAX_EXPONENT_PREFIX) ? n * 10 + (*p - '0') : n;
2106 	expn += expsign * n;
2107     }
2108 
2109     /* avoid unnecessary underflow for large negative exponents */
2110     if (expn + ndigits < -300) {
2111 	for (n = 0; n < ndigits; n++) ans /= 10.0;
2112 	expn += ndigits;
2113     }
2114     LDOUBLE p10 = 10., fac = 1.0;
2115     if (expn < -307) { /* use underflow, not overflow */
2116 	for (n = -expn, fac = 1.0; n; n >>= 1, p10 *= p10)
2117 	    if (n & 1) fac /= p10;
2118 	ans *= fac;
2119     } else if (expn < 0) { /* positive powers are exact */
2120 	for (n = -expn, fac = 1.0; n; n >>= 1, p10 *= p10)
2121 	    if (n & 1) fac *= p10;
2122 	ans /= fac;
2123     } else if (ans != 0.0) { /* PR#15976:  allow big exponents on 0, e.g. 0E4933 */
2124 	for (n = expn, fac = 1.0; n; n >>= 1, p10 *= p10)
2125 	    if (n & 1) fac *= p10;
2126 	ans *= fac;
2127     }
2128 
2129     /* explicit overflow to infinity */
2130     if (ans > DBL_MAX) {
2131 	if (endptr) *endptr = (char *) p;
2132 	return (sign > 0) ? R_PosInf : R_NegInf;
2133     }
2134 
2135 done:
2136     if (endptr) *endptr = (char *) p;
2137     return sign * (double) ans;
2138 }
2139 
2140 
R_strtod4(const char * str,char ** endptr,char dec,Rboolean NA)2141 double R_strtod4(const char *str, char **endptr, char dec, Rboolean NA)
2142 {
2143     return R_strtod5(str, endptr, dec, NA, FALSE);
2144 }
2145 
R_strtod(const char * str,char ** endptr)2146 double R_strtod(const char *str, char **endptr)
2147 {
2148     return R_strtod5(str, endptr, '.', FALSE, FALSE);
2149 }
2150 
R_atof(const char * str)2151 double R_atof(const char *str)
2152 {
2153     return R_strtod5(str, NULL, '.', FALSE, FALSE);
2154 }
2155 
2156 /* enc2native and enc2utf8, but they are the same in a UTF-8 locale */
2157 /* primitive */
do_enc2(SEXP call,SEXP op,SEXP args,SEXP env)2158 SEXP attribute_hidden do_enc2(SEXP call, SEXP op, SEXP args, SEXP env)
2159 {
2160     SEXP ans, el;
2161     R_xlen_t i;
2162     Rboolean duped = FALSE;
2163 
2164     checkArity(op, args);
2165     check1arg(args, call, "x");
2166 
2167     if (!isString(CAR(args)))
2168 	errorcall(call, "argument is not a character vector");
2169     ans = CAR(args);
2170     for (i = 0; i < XLENGTH(ans); i++) {
2171 	el = STRING_ELT(ans, i);
2172 	if (el == NA_STRING) continue;
2173 	if (PRIMVAL(op) || known_to_be_utf8) { /* enc2utf8 */
2174 	    if (IS_UTF8(el) || IS_ASCII(el) || IS_BYTES(el)) continue;
2175 	    if (!duped) { ans = PROTECT(duplicate(ans)); duped = TRUE; }
2176 	    SET_STRING_ELT(ans, i,
2177 			   mkCharCE(translateCharUTF8(el), CE_UTF8));
2178 	} else if (ENC_KNOWN(el)) { /* enc2native */
2179 	    if (IS_ASCII(el) || IS_BYTES(el)) continue;
2180 	    if (known_to_be_latin1 && IS_LATIN1(el)) continue;
2181 	    if (!duped) { PROTECT(ans = duplicate(ans)); duped = TRUE; }
2182 	    if (known_to_be_latin1)
2183 		SET_STRING_ELT(ans, i, mkCharCE(translateChar(el), CE_LATIN1));
2184 	    else
2185 		SET_STRING_ELT(ans, i, mkChar(translateChar(el)));
2186 	}
2187     }
2188     if(duped) UNPROTECT(1);
2189     return ans;
2190 }
2191 
2192 #ifdef USE_ICU
2193 # include <locale.h>
2194 #ifdef USE_ICU_APPLE
2195 /* macOS is missing the headers */
2196 typedef int UErrorCode; /* really an enum these days */
2197 struct UCollator;
2198 typedef struct UCollator UCollator;
2199 
2200 typedef enum {
2201   UCOL_EQUAL    = 0,
2202   UCOL_GREATER    = 1,
2203   UCOL_LESS    = -1
2204 } UCollationResult ;
2205 
2206 typedef enum {
2207   UCOL_DEFAULT = -1,
2208   UCOL_PRIMARY = 0,
2209   UCOL_SECONDARY = 1,
2210   UCOL_TERTIARY = 2,
2211   UCOL_DEFAULT_STRENGTH = UCOL_TERTIARY,
2212   UCOL_CE_STRENGTH_LIMIT,
2213   UCOL_QUATERNARY=3,
2214   UCOL_IDENTICAL=15,
2215   UCOL_STRENGTH_LIMIT,
2216   UCOL_OFF = 16,
2217   UCOL_ON = 17,
2218   UCOL_SHIFTED = 20,
2219   UCOL_NON_IGNORABLE = 21,
2220   UCOL_LOWER_FIRST = 24,
2221   UCOL_UPPER_FIRST = 25,
2222   UCOL_ATTRIBUTE_VALUE_COUNT
2223 } UColAttributeValue;
2224 
2225 typedef UColAttributeValue UCollationStrength;
2226 
2227 typedef enum {
2228       UCOL_FRENCH_COLLATION,
2229       UCOL_ALTERNATE_HANDLING,
2230       UCOL_CASE_FIRST,
2231       UCOL_CASE_LEVEL,
2232       UCOL_NORMALIZATION_MODE,
2233       UCOL_DECOMPOSITION_MODE = UCOL_NORMALIZATION_MODE,
2234       UCOL_STRENGTH,
2235       UCOL_HIRAGANA_QUATERNARY_MODE,
2236       UCOL_NUMERIC_COLLATION,
2237       UCOL_ATTRIBUTE_COUNT
2238 } UColAttribute;
2239 
2240 /* UCharIterator struct has to be defined since we use its instances as
2241    local variables, but we don't actually use any of its members. */
2242 typedef struct UCharIterator {
2243   const void *context;
2244   int32_t length, start, index, limit, reservedField;
2245   void *fns[16]; /* we overshoot here (there is just 10 fns in ICU 3.6),
2246 		    but we have to make sure that enough stack space
2247 		    is allocated when used as a local var in future
2248 		    versions */
2249 } UCharIterator;
2250 
2251 UCollator* ucol_open(const char *loc, UErrorCode *status);
2252 void ucol_close(UCollator *coll);
2253 void ucol_setAttribute(UCollator *coll, UColAttribute attr,
2254 		       UColAttributeValue value, UErrorCode *status);
2255 void ucol_setStrength(UCollator *coll, UCollationStrength strength);
2256 UCollationResult ucol_strcollIter(const UCollator *coll,
2257 				  UCharIterator *sIter,
2258 				  UCharIterator *tIter,
2259 				  UErrorCode *status);
2260 void uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length);
2261 
2262 void uloc_setDefault(const char* localeID, UErrorCode* status);
2263 
2264 typedef enum {
2265     ULOC_ACTUAL_LOCALE = 0,
2266     ULOC_VALID_LOCALE = 1,
2267     ULOC_DATA_LOCALE_TYPE_LIMIT = 3
2268 } ULocDataLocaleType ;
2269 
2270 
2271 const char* ucol_getLocaleByType(const UCollator *coll,
2272 				 ULocDataLocaleType type,
2273 				 UErrorCode *status);
2274 
2275 #define U_ZERO_ERROR 0
2276 #define U_FAILURE(x) ((x)>U_ZERO_ERROR)
2277 #define ULOC_ACTUAL_LOCALE 0
2278 
2279 #else
2280 #include <unicode/utypes.h>
2281 #include <unicode/ucol.h>
2282 #include <unicode/uloc.h>
2283 #include <unicode/uiter.h>
2284 #endif
2285 
2286 static UCollator *collator = NULL;
2287 static int collationLocaleSet = 0;
2288 
2289 /* called from platform.c */
resetICUcollator(Rboolean disable)2290 void attribute_hidden resetICUcollator(Rboolean disable)
2291 {
2292     if (collator) ucol_close(collator);
2293     collator = NULL;
2294     collationLocaleSet = disable ? 1 : 0;
2295 }
2296 
2297 static const struct {
2298     const char * const str;
2299     int val;
2300 } ATtable[] = {
2301     { "case_first", UCOL_CASE_FIRST },
2302     { "upper", UCOL_UPPER_FIRST },
2303     { "lower", UCOL_LOWER_FIRST },
2304     { "default ", UCOL_DEFAULT },
2305     { "strength", 999 },
2306     { "primary ", UCOL_PRIMARY },
2307     { "secondary ", UCOL_SECONDARY },
2308     { "teritary ", UCOL_TERTIARY },
2309     { "guaternary ", UCOL_QUATERNARY },
2310     { "identical ", UCOL_IDENTICAL },
2311     { "french_collation", UCOL_FRENCH_COLLATION },
2312     { "on", UCOL_ON },
2313     { "off", UCOL_OFF },
2314     { "normalization", UCOL_NORMALIZATION_MODE },
2315     { "alternate_handling", UCOL_ALTERNATE_HANDLING },
2316     { "non_ignorable", UCOL_NON_IGNORABLE },
2317     { "shifted", UCOL_SHIFTED },
2318     { "case_level", UCOL_CASE_LEVEL },
2319     { "hiragana_quaternary", UCOL_HIRAGANA_QUATERNARY_MODE },
2320     { NULL,  0 }
2321 };
2322 
2323 #ifdef Win32
2324 #define BUFFER_SIZE 512
2325 typedef int (WINAPI *PGSDLN)(LPWSTR, int);
2326 
getLocale(void)2327 static const char *getLocale(void)
2328 {
2329     const char *p = getenv("R_ICU_LOCALE");
2330     if (p && p[0]) return p;
2331 
2332     // This call is >= Vista/Server 2008
2333     // ICU should accept almost all of these, e.g. en-US and uz-Latn-UZ
2334     PGSDLN pGSDLN = (PGSDLN)
2335 	GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
2336 		       "GetSystemDefaultLocaleName");
2337     if(pGSDLN) {
2338 	WCHAR wcBuffer[BUFFER_SIZE];
2339 	pGSDLN(wcBuffer, BUFFER_SIZE);
2340 	static char locale[BUFFER_SIZE];
2341 	WideCharToMultiByte(CP_ACP, 0, wcBuffer, -1,
2342 			    locale, BUFFER_SIZE, NULL, NULL);
2343 	return locale;
2344     } else return "root";
2345 }
2346 #else
getLocale(void)2347 static const char *getLocale(void)
2348 {
2349     const char *p = getenv("R_ICU_LOCALE");
2350     return (p && p[0]) ? p : setlocale(LC_COLLATE, NULL);
2351 }
2352 #endif
2353 
do_ICUset(SEXP call,SEXP op,SEXP args,SEXP rho)2354 SEXP attribute_hidden do_ICUset(SEXP call, SEXP op, SEXP args, SEXP rho)
2355 {
2356     SEXP x;
2357     UErrorCode  status = U_ZERO_ERROR;
2358 
2359     for (; args != R_NilValue; args = CDR(args)) {
2360 	if (isNull(TAG(args))) error(_("all arguments must be named"));
2361 	const char *this = CHAR(PRINTNAME(TAG(args)));
2362 	const char *s;
2363 
2364 	x = CAR(args);
2365 	if (!isString(x) || LENGTH(x) != 1)
2366 	    error(_("invalid '%s' argument"), this);
2367 	s = CHAR(STRING_ELT(x, 0));
2368 	if (streql(this, "locale")) {
2369 	    if (collator) {
2370 		ucol_close(collator);
2371 		collator = NULL;
2372 	    }
2373 	    if(streql(s, "ASCII")) {
2374 		collationLocaleSet = 2;
2375 	    } else {
2376 		if(strcmp(s, "none")) {
2377 		    if(streql(s, "default"))
2378 			uloc_setDefault(getLocale(), &status);
2379 		    else uloc_setDefault(s, &status);
2380 		    if(U_FAILURE(status))
2381 			error("failed to set ICU locale %s (%d)", s, status);
2382 		    collator = ucol_open(NULL, &status);
2383 		    if (U_FAILURE(status)) {
2384 			collator = NULL;
2385 			error("failed to open ICU collator (%d)", status);
2386 		    }
2387 		}
2388 		collationLocaleSet = 1;
2389 	    }
2390 	} else {
2391 	    int i, at = -1, val = -1;
2392 	    for (i = 0; ATtable[i].str; i++)
2393 		if (streql(this, ATtable[i].str)) {
2394 		    at = ATtable[i].val;
2395 		    break;
2396 		}
2397 	    for (i = 0; ATtable[i].str; i++)
2398 		if (streql(s, ATtable[i].str)) {
2399 		    val = ATtable[i].val;
2400 		    break;
2401 		}
2402 	    if (collator && at == 999 && val >= 0) {
2403 		ucol_setStrength(collator, val);
2404 	    } else if (collator && at >= 0 && val >= 0) {
2405 		ucol_setAttribute(collator, at, val, &status);
2406 		if (U_FAILURE(status))
2407 		    error("failed to set ICU collator attribute");
2408 	    }
2409 	}
2410     }
2411 
2412     return R_NilValue;
2413 }
2414 
do_ICUget(SEXP call,SEXP op,SEXP args,SEXP rho)2415 SEXP attribute_hidden do_ICUget(SEXP call, SEXP op, SEXP args, SEXP rho)
2416 {
2417     const char *ans = "unknown", *res;
2418     checkArity(op, args);
2419 
2420     if (collationLocaleSet == 2) {
2421 	ans = "ASCII";
2422     } else if(collator) {
2423 	UErrorCode  status = U_ZERO_ERROR;
2424 	int type = asInteger(CAR(args));
2425 	if (type < 1 || type > 2)
2426 	    error(_("invalid '%s' value"), "type");
2427 
2428 	res = ucol_getLocaleByType(collator,
2429 				   type == 1 ? ULOC_ACTUAL_LOCALE : ULOC_VALID_LOCALE,
2430 				   &status);
2431 	if(!U_FAILURE(status) && res) ans = res;
2432     } else ans = "ICU not in use";
2433     return mkString(ans);
2434 }
2435 
2436 /* Caller has to manage the R_alloc stack */
2437 /* NB: strings can have equal collation weight without being identical */
2438 attribute_hidden
Scollate(SEXP a,SEXP b)2439 int Scollate(SEXP a, SEXP b)
2440 {
2441     if (!collationLocaleSet) {
2442 	int errsv = errno;      /* OSX may set errno in the operations below. */
2443 	collationLocaleSet = 1;
2444 
2445 	/* A lot of code depends on that setting LC_ALL or LC_COLLATE to "C"
2446 	   via environment variables or Sys.setlocale ensures the "C" collation
2447 	   order. Originally, R_ICU_LOCALE always took precedence over LC_ALL
2448 	   and LC_COLLATE variables and over Sys.setlocale (except on Unix when
2449 	   R_ICU_LOCALE=C). This now adds an exception: when LC_ALL is set to "C"
2450 	   (or unset and LC_COLLATE is set to "C"), the "C" collation order will
2451 	   be used. */
2452 	const char *envl = getenv("LC_ALL");
2453 	if (!envl || !envl[0])
2454 	    envl = getenv("LC_COLLATE");
2455 	int useC = envl && !strcmp(envl, "C");
2456 
2457 #ifndef Win32
2458 	if (!useC && strcmp("C", getLocale()) ) {
2459 #else
2460 	/* On Windows, ICU is used for R_ICU_LOCALE=C, on Unix, it is not. */
2461 	/* FIXME: as ICU does not support C as locale, could we use the Unix
2462 	   behavior on all systems? */
2463 	const char *p = getenv("R_ICU_LOCALE");
2464 	if(p && p[0] && (!useC || !strcmp(p, "C"))) {
2465 #endif
2466 	    UErrorCode status = U_ZERO_ERROR;
2467 	    uloc_setDefault(getLocale(), &status);
2468 	    if(U_FAILURE(status))
2469 		error("failed to set ICU locale (%d)", status);
2470 	    collator = ucol_open(NULL, &status);
2471 	    if (U_FAILURE(status)) {
2472 		collator = NULL;
2473 		error("failed to open ICU collator (%d)", status);
2474 	    }
2475 	}
2476 	errno = errsv;
2477     }
2478     // translation may use escapes, but that is OK here
2479     if (collator == NULL)
2480 	return collationLocaleSet == 2 ?
2481 	    strcmp(translateChar(a), translateChar(b)) :
2482 	    strcoll(translateChar(a), translateChar(b));
2483 
2484     UCharIterator aIter, bIter;
2485     const char *as = translateCharUTF8(a), *bs = translateCharUTF8(b);
2486     int len1 = (int) strlen(as), len2 = (int) strlen(bs);
2487     uiter_setUTF8(&aIter, as, len1);
2488     uiter_setUTF8(&bIter, bs, len2);
2489     UErrorCode status = U_ZERO_ERROR;
2490     int result = ucol_strcollIter(collator, &aIter, &bIter, &status);
2491     if (U_FAILURE(status)) error("could not collate using ICU");
2492     return result;
2493 }
2494 
2495 #else /* not USE_ICU */
2496 
2497 SEXP attribute_hidden do_ICUset(SEXP call, SEXP op, SEXP args, SEXP rho)
2498 {
2499     warning(_("ICU is not supported on this build"));
2500     return R_NilValue;
2501 }
2502 
2503 SEXP attribute_hidden do_ICUget(SEXP call, SEXP op, SEXP args, SEXP rho)
2504 {
2505     checkArity(op, args);
2506     return mkString("ICU not in use");
2507 }
2508 
2509 void attribute_hidden resetICUcollator(Rboolean disable) {}
2510 
2511 # ifdef Win32
2512 
2513 static int Rstrcoll(const char *s1, const char *s2)
2514 {
2515     R_CheckStack2(sizeof(wchar_t) * (2 + strlen(s1) + strlen(s2)));
2516     wchar_t w1[strlen(s1)+1], w2[strlen(s2)+1];
2517     utf8towcs(w1, s1, strlen(s1));
2518     utf8towcs(w2, s2, strlen(s2));
2519     return wcscoll(w1, w2);
2520 }
2521 
2522 int Scollate(SEXP a, SEXP b)
2523 {
2524     if(getCharCE(a) == CE_UTF8 || getCharCE(b) == CE_UTF8)
2525 	return Rstrcoll(translateCharUTF8(a), translateCharUTF8(b));
2526     else
2527 	return strcoll(translateChar(a), translateChar(b));
2528 }
2529 
2530 # else
2531 attribute_hidden
2532 int Scollate(SEXP a, SEXP b)
2533 {
2534     return strcoll(translateChar(a), translateChar(b));
2535 }
2536 
2537 # endif
2538 #endif
2539 
2540 #include <lzma.h>
2541 
2542 SEXP attribute_hidden do_crc64(SEXP call, SEXP op, SEXP args, SEXP rho)
2543 {
2544     checkArity(op, args);
2545     SEXP in = CAR(args);
2546     uint64_t crc = 0;
2547     char ans[17];
2548     if (!isString(in)) error("input must be a character string");
2549     const char *str = CHAR(STRING_ELT(in, 0));
2550 
2551     /* Seems this is really 64-bit only on 64-bit platforms */
2552     crc = lzma_crc64((uint8_t *)str, strlen(str), crc);
2553     snprintf(ans, 17, "%lx", (long unsigned int) crc);
2554     return mkString(ans);
2555 }
2556 
2557 static void
2558 bincode(double *x, R_xlen_t n, double *breaks, int nb,
2559 	int *code, int right, int include_border)
2560 {
2561     int lo, hi, nb1 = nb - 1, new;
2562     int lft = !right;
2563 
2564     /* This relies on breaks being sorted, so wise to check that */
2565     for(int i = 1; i < nb; i++)
2566 	if(breaks[i-1] > breaks[i]) error(_("'breaks' is not sorted"));
2567 
2568     for(R_xlen_t i = 0; i < n; i++) {
2569 	code[i] = NA_INTEGER;
2570 	if(!ISNAN(x[i])) {
2571 	    lo = 0;
2572 	    hi = nb1;
2573 	    if(x[i] <  breaks[lo] || breaks[hi] < x[i] ||
2574 	       (x[i] == breaks[lft ? hi : lo] && ! include_border)) ;
2575 	    else {
2576 		while(hi - lo >= 2) {
2577 		    new = (hi + lo)/2;
2578 		    if(x[i] > breaks[new] || (lft && x[i] == breaks[new]))
2579 			lo = new;
2580 		    else
2581 			hi = new;
2582 		}
2583 		code[i] = lo + 1;
2584 	    }
2585 	}
2586     }
2587 }
2588 
2589 /* 'breaks' cannot be a long vector as the return codes are integer. */
2590 SEXP attribute_hidden do_bincode(SEXP call, SEXP op, SEXP args, SEXP rho)
2591 {
2592     checkArity(op, args);
2593     SEXP x, breaks, right, lowest;
2594     x = CAR(args); args = CDR(args);
2595     breaks = CAR(args); args = CDR(args);
2596     right = CAR(args); args = CDR(args);
2597     lowest = CAR(args);
2598 #ifdef LONG_VECTOR_SUPPORT
2599     if (IS_LONG_VEC(breaks))
2600 	error(_("long vector '%s' is not supported"), "breaks");
2601 #endif
2602     PROTECT(x = coerceVector(x, REALSXP));
2603     PROTECT(breaks = coerceVector(breaks, REALSXP));
2604     R_xlen_t n = XLENGTH(x);
2605     int nB = LENGTH(breaks), sr = asLogical(right), sl = asLogical(lowest);
2606     if (nB == NA_INTEGER) error(_("invalid '%s' argument"), "breaks");
2607     if (sr == NA_INTEGER) error(_("invalid '%s' argument"), "right");
2608     if (sl == NA_INTEGER) error(_("invalid '%s' argument"), "include.lowest");
2609     SEXP codes;
2610     PROTECT(codes = allocVector(INTSXP, n));
2611     bincode(REAL(x), n, REAL(breaks), nB, INTEGER(codes), sr, sl);
2612     UNPROTECT(3);
2613     return codes;
2614 }
2615 
2616 SEXP attribute_hidden do_tabulate(SEXP call, SEXP op, SEXP args, SEXP rho)
2617 {
2618     checkArity(op, args);
2619     SEXP in = CAR(args), nbin = CADR(args);
2620     if (TYPEOF(in) != INTSXP)  error("invalid input");
2621     R_xlen_t n = XLENGTH(in);
2622     int nb = asInteger(nbin);
2623     if (nb == NA_INTEGER || nb < 0)
2624 	error(_("invalid '%s' argument"), "nbin");
2625     int *x = INTEGER(in);
2626     SEXP ans;
2627 #ifdef LONG_VECTOR_SUPPORT
2628     if (n > INT_MAX) {
2629 	ans = allocVector(REALSXP, nb);
2630 	double *y = REAL(ans);
2631 	if (nb) memset(y, 0, nb * sizeof(double));
2632 	for(R_xlen_t i = 0 ; i < n ; i++)
2633 	    if (x[i] != NA_INTEGER && x[i] > 0 && x[i] <= nb) y[x[i] - 1]++;
2634     } else
2635 #endif
2636     {
2637 	ans = allocVector(INTSXP, nb);
2638 	int *y = INTEGER(ans);
2639 	if (nb) memset(y, 0, nb * sizeof(int));
2640 	for(R_xlen_t i = 0 ; i < n ; i++)
2641 	    if (x[i] != NA_INTEGER && x[i] > 0 && x[i] <= nb) y[x[i] - 1]++;
2642     }
2643     return ans;
2644 }
2645 
2646 /* Note: R's findInterval( x , vec, ...)  has first two arguments swapped !
2647  * .Internal(findInterval(vec, x, rightmost.closed, all.inside,  left.open))
2648  *                         xt  x    right             inside       leftOp
2649  * x can be a long vector but xt cannot since the result is integer
2650 */
2651 SEXP attribute_hidden do_findinterval(SEXP call, SEXP op, SEXP args, SEXP rho)
2652 {
2653     checkArity(op, args);
2654     SEXP xt, x, right, inside, leftOp;
2655     xt = CAR(args); args = CDR(args);
2656     x = CAR(args); args = CDR(args);
2657     right = CAR(args); args = CDR(args);
2658     inside = CAR(args);args = CDR(args);
2659     leftOp = CAR(args);
2660     if(TYPEOF(xt) != REALSXP || TYPEOF(x) != REALSXP) error("invalid input");
2661 #ifdef LONG_VECTOR_SUPPORT
2662     if (IS_LONG_VEC(xt))
2663 	error(_("long vector '%s' is not supported"), "vec");
2664 #endif
2665     int n = LENGTH(xt);
2666     if (n == NA_INTEGER) error(_("invalid '%s' argument"), "vec");
2667     R_xlen_t nx = XLENGTH(x);
2668     int sr = asLogical(right), si = asLogical(inside), lO = asLogical(leftOp);
2669     if (sr == NA_INTEGER)
2670 	error(_("invalid '%s' argument"), "rightmost.closed");
2671     if (si == NA_INTEGER)
2672 	error(_("invalid '%s' argument"), "all.inside");
2673     SEXP ans = allocVector(INTSXP, nx);
2674     double *rxt = REAL(xt), *rx = REAL(x);
2675     int ii = 1;
2676     for(int i = 0; i < nx; i++) {
2677 	if (ISNAN(rx[i]))
2678 	    ii = NA_INTEGER;
2679 	else {
2680 	    int mfl;
2681 	    ii = findInterval2(rxt, n, rx[i], sr, si, lO, ii, &mfl); // -> ../appl/interv.c
2682 	}
2683 	INTEGER(ans)[i] = ii;
2684     }
2685     return ans;
2686 }
2687 
2688 #ifdef Win32
2689 // this includes RS.h
2690 # undef ERROR
2691 #endif
2692 #include <R_ext/Applic.h>
2693 SEXP attribute_hidden do_pretty(SEXP call, SEXP op, SEXP args, SEXP rho)
2694 {
2695     checkArity(op, args);
2696     SEXP ans, nm, hi;
2697     double l = asReal(CAR(args)); args = CDR(args);
2698     if (!R_FINITE(l)) error(_("invalid '%s' argument"), "l");
2699     double u = asReal(CAR(args)); args = CDR(args);
2700     if (!R_FINITE(u)) error(_("invalid '%s' argument"), "u");
2701     int n = asInteger(CAR(args)); args = CDR(args);
2702     if (n == NA_INTEGER || n < 0) error(_("invalid '%s' argument"), "n");
2703     int min_n = asInteger(CAR(args)); args = CDR(args);
2704     if (min_n == NA_INTEGER || min_n < 0 || min_n > n)
2705 	error(_("invalid '%s' argument"), "min.n");
2706     double shrink = asReal(CAR(args)); args = CDR(args);
2707     if (!R_FINITE(shrink) || shrink <= 0.)
2708 	error(_("invalid '%s' argument"), "shrink.sml");
2709     PROTECT(hi = coerceVector(CAR(args), REALSXP)); args = CDR(args);
2710     double z;
2711     if (!R_FINITE(z = REAL(hi)[0]) || z < 0.)
2712 	error(_("invalid '%s' argument"), "high.u.bias");
2713     if (!R_FINITE(z = REAL(hi)[1]) || z < 0.)
2714 	error(_("invalid '%s' argument"), "u5.bias");
2715     int eps = asInteger(CAR(args)); /* eps.correct */
2716     if (eps == NA_INTEGER || eps < 0 || eps > 2)
2717 	error(_("'eps.correct' must be 0, 1, or 2"));
2718     R_pretty(&l, &u, &n, min_n, shrink, REAL(hi), eps, 1);
2719     //------ (returns 'unit' which we do not need)
2720     PROTECT(ans = allocVector(VECSXP, 3));
2721     SET_VECTOR_ELT(ans, 0, ScalarReal(l));
2722     SET_VECTOR_ELT(ans, 1, ScalarReal(u));
2723     SET_VECTOR_ELT(ans, 2, ScalarInteger(n));
2724     nm = allocVector(STRSXP, 3);
2725     setAttrib(ans, R_NamesSymbol, nm);
2726     SET_STRING_ELT(nm, 0, mkChar("l"));
2727     SET_STRING_ELT(nm, 1, mkChar("u"));
2728     SET_STRING_ELT(nm, 2, mkChar("n"));
2729     UNPROTECT(2);
2730     return ans;
2731 }
2732 
2733 /*
2734     r <- .Internal(formatC(x, as.character(mode), width, digits,
2735 		   as.character(format), as.character(flag), i.strlen))
2736 */
2737 
2738 static void
2739 str_signif_sexp(SEXP x, const char *type, int width, int digits,
2740 	   const char *format, const char *flag, char **result);
2741 
2742 SEXP attribute_hidden do_formatC(SEXP call, SEXP op, SEXP args, SEXP rho)
2743 {
2744     checkArity(op, args);
2745     SEXP x = CAR(args); args = CDR(args);
2746     if (!isVector(x)) error(_("'x' must be a vector"));
2747     R_xlen_t n = XLENGTH(x);
2748     const char *type = CHAR(STRING_ELT(CAR(args), 0)); args = CDR(args);
2749     int width = asInteger(CAR(args)); args = CDR(args);
2750     int digits = asInteger(CAR(args)); args = CDR(args);
2751     const char *fmt = CHAR(STRING_ELT(CAR(args), 0)); args = CDR(args);
2752     const char *flag = CHAR(STRING_ELT(CAR(args), 0)); args = CDR(args);
2753     SEXP i_strlen = PROTECT(coerceVector(CAR(args), INTSXP));
2754     char **cptr = (char **) R_alloc(n, sizeof(char*));
2755     for (R_xlen_t i = 0; i < n; i++) {
2756 	int ix = INTEGER(i_strlen)[i] + 2;
2757 	cptr[i] = (char *) R_alloc(ix + 1, sizeof(char));
2758 	memset(cptr[i], ' ', ix);
2759 	cptr[i][ix] = 0;
2760     }
2761     str_signif_sexp(x, type, width, digits, fmt, flag, cptr);
2762     SEXP ans = PROTECT(allocVector(STRSXP, n));
2763     for (R_xlen_t i = 0; i < n; i++) SET_STRING_ELT(ans, i, mkChar(cptr[i]));
2764     UNPROTECT(2);
2765     return ans;
2766 }
2767 
2768 /* Former src/appl/strsignif.c
2769  *
2770  *  Copyright (C) Martin Maechler, 1994, 1998
2771  *  Copyright (C) 2001-2013 the R Core Team
2772  *
2773  *  I want you to preserve the copyright of the original author(s),
2774  *  and encourage you to send me any improvements by e-mail. (MM).
2775  *
2776  *  Originally from Bill Dunlap
2777  *  bill@stat.washington.edu
2778  *  Wed Feb 21, 1990
2779  *
2780  *  Much improved by Martin Maechler, including the "fg" format.
2781  *
2782  *  Patched by Friedrich.Leisch@ci.tuwien.ac.at
2783  *  Fri Nov 22, 1996
2784  *
2785  *  Some fixes by Ross Ihaka
2786  *  ihaka@stat.auckland.ac.nz
2787  *  Sat Dec 21, 1996
2788  *  Integer arguments changed from "long" to "int"
2789  *  Bus error due to non-writable strings fixed
2790  *
2791  *  BDR 2001-10-30 use R_alloc not Calloc as memory was not
2792  *  reclaimed on error (and there are many error exits).
2793  *
2794  *	type	"double" or "integer" (R - numeric 'mode').
2795  *
2796  *	width	The total field width; width < 0 means to left justify
2797  *		the number in this field (equivalent to flag = "-").
2798  *		It is possible that the result will be longer than this,
2799  *		but that should only happen in reasonable cases.
2800  *
2801  *	digits	The desired number of digits after the decimal point.
2802  *		digits < 0 uses the default for C, namely 6 digits.
2803  *
2804  *	format	"d" (for integers) or "f", "e","E", "g", "G" (for 'real')
2805  *		"f" gives numbers in the usual "xxx.xxx" format;
2806  *		"e" and "E" give n.ddde<nn> or n.dddE<nn> (scientific format);
2807  *		"g" and "G" puts them into scientific format if it saves
2808  *		space to do so.
2809  *	    NEW: "fg" gives numbers in "xxx.xxx" format as "f",
2810  *		  ~~  however, digits are *significant* digits and,
2811  *		      if digits > 0, no trailing zeros are produced, as in "g".
2812  *
2813  *	flag	Format modifier as in K&R "C", 2nd ed., p.243;
2814  *		e.g., "0" pads leading zeros; "-" does left adjustment
2815  *		the other possible flags are  "+", " ", and "#".
2816  *	  New (Feb.98): if flag has more than one character, all are passed..
2817  *
2818  *  Gabe Becker (2019-05-21): Added str_signif_sexp which wraps
2819  *  original DATAPTR based str_signif to support ALTREPs.
2820  *
2821  *     Any future calls to str_signif on SEXP data should be via
2822  *     str_signif_sexp to ensure ALTREP support.
2823  *
2824  */
2825 
2826 /* <UTF8> char here is either ASCII or handled as a whole */
2827 
2828 #ifdef Win32
2829 /* avoid latest MinGW's redefinition in stdio.h */
2830 #include <trioremap.h>
2831 #endif
2832 #include <Rmath.h>		/* fround */
2833 
2834 static
2835 void str_signif(void *x, R_xlen_t n, const char *type, int width, int digits,
2836 		const char *format, const char *flag, char **result)
2837 {
2838     int dig = abs(digits);
2839     Rboolean rm_trailing_0 = digits >= 0;
2840     Rboolean do_fg = !strcmp("fg", format); /* TRUE  iff  format == "fg" */
2841     double xx;
2842     int iex;
2843     size_t j, len_flag = strlen(flag);
2844     const void *vmax = vmaxget();
2845 
2846     char *f0  =	 R_alloc((size_t) do_fg ? 1+1+len_flag+3 : 1, sizeof(char));
2847     char *form = R_alloc((size_t) 1+1+len_flag+3 + strlen(format),
2848 			 sizeof(char));
2849 
2850     if (width == 0)
2851 	error("width cannot be zero");
2852 
2853     if (strcmp("d", format) == 0) {
2854 	if (len_flag == 0)
2855 	    strcpy(form, "%*d");
2856 	else {
2857 	    strcpy(form, "%");
2858 	    strcat(form, flag);
2859 	    strcat(form, "*d");
2860 	}
2861 	if (strcmp("integer", type) == 0)
2862 	    for (R_xlen_t i = 0; i < n; i++)
2863 		snprintf(result[i], strlen(result[i]) + 1,
2864 			 form, width, ((int *)x)[i]);
2865 	else
2866 	    error("'type' must be \"integer\" for  \"d\"-format");
2867     }
2868     else { /* --- floating point --- */
2869 	if (len_flag == 0)
2870 	    strcpy(form, "%*.*");
2871 	else {
2872 	    strcpy(form, "%");
2873 	    strcat(form, flag);
2874 	    strcat(form, "*.*");
2875 	}
2876 
2877 	if(do_fg) {
2878 	    strcpy(f0, "%");
2879 	    strcat(f0, flag);
2880 	    strcat(f0, ".*f");
2881 	    strcat(form, "g");
2882 	}
2883 	else
2884 	    strcat(form, format);
2885 #ifdef DEBUG
2886 	fprintf(stderr, "strsignif.c: form='%s', width=%d, dig=%d\n",
2887 		form, width, dig);
2888 	if(do_fg) fprintf(stderr, "\t\"fg\": f0='%s'.", f0);
2889 #endif
2890 	if (strcmp("double", type) == 0) {
2891 	    if(do_fg) /* do smart "f" : */
2892 		for (R_xlen_t i = 0; i < n; i++) {
2893 		    xx = ((double *)x)[i];
2894 		    if(xx == 0.)
2895 			strcpy(result[i], "0");
2896 		    else {
2897 			/* This was iex= (int)floor(log10(fabs(xx)))
2898 			   That's wrong, as xx might get rounded up,
2899 			   and we do need some fuzz or 99.5 is correct.
2900 			*/
2901 			double xxx = fabs(xx), X;
2902 			iex = (int)floor(log10(xxx) + 1e-12);
2903 			X = fround(xxx/Rexp10((double)iex) + 1e-12,
2904 				   (double)(dig-1));
2905 			if(iex > 0 &&  X >= 10) {
2906 			    xx = X * Rexp10((double)iex);
2907 			    iex++;
2908 			}
2909 			if(iex == -4 && fabs(xx)< 1e-4) {/* VERY rare case */
2910 			    iex = -5;
2911 			}
2912 			if(iex < -4) {
2913 				/* "g" would result in 'e-' representation:*/
2914 			    snprintf(result[i], strlen(result[i]) + 1,
2915 				     f0, dig-1 + -iex, xx);
2916 #ifdef DEBUG
2917 			    fprintf(stderr, " x[%d]=%g, iex=%d\n", i, xx, iex);
2918 			    fprintf(stderr, "\tres. = '%s'; ", result[i]);
2919 #endif
2920 			    /* Remove trailing  "0"s __ IFF flag has no '#': */
2921 			    if(rm_trailing_0) {
2922 				j = strlen(result[i])-1;
2923 #ifdef DEBUG
2924 				int jL = j;
2925 #endif
2926 				while(result[i][j] == '0') j--;
2927 				result[i][j+1] = '\0';
2928 #ifdef DEBUG
2929 				fprintf(stderr, "\t>>> jL=%d, j=%d; new res= '%s'\n",
2930 					jL, j, result[i]);
2931 #endif
2932 			    }
2933 
2934 			} else { /* iex >= -4:	NOT "e-" */
2935 				/* if iex >= dig, would have "e+" representation */
2936 #ifdef DEBUG
2937 			    fprintf(stderr, "\t  iex >= -4; using %d for 'dig'\n",
2938 				    (iex >= dig) ? (iex+1) : dig);
2939 #endif
2940 			    snprintf(result[i], strlen(result[i]) + 1,
2941 				     form, width, (iex >= dig) ? (iex+1) : dig, xx);
2942 			}
2943 		    } /* xx != 0 */
2944 		} /* if(do_fg) for(i..) */
2945 	    else
2946 		for (R_xlen_t i = 0; i < n; i++)
2947 		    snprintf(result[i], strlen(result[i]) + 1,
2948 			     form, width, dig, ((double *)x)[i]);
2949 	} else
2950 	    error("'type' must be \"real\" for this format");
2951     }
2952     vmaxset(vmax);
2953 }
2954 
2955 
2956 /* wrap original DATAPTR based str_signif in ITERATE_BY_REGION calls to
2957    support ALTREPs
2958 
2959    We still accept type because it is part of the defined API and only defaults
2960    to matching the SEXP type.
2961 */
2962 static
2963 void str_signif_sexp(SEXP x, const char *type, int width, int digits,
2964 		     const char *format, const char *flag, char **result)
2965 {
2966     /* result + idx is the overall position of the chunk we're populating */
2967     if(TYPEOF(x) == INTSXP) {
2968 	ITERATE_BY_REGION(x, px, idx, nb, int, INTEGER,
2969 			  {
2970 			      str_signif((void *) px, nb, type, width, digits,
2971 					 format, flag, result + idx);
2972 			  });
2973     } else if (TYPEOF(x) == REALSXP) {
2974 	ITERATE_BY_REGION(x, px, idx, nb, double, REAL,
2975 			  {
2976 			      str_signif((void *) px, nb, type, width, digits,
2977 					 format, flag, result + idx);
2978 			  });
2979     } else {
2980 	error("unsupported type ");
2981     }
2982 }
2983 
2984 /* added in R 4.1.0.
2985    This checks if it succeeds.
2986    FIXME: is this worth inlining?
2987  */
2988 char *Rstrdup(const char *s)
2989 {
2990     size_t nb = strlen(s) + 1;
2991     void *cpy = malloc(nb);
2992     if (cpy == NULL) error("allocation error in Rstrdup");
2993     memcpy (cpy, s, nb);
2994     return (char *) cpy;
2995 }
2996