1 /*
2 * R : A Computer Language for Statistical Data Analysis
3 * Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
4 * Copyright (C) 1998-2021 The R Core Team.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, a copy is available at
18 * https://www.R-project.org/Licenses/
19 */
20
21 /* <UTF8>
22 byte-level access needed checks.
23 OK in UTF-8 provided quotes, comment, sep and dec chars are ASCII.
24 Also OK in DBCS.
25
26 We use only ' ', tab, CR, LF as space chars.
27 There is also the possibility of other digits (which we should
28 probably continue to ignore).
29 */
30
31 #ifdef HAVE_CONFIG_H
32 #include <config.h>
33 #endif
34
35 #define R_USE_SIGNALS 1
36 #include <Defn.h>
37 #include <float.h> /* for DBL_DIG */
38 #include <Fileio.h>
39 #include <Rconnections.h>
40 #include <errno.h>
41 #include <Print.h>
42
43 #include <rlocale.h> /* for btowc */
44
45 #undef _
46 #ifdef ENABLE_NLS
47 #include <libintl.h>
48 #define _(String) dgettext ("utils", String)
49 #else
50 #define _(String) (String)
51 #endif
52
53
54 /* The size of vector initially allocated by scan */
55 #define SCAN_BLOCKSIZE 1000
56 /* The size of the console buffer */
57 /* NB: in Windows this also needs to be set in gnuwin32/getline/getline.c */
58 #define CONSOLE_PROMPT_SIZE 256
59
60 #define NO_COMCHAR 100000 /* won't occur even in Unicode */
61
62
63 /* The number of distinct strings to track */
64 #define MAX_STRINGS 10000
65
66
67 static unsigned char ConsoleBuf[CONSOLE_BUFFER_SIZE+1], *ConsoleBufp;
68 static int ConsoleBufCnt;
69 static char ConsolePrompt[CONSOLE_PROMPT_SIZE];
70
71 typedef struct {
72 SEXP NAstrings;
73 int quiet;
74 int sepchar; /* = 0 */ /* This gets compared to ints */
75 char decchar; /* = '.' */ /* This only gets compared to chars */
76 char quoteset[10]; /* = "" */
77 int comchar; /* = NO_COMCHAR */
78 int ttyflag; /* = 0 */
79 Rconnection con; /* = NULL */
80 Rboolean wasopen; /* = FALSE */
81 Rboolean escapes; /* = FALSE */
82 int save; /* = 0; */
83 Rboolean isLatin1; /* = FALSE */
84 Rboolean isUTF8; /* = FALSE */
85 Rboolean skipNul;
86 char convbuf[100];
87 } LocalData;
88
89 /* If mode = 0 use for numeric fields where "" is NA
90 If mode = 1 use for character fields where "" is verbatim unless
91 na.strings includes "" */
isNAstring(const char * buf,int mode,LocalData * d)92 static R_INLINE int isNAstring(const char *buf, int mode, LocalData *d)
93 {
94 int i;
95
96 if(!mode && strlen(buf) == 0) return 1;
97 for (i = 0; i < length(d->NAstrings); i++)
98 if (!strcmp(CHAR(STRING_ELT(d->NAstrings, i)), buf)) return 1;
99 return 0;
100 }
101
102
Rspace(unsigned int c)103 static R_INLINE Rboolean Rspace(unsigned int c)
104 {
105 if (c == ' ' || c == '\t' || c == '\n' || c == '\r') return TRUE;
106 #ifdef Win32
107 /* 0xa0 is NBSP in all 8-bit Windows locales */
108 if(!mbcslocale && c == 0xa0) return TRUE;
109 #else
110 /* 0xa0 is NBSP in Latin-1 */
111 if(known_to_be_latin1 && c == 0xa0) return TRUE;
112 #endif
113 return FALSE;
114 }
115
116
117 /* used by readline() and menu() */
ConsoleGetchar(void)118 static int ConsoleGetchar(void)
119 {
120 if (--ConsoleBufCnt < 0) {
121 ConsoleBuf[CONSOLE_BUFFER_SIZE] = '\0';
122 if (R_ReadConsole(ConsolePrompt, ConsoleBuf,
123 CONSOLE_BUFFER_SIZE, 0) == 0) {
124 R_ClearerrConsole();
125 return R_EOF;
126 }
127 ConsoleBufp = ConsoleBuf;
128 ConsoleBufCnt = (int) strlen((char *)ConsoleBuf);
129 ConsoleBufCnt--;
130 }
131 /* at this point we need to use unsigned char or similar */
132 return (int) *ConsoleBufp++;
133 }
134
135 /* used by scan() */
ConsoleGetcharWithPushBack(Rconnection con)136 static int ConsoleGetcharWithPushBack(Rconnection con)
137 {
138 // -fanalyzer says this can free curLine twice.
139 if(con->nPushBack > 0) {
140 char *curLine = con->PushBack[con->nPushBack-1];
141 int c = curLine[con->posPushBack++];
142 if(con->posPushBack >= strlen(curLine)) {
143 /* last character on a line, so pop the line */
144 free(curLine);
145 con->nPushBack--;
146 con->posPushBack = 0;
147 if(con->nPushBack == 0) free(con->PushBack);
148 }
149 return c;
150 } else
151 return ConsoleGetchar();
152 }
153
154 /* Like strtol, but for ints not longs and returns NA_INTEGER on overflow */
Strtoi(const char * nptr,int base)155 static int Strtoi(const char *nptr, int base)
156 {
157 long res;
158 char *endp;
159
160 errno = 0;
161 res = strtol(nptr, &endp, base);
162 if (*endp != '\0') res = NA_INTEGER;
163 /* next can happen on a 64-bit platform */
164 if (res > INT_MAX || res < INT_MIN) res = NA_INTEGER;
165 if (errno == ERANGE) res = NA_INTEGER;
166 return (int) res;
167 }
168
169 static double
Strtod(const char * nptr,char ** endptr,Rboolean NA,LocalData * d,int i_exact)170 Strtod (const char *nptr, char **endptr, Rboolean NA, LocalData *d, int i_exact)
171 {
172 return R_strtod5(nptr, endptr, d->decchar, NA, i_exact);
173 }
174
175 static Rcomplex
strtoc(const char * nptr,char ** endptr,Rboolean NA,LocalData * d,int i_exact)176 strtoc(const char *nptr, char **endptr, Rboolean NA, LocalData *d, int i_exact)
177 {
178 Rcomplex z;
179 double x, y;
180 char *s, *endp;
181
182 x = Strtod(nptr, &endp, NA, d, i_exact);
183 if (isBlankString(endp)) {
184 z.r = x; z.i = 0;
185 } else if (*endp == 'i') {
186 if (endp == nptr) {
187 z.r = NA_REAL; z.i = NA_REAL;
188 }
189 else {
190 z.r = 0; z.i = x;
191 endp++;
192 }
193 } else {
194 s = endp;
195 y = Strtod(s, &endp, NA, d, i_exact);
196 if (*endp == 'i') {
197 z.r = x; z.i = y;
198 endp++;
199 } else {
200 z.r = NA_REAL; z.i = NA_REAL;
201 endp = (char *) nptr; /* -Wall */
202 }
203 }
204 *endptr = endp;
205 return z;
206 }
207
208
scanchar_raw(LocalData * d)209 static R_INLINE int scanchar_raw(LocalData *d)
210 {
211 int c = (d->ttyflag) ? ConsoleGetcharWithPushBack(d->con) :
212 Rconn_fgetc(d->con);
213 if(c == 0) {
214 if(d->skipNul) {
215 do {
216 c = (d->ttyflag) ? ConsoleGetcharWithPushBack(d->con) :
217 Rconn_fgetc(d->con);
218 } while(c == 0);
219 }
220 }
221 return c;
222 }
223
unscanchar(int c,LocalData * d)224 static R_INLINE void unscanchar(int c, LocalData *d)
225 {
226 d->save = c;
227 }
228
229 /* For second bytes in a DBCS:
230 should not be called when a char is saved, but be cautious
231 */
scanchar2(LocalData * d)232 static R_INLINE int scanchar2(LocalData *d)
233 {
234 int next;
235 if (d->save) {
236 next = d->save;
237 d->save = 0;
238 } else
239 next = scanchar_raw(d);
240 return next;
241 }
242
scanchar(Rboolean inQuote,LocalData * d)243 static int scanchar(Rboolean inQuote, LocalData *d)
244 {
245 int next;
246 if (d->save) {
247 next = d->save;
248 d->save = 0;
249 } else
250 next = scanchar_raw(d);
251 if(next == d->comchar && !inQuote) {
252 do
253 next = scanchar_raw(d);
254 while (next != '\n' && next != R_EOF);
255 }
256 if(next == '\\' && d->escapes) {
257 next = scanchar_raw(d);
258 if ('0' <= next && next <= '8') {
259 int octal = next - '0';
260 if ('0' <= (next = scanchar_raw(d)) && next <= '8') {
261 octal = 8 * octal + next - '0';
262 if ('0' <= (next = scanchar_raw(d)) && next <= '8') {
263 octal = 8 * octal + next - '0';
264 } else unscanchar(next, d);
265 } else unscanchar(next, d);
266 next = octal;
267 } else
268 switch(next) {
269 case 'a': next = '\a'; break;
270 case 'b': next = '\b'; break;
271 case 'f': next = '\f'; break;
272 case 'n': next = '\n'; break;
273 case 'r': next = '\r'; break;
274 case 't': next = '\t'; break;
275 case 'v': next = '\v'; break;
276 case 'x': {
277 int val = 0; int i, ext;
278 for(i = 0; i < 2; i++) {
279 next = scanchar_raw(d);
280 if(next >= '0' && next <= '9') ext = next - '0';
281 else if (next >= 'A' && next <= 'F') ext = next - 'A' + 10;
282 else if (next >= 'a' && next <= 'f') ext = next - 'a' + 10;
283 else {unscanchar(next, d); break;}
284 val = 16*val + ext;
285 }
286 next = val;
287 }
288 break;
289 default:
290 /* Any other char and even EOF escapes to itself, but we
291 need to preserve \" etc inside quotes.
292 */
293 if(inQuote && strchr(d->quoteset, next)) {
294 unscanchar(next, d);
295 next = '\\';
296 }
297 break;
298 }
299 }
300 return next;
301 }
302
303
304 #include "RBufferUtils.h"
305
306
countfields(SEXP args)307 SEXP countfields(SEXP args)
308 {
309 SEXP ans, file, sep, bns, quotes, comstr;
310 int nfields, nskip, i, c, inquote, quote = 0;
311 int blocksize, nlines, blskip;
312 const char *p;
313 Rboolean dbcslocale = (R_MB_CUR_MAX == 2);
314 LocalData data = {NULL, 0, 0, '.', "", NO_COMCHAR, 0, NULL, FALSE,
315 FALSE, 0, FALSE, FALSE};
316 data.NAstrings = R_NilValue;
317
318 args = CDR(args);
319
320 file = CAR(args); args = CDR(args);
321 sep = CAR(args); args = CDR(args);
322 quotes = CAR(args); args = CDR(args);
323 nskip = asInteger(CAR(args)); args = CDR(args);
324 blskip = asLogical(CAR(args)); args = CDR(args);
325 comstr = CAR(args);
326 if (TYPEOF(comstr) != STRSXP || length(comstr) != 1)
327 error(_("invalid '%s' argument"), "comment.char");
328 p = translateChar(STRING_ELT(comstr, 0));
329 data.comchar = NO_COMCHAR; /* here for -Wall */
330 if (strlen(p) > 1)
331 error(_("invalid '%s' argument"), "comment.char");
332 else if (strlen(p) == 1) data.comchar = (unsigned char)*p;
333
334 if (nskip < 0 || nskip == NA_INTEGER) nskip = 0;
335 if (blskip == NA_LOGICAL) blskip = 1;
336
337 if (isString(sep) || isNull(sep)) {
338 if (length(sep) == 0) data.sepchar = 0;
339 else data.sepchar = (unsigned char) translateChar(STRING_ELT(sep, 0))[0];
340 /* gets compared to chars: bug prior to 1.7.0 */
341 } else error(_("invalid '%s' argument"), "sep");
342
343 if (isString(quotes)) {
344 const char *sc = translateChar(STRING_ELT(quotes, 0));
345 if (strlen(sc)) strcpy(data.quoteset, sc);
346 else strcpy(data.quoteset, "");
347 } else if (isNull(quotes))
348 strcpy(data.quoteset, "");
349 else
350 error(_("invalid quote symbol set"));
351
352 i = asInteger(file);
353 data.con = getConnection(i);
354 if(i == 0) {
355 data.ttyflag = 1;
356 } else {
357 data.ttyflag = 0;
358 data.wasopen = data.con->isopen;
359 if(!data.wasopen) {
360 strcpy(data.con->mode, "r");
361 if(!data.con->open(data.con))
362 error(_("cannot open the connection"));
363 if(!data.con->canread) {
364 data.con->close(data.con);
365 error(_("cannot read from this connection"));
366 }
367 } else {
368 if(!data.con->canread)
369 error(_("cannot read from this connection"));
370 }
371 for (i = 0; i < nskip; i++) /* MBCS-safe */
372 while ((c = scanchar(FALSE, &data)) != '\n' && c != R_EOF);
373 }
374
375 blocksize = SCAN_BLOCKSIZE;
376 PROTECT(ans = allocVector(INTSXP, blocksize));
377 nlines = 0;
378 nfields = 0;
379 inquote = 0;
380
381 data.save = 0;
382
383 for (;;) {
384 c = scanchar(inquote, &data);
385 if (c == R_EOF) {
386 if (nfields != 0)
387 INTEGER(ans)[nlines] = nfields;
388 else nlines--;
389 goto donecf;
390 }
391 else if (c == '\n') {
392 if (inquote) {
393 INTEGER(ans)[nlines] = NA_INTEGER;
394 nlines++;
395 } else if (nfields || !blskip) {
396 INTEGER(ans)[nlines] = nfields;
397 nlines++;
398 nfields = 0;
399 inquote = 0;
400 }
401 if (nlines == blocksize) {
402 bns = ans;
403 blocksize = 2 * blocksize;
404 ans = allocVector(INTSXP, blocksize);
405 UNPROTECT(1);
406 PROTECT(ans);
407 copyVector(ans, bns);
408 }
409 continue;
410 }
411 else if (data.sepchar) {
412 if (nfields == 0)
413 nfields++;
414 if (inquote && c == R_EOF) {
415 if(!data.wasopen) data.con->close(data.con);
416 error(_("quoted string on line %d terminated by EOF"), inquote);
417 }
418 if (inquote && c == quote)
419 inquote = 0;
420 else if (strchr(data.quoteset, c)) {
421 inquote = nlines + 1;
422 quote = c;
423 }
424 if (c == data.sepchar && !inquote)
425 nfields++;
426 }
427 else if (!Rspace(c)) {
428 if (strchr(data.quoteset, c)) {
429 quote = c;
430 inquote = nlines + 1;
431 while ((c = scanchar(inquote, &data)) != quote) {
432 if (c == R_EOF) {
433 if(!data.wasopen) data.con->close(data.con);
434 error(_("quoted string on line %d terminated by EOF"), inquote);
435 } else if (c == '\n') {
436 INTEGER(ans)[nlines] = NA_INTEGER;
437 nlines++;
438 if (nlines == blocksize) {
439 bns = ans;
440 blocksize = 2 * blocksize;
441 ans = allocVector(INTSXP, blocksize);
442 UNPROTECT(1);
443 PROTECT(ans);
444 copyVector(ans, bns);
445 }
446 }
447 }
448 inquote = 0;
449 } else {
450 do {
451 if(dbcslocale && btowc(c) == WEOF) scanchar2(&data);
452 c = scanchar(FALSE, &data);
453 } while (!Rspace(c) && c != R_EOF);
454 if (c == R_EOF) c = '\n';
455 unscanchar(c, &data);
456 }
457 nfields++;
458 }
459
460 }
461 donecf:
462 /* we might have a character that was unscanchar-ed.
463 So pushback if possible */
464 if (data.save && !data.ttyflag && data.wasopen) {
465 char line[2] = " ";
466 line[0] = (char) data.save;
467 con_pushback(data.con, FALSE, line);
468 }
469 if(!data.wasopen) data.con->close(data.con);
470
471 if (nlines < 0) {
472 UNPROTECT(1);
473 return R_NilValue;
474 }
475 if (nlines == blocksize) {
476 UNPROTECT(1);
477 return ans;
478 }
479
480 bns = allocVector(INTSXP, nlines+1);
481 for (i = 0; i <= nlines; i++)
482 INTEGER(bns)[i] = INTEGER(ans)[i];
483 UNPROTECT(1);
484 return bns;
485 }
486
487 /* A struct used by typeconvert to keep track of possible types for the input */
488 typedef struct typecvt_possible_types {
489 unsigned int islogical : 1;
490 unsigned int isinteger : 1;
491 unsigned int isreal : 1;
492 unsigned int iscomplex : 1;
493 } Typecvt_Info;
494
495
496 /* Sets fields of typeInfo, ruling out possible types based on s.
497 *
498 * The typeInfo struct should be initialized with all fields TRUE.
499 */
ruleout_types(const char * s,Typecvt_Info * typeInfo,LocalData * data,Rboolean exact)500 static void ruleout_types(const char *s, Typecvt_Info *typeInfo, LocalData *data,
501 Rboolean exact)
502 {
503 int res;
504 char *endp;
505
506 if (typeInfo->islogical) {
507 if (strcmp(s, "F") == 0 || strcmp(s, "T") == 0 ||
508 strcmp(s, "FALSE") == 0 || strcmp(s, "TRUE") == 0) {
509 typeInfo->isinteger = FALSE;
510 typeInfo->isreal = FALSE;
511 typeInfo->iscomplex = FALSE;
512 return; // short cut
513 } else {
514 typeInfo->islogical = FALSE;
515 }
516 }
517
518 if (typeInfo->isinteger) {
519 res = Strtoi(s, 10);
520 if (res == NA_INTEGER)
521 typeInfo->isinteger = FALSE;
522 }
523
524 if (typeInfo->isreal) {
525 Strtod(s, &endp, TRUE, data, exact);
526 if (!isBlankString(endp))
527 typeInfo->isreal = FALSE;
528 }
529
530 if (typeInfo->iscomplex) {
531 strtoc(s, &endp, TRUE, data, exact);
532 if (!isBlankString(endp))
533 typeInfo->iscomplex = FALSE;
534 }
535 }
536
537
538 /* type.convert(char, na.strings, as.is, dec, numerals) */
539
540 /* This is a horrible hack which is used in read.table to take a
541 character variable, if possible to convert it to a logical,
542 integer, numeric or complex variable. If this is not possible,
543 the result is a character string if as.is == TRUE
544 or a factor if as.is == FALSE. */
545
546
typeconvert(SEXP call,SEXP op,SEXP args,SEXP env)547 SEXP typeconvert(SEXP call, SEXP op, SEXP args, SEXP env)
548 {
549 SEXP cvec, a, dup, levs, dims, names, dec, numerals;
550 SEXP rval = R_NilValue; /* -Wall */
551 int i, j, len, asIs, i_exact;
552 Rboolean done = FALSE, exact;
553 char *endp;
554 const char *tmp = NULL;
555 LocalData data = {NULL, 0, 0, '.', "", NO_COMCHAR, 0, NULL, FALSE,
556 FALSE, 0, FALSE, FALSE};
557 Typecvt_Info typeInfo; /* keep track of possible types of cvec */
558 typeInfo.islogical = TRUE; /* we can't rule anything out initially */
559 typeInfo.isinteger = TRUE;
560 typeInfo.isreal = TRUE;
561 typeInfo.iscomplex = TRUE;
562 data.NAstrings = R_NilValue;
563
564 args = CDR(args);
565
566 if (!isString(CAR(args)))
567 error(_("the first argument must be of mode character"));
568
569 data.NAstrings = CADR(args);
570 if (TYPEOF(data.NAstrings) != STRSXP)
571 error(_("invalid '%s' argument"), "na.strings");
572
573 asIs = asLogical(CADDR(args));
574 if (asIs == NA_LOGICAL) asIs = 0;
575
576 dec = CADDDR(args);
577 if (isString(dec) || isNull(dec)) {
578 if (length(dec) == 0)
579 data.decchar = '.';
580 else
581 data.decchar = translateChar(STRING_ELT(dec, 0))[0];
582 }
583
584 numerals = CAD4R(args); // string, one of c("allow.loss", "warn.loss", "no.loss")
585 if (isString(numerals)) {
586 tmp = CHAR(STRING_ELT(numerals, 0));
587 if(strcmp(tmp, "allow.loss") == 0) {
588 i_exact = FALSE;
589 exact = FALSE;
590 } else if(strcmp(tmp, "warn.loss") == 0) {
591 i_exact = NA_INTEGER;
592 exact = FALSE;
593 } else if(strcmp(tmp, "no.loss") == 0) {
594 i_exact = TRUE;
595 exact = TRUE;
596 } else // should never happen
597 error(_("invalid 'numerals' string: \"%s\""), tmp);
598
599 } else { // (currently never happens): use default
600 i_exact = FALSE;
601 exact = FALSE;
602 }
603
604 cvec = CAR(args);
605 len = length(cvec);
606
607 /* save the dim/dimnames attributes */
608
609 PROTECT(dims = getAttrib(cvec, R_DimSymbol));
610 if (isArray(cvec))
611 PROTECT(names = getAttrib(cvec, R_DimNamesSymbol));
612 else
613 PROTECT(names = getAttrib(cvec, R_NamesSymbol));
614
615 /* Find the first non-NA entry (empty => NA) */
616 for (i = 0; i < len; i++) {
617 tmp = CHAR(STRING_ELT(cvec, i));
618 if (!(STRING_ELT(cvec, i) == NA_STRING || strlen(tmp) == 0
619 || isNAstring(tmp, 1, &data) || isBlankString(tmp)))
620 break;
621 }
622 if (i < len) { // Found non-NA entry; use it to screen:
623 ruleout_types(tmp, &typeInfo, &data, exact);
624 }
625
626 if (typeInfo.islogical) {
627 PROTECT(rval = allocVector(LGLSXP, len));
628 for (i = 0; i < len; i++) {
629 tmp = CHAR(STRING_ELT(cvec, i));
630 if (STRING_ELT(cvec, i) == NA_STRING || strlen(tmp) == 0
631 || isNAstring(tmp, 1, &data) || isBlankString(tmp))
632 LOGICAL(rval)[i] = NA_LOGICAL;
633 else {
634 if (strcmp(tmp, "F") == 0 || strcmp(tmp, "FALSE") == 0)
635 LOGICAL(rval)[i] = 0;
636 else if(strcmp(tmp, "T") == 0 || strcmp(tmp, "TRUE") == 0)
637 LOGICAL(rval)[i] = 1;
638 else {
639 typeInfo.islogical = FALSE;
640 ruleout_types(tmp, &typeInfo, &data, exact);
641 break;
642 }
643 }
644 }
645 if (typeInfo.islogical) done = TRUE; else UNPROTECT(1);
646 }
647
648 if (!done && typeInfo.isinteger) {
649 PROTECT(rval = allocVector(INTSXP, len));
650 for (i = 0; i < len; i++) {
651 tmp = CHAR(STRING_ELT(cvec, i));
652 if (STRING_ELT(cvec, i) == NA_STRING || strlen(tmp) == 0
653 || isNAstring(tmp, 1, &data) || isBlankString(tmp))
654 INTEGER(rval)[i] = NA_INTEGER;
655 else {
656 INTEGER(rval)[i] = Strtoi(tmp, 10);
657 if (INTEGER(rval)[i] == NA_INTEGER) {
658 typeInfo.isinteger = FALSE;
659 ruleout_types(tmp, &typeInfo, &data, exact);
660 break;
661 }
662 }
663 }
664 if(typeInfo.isinteger) done = TRUE; else UNPROTECT(1);
665 }
666
667 if (!done && typeInfo.isreal) {
668 PROTECT(rval = allocVector(REALSXP, len));
669 for (i = 0; i < len; i++) {
670 tmp = CHAR(STRING_ELT(cvec, i));
671 if (STRING_ELT(cvec, i) == NA_STRING || strlen(tmp) == 0
672 || isNAstring(tmp, 1, &data) || isBlankString(tmp))
673 REAL(rval)[i] = NA_REAL;
674 else {
675 REAL(rval)[i] = Strtod(tmp, &endp, FALSE, &data, i_exact);
676 if (!isBlankString(endp)) {
677 typeInfo.isreal = FALSE;
678 ruleout_types(tmp, &typeInfo, &data, exact);
679 break;
680 }
681 }
682 }
683 if(typeInfo.isreal) done = TRUE; else UNPROTECT(1);
684 }
685
686 if (!done && typeInfo.iscomplex) {
687 PROTECT(rval = allocVector(CPLXSXP, len));
688 for (i = 0; i < len; i++) {
689 tmp = CHAR(STRING_ELT(cvec, i));
690 if (STRING_ELT(cvec, i) == NA_STRING || strlen(tmp) == 0
691 || isNAstring(tmp, 1, &data) || isBlankString(tmp))
692 COMPLEX(rval)[i].r = COMPLEX(rval)[i].i = NA_REAL;
693 else {
694 COMPLEX(rval)[i] = strtoc(tmp, &endp, FALSE, &data, i_exact);
695 if (!isBlankString(endp)) {
696 typeInfo.iscomplex = FALSE;
697 /* this is not needed, unless other cases are added */
698 ruleout_types(tmp, &typeInfo, &data, exact);
699 break;
700 }
701 }
702 }
703 if(typeInfo.iscomplex) done = TRUE; else UNPROTECT(1);
704 }
705
706 if (!done) {
707 if (asIs) {
708 PROTECT(rval = duplicate(cvec));
709 for (i = 0; i < len; i++)
710 if(isNAstring(CHAR(STRING_ELT(rval, i)), 1, &data))
711 SET_STRING_ELT(rval, i, NA_STRING);
712 }
713 else {
714 PROTECT(dup = duplicated(cvec, FALSE));
715 j = 0;
716 for (i = 0; i < len; i++) {
717 /* <NA> is never to be a level here */
718 if (STRING_ELT(cvec, i) == NA_STRING) continue;
719 if (LOGICAL(dup)[i] == 0 && !isNAstring(CHAR(STRING_ELT(cvec, i)), 1, &data))
720 j++;
721 }
722
723 PROTECT(levs = allocVector(STRSXP,j));
724 j = 0;
725 for (i = 0; i < len; i++) {
726 if (STRING_ELT(cvec, i) == NA_STRING) continue;
727 if (LOGICAL(dup)[i] == 0 && !isNAstring(CHAR(STRING_ELT(cvec, i)), 1, &data))
728 SET_STRING_ELT(levs, j++, STRING_ELT(cvec, i));
729 }
730
731 /* We avoid an allocation by reusing dup,
732 * a LGLSXP of the right length
733 */
734 rval = dup;
735 SET_TYPEOF(rval, INTSXP);
736
737 /* put the levels in lexicographic order */
738
739 sortVector(levs, FALSE);
740
741 PROTECT(a = matchE(levs, cvec, NA_INTEGER, env));
742 for (i = 0; i < len; i++)
743 INTEGER(rval)[i] = INTEGER(a)[i];
744
745 setAttrib(rval, R_LevelsSymbol, levs);
746 PROTECT(a = mkString("factor"));
747 setAttrib(rval, R_ClassSymbol, a);
748 UNPROTECT(3);
749 }
750 }
751
752 setAttrib(rval, R_DimSymbol, dims);
753 setAttrib(rval, isArray(cvec) ? R_DimNamesSymbol : R_NamesSymbol, names);
754 UNPROTECT(3);
755 return rval;
756 }
757
758
759 /* Works with digits, but OK in UTF-8 */
menu(SEXP choices)760 SEXP menu(SEXP choices)
761 {
762 int c, j;
763 double first;
764 char buffer[MAXELTSIZE], *bufp = buffer;
765 LocalData data = {NULL, 0, 0, '.', "", NO_COMCHAR, 0, NULL, FALSE,
766 FALSE, 0, FALSE, FALSE};
767 data.NAstrings = R_NilValue;
768
769
770 if (!isString(choices))
771 error(_("invalid '%s' argument"), "choices");
772
773 snprintf(ConsolePrompt, CONSOLE_PROMPT_SIZE, _("Selection: "));
774
775 while ((c = ConsoleGetchar()) != '\n' && c != R_EOF) {
776 if (bufp >= &buffer[MAXELTSIZE - 2]) continue;
777 *bufp++ = (char) c;
778 }
779 *bufp++ = '\0';
780 ConsolePrompt[0] = '\0';
781
782 bufp = buffer;
783 while (Rspace((int)*bufp)) bufp++;
784 first = LENGTH(choices) + 1;
785 if (isdigit((int)*bufp)) {
786 first = Strtod(buffer, NULL, TRUE, &data, /*exact*/FALSE);
787 } else {
788 for (j = 0; j < LENGTH(choices); j++) {
789 if (streql(translateChar(STRING_ELT(choices, j)), buffer)) {
790 first = j + 1;
791 break;
792 }
793 }
794 }
795 return ScalarInteger((int)first);
796 }
797
798 /* readTableHead(file, nlines, comment.char, blank.lines.skip, quote, sep) */
799 /* simplified version of readLines, with skip of blank lines and
800 comment-only lines */
801 #define BUF_SIZE 1000
readtablehead(SEXP args)802 SEXP readtablehead(SEXP args)
803 {
804 SEXP file, comstr, ans = R_NilValue, ans2, quotes, sep;
805 int nlines, i, c, quote = 0, nread, nbuf, buf_size = BUF_SIZE,
806 blskip, skipNul;
807 const char *p; char *buf;
808 Rboolean empty, skip, firstnonwhite;
809 LocalData data = {NULL, 0, 0, '.', "", NO_COMCHAR, 0, NULL, FALSE,
810 FALSE, 0, FALSE, FALSE, FALSE};
811 data.NAstrings = R_NilValue;
812
813 args = CDR(args);
814
815 file = CAR(args); args = CDR(args);
816 nlines = asInteger(CAR(args)); args = CDR(args);
817 comstr = CAR(args); args = CDR(args);
818 blskip = asLogical(CAR(args)); args = CDR(args);
819 quotes = CAR(args); args = CDR(args);
820 sep = CAR(args); args = CDR(args);
821 skipNul = asLogical(CAR(args));
822
823 if (nlines <= 0 || nlines == NA_INTEGER)
824 error(_("invalid '%s' argument"), "nlines");
825 if (blskip == NA_LOGICAL) blskip = 1;
826 if (isString(quotes)) {
827 const char *sc = translateChar(STRING_ELT(quotes, 0));
828 /* FIXME: will leak memory at long jump */
829 // strdup allocates and can fail
830 if (strlen(sc)) strcpy(data.quoteset, sc);
831 else strcpy(data.quoteset, "");
832 } else if (isNull(quotes))
833 strcpy(data.quoteset, "");
834 else
835 error(_("invalid quote symbol set"));
836
837 if (TYPEOF(comstr) != STRSXP || length(comstr) != 1)
838 error(_("invalid '%s' argument"), "comment.char");
839 p = translateChar(STRING_ELT(comstr, 0));
840 data.comchar = NO_COMCHAR; /* here for -Wall */
841 if (strlen(p) > 1)
842 error(_("invalid '%s' argument"), "comment.char");
843 else if (strlen(p) == 1) data.comchar = (int)*p;
844 if (isString(sep) || isNull(sep)) {
845 if (length(sep) == 0) data.sepchar = 0;
846 else data.sepchar = (unsigned char) translateChar(STRING_ELT(sep, 0))[0];
847 /* gets compared to chars: bug prior to 1.7.0 */
848 } else error(_("invalid '%s' argument"), "sep");
849 if (skipNul == NA_LOGICAL) error(_("invalid '%s' argument"), "skipNul");
850 data.skipNul = skipNul;
851
852 i = asInteger(file);
853 data.con = getConnection(i);
854 data.ttyflag = (i == 0);
855 data.wasopen = data.con->isopen;
856 if(!data.wasopen) {
857 strcpy(data.con->mode, "r");
858 if(!data.con->open(data.con)) error(_("cannot open the connection"));
859 } else { /* for a non-blocking connection, more input may
860 have become available, so re-position */
861 if(data.con->canseek && !data.con->blocking)
862 data.con->seek(data.con, data.con->seek(data.con, -1, 1, 1), 1, 1);
863 }
864
865 /* FIXME: will leak memory at long jump */
866 buf = (char *) malloc(buf_size);
867 if(!buf)
868 error(_("cannot allocate buffer in 'readTableHead'"));
869
870 ans = PROTECT(allocVector(STRSXP, nlines));
871 for(nread = 0; nread < nlines; ) {
872 nbuf = 0; empty = TRUE; skip = FALSE; firstnonwhite = TRUE;
873 if (data.ttyflag)
874 snprintf(ConsolePrompt, CONSOLE_PROMPT_SIZE, "%d: ", nread);
875 /* want to interpret comments here, not in scanchar */
876 while((c = scanchar(TRUE, &data)) != R_EOF) {
877 if(nbuf >= buf_size - 3) {
878 buf_size *= 2;
879 /* FIXME: will leak memory at long jump */
880 char *tmp = (char *) realloc(buf, buf_size);
881 if(!tmp) {
882 free(buf);
883 error(_("cannot allocate buffer in 'readTableHead'"));
884 } else buf = tmp;
885 }
886 /* Need to handle escaped embedded quotes, and how they are
887 escaped depends on 'sep' */
888 if(quote) {
889 if(data.sepchar == 0 && c == '\\') {
890 /* all escapes should be passed through */
891 /* fillBuffer would not copy a backslash preceding quote */
892 buf[nbuf++] = (char) c;
893 c = scanchar(TRUE, &data);
894 if(c == R_EOF) {
895 free(buf);
896 error(_("\\ followed by EOF"));
897 }
898 buf[nbuf++] = (char) c;
899 continue;
900 } else if(quote && c == quote) {
901 if(data.sepchar == 0)
902 quote = 0;
903 else { /* need to check for doubled quote */
904 char c2 = (char) scanchar(TRUE, &data);
905 if(c2 == quote)
906 buf[nbuf++] = (char) c; /* and c = c2 */
907 else {
908 unscanchar(c2, &data);
909 quote = 0;
910 }
911 }
912 }
913 } else if(!skip && (firstnonwhite || data.sepchar != 0) && strchr(data.quoteset, c))
914 quote = c;
915 else if (!skip && data.sepchar == 0 && Rspace(c))
916 /* firstnonwhite stays true within quoted section */
917 firstnonwhite = TRUE;
918 else if (c != ' ' && c != '\t') firstnonwhite = FALSE;
919 /* A line is empty only if it contains nothing before
920 EOL, EOF or a comment char.
921 A line containing just white space is not empty if sep=","
922 However foo\nEOF does not have a final empty line.
923 */
924 if(empty && !skip)
925 if(c != '\n' && c != data.comchar) empty = FALSE;
926 if(!quote && !skip && c == data.comchar) skip = TRUE;
927 if(quote || c != '\n') buf[nbuf++] = (char) c; else break;
928 }
929 buf[nbuf] = '\0';
930 if(data.ttyflag && empty) goto no_more_lines;
931 if(!empty || (c != R_EOF && !blskip)) { /* see previous comment */
932 SET_STRING_ELT(ans, nread, mkChar(buf));
933 nread++;
934 if (strlen(buf) < nbuf) // PR#15625
935 warning("line %d appears to contain embedded nulls", nread);
936 }
937 if(c == R_EOF) goto no_more_lines;
938 }
939 UNPROTECT(1);
940 free(buf);
941 if(!data.wasopen) data.con->close(data.con);
942 return ans;
943
944 no_more_lines:
945 if(!data.wasopen) data.con->close(data.con);
946 if(nbuf > 0) { /* incomplete last line */
947 if(data.con->text && data.con->blocking) {
948 warning(_("incomplete final line found by readTableHeader on '%s'"),
949 data.con->description);
950 } else {
951 free(buf);
952 error(_("incomplete final line found by readTableHeader on '%s'"),
953 data.con->description);
954 }
955 }
956 free(buf);
957 PROTECT(ans2 = allocVector(STRSXP, nread));
958 for(i = 0; i < nread; i++)
959 SET_STRING_ELT(ans2, i, STRING_ELT(ans, i));
960 UNPROTECT(2);
961 return ans2;
962 }
963
964 /* --------- write.table --------- */
965
966 /* write.table(x, file, nr, nc, rnames, sep, eol, na, dec, quote, qstring)
967 x is a matrix or data frame
968 file is a connection
969 sep eol dec qstring are character strings
970 quote is a numeric vector
971 */
972
isna(SEXP x,R_xlen_t indx)973 static Rboolean isna(SEXP x, R_xlen_t indx)
974 {
975 Rcomplex rc;
976 switch(TYPEOF(x)) {
977 case LGLSXP:
978 return LOGICAL(x)[indx] == NA_LOGICAL;
979 break;
980 case INTSXP:
981 return INTEGER(x)[indx] == NA_INTEGER;
982 break;
983 case REALSXP:
984 return ISNAN(REAL(x)[indx]);
985 break;
986 case STRSXP:
987 return STRING_ELT(x, indx) == NA_STRING;
988 break;
989 case CPLXSXP:
990 rc = COMPLEX(x)[indx];
991 return ISNAN(rc.r) || ISNAN(rc.i);
992 break;
993 default:
994 break;
995 }
996 return FALSE;
997 }
998
999 /* a version of EncodeElement with different escaping of char strings */
1000 static const char
EncodeElement2(SEXP x,R_xlen_t indx,Rboolean quote,Rboolean qmethod,R_StringBuffer * buff,const char * dec)1001 *EncodeElement2(SEXP x, R_xlen_t indx, Rboolean quote,
1002 Rboolean qmethod, R_StringBuffer *buff, const char *dec)
1003 {
1004 int nbuf;
1005 char *q;
1006 const char *p, *p0;
1007
1008 if (indx < 0 || indx >= xlength(x))
1009 error(_("index out of range"));
1010 if(TYPEOF(x) == STRSXP) {
1011 const void *vmax = vmaxget();
1012 p0 = translateChar(STRING_ELT(x, indx));
1013 if(!quote) return p0;
1014 for(nbuf = 2, p = p0; *p; p++) /* find buffer length needed */
1015 nbuf += (*p == '"') ? 2 : 1;
1016 R_AllocStringBuffer(nbuf, buff);
1017 q = buff->data; *q++ = '"';
1018 for(p = p0; *p;) {
1019 if(*p == '"') *q++ = qmethod ? '\\' : '"';
1020 *q++ = *p++;
1021 }
1022 *q++ = '"'; *q = '\0';
1023 vmaxset(vmax);
1024 return buff->data;
1025 }
1026 return EncodeElement0(x, indx, quote ? '"' : 0, dec);
1027 }
1028
1029 typedef struct wt_info {
1030 Rboolean wasopen;
1031 Rconnection con;
1032 R_StringBuffer *buf;
1033 int savedigits;
1034 } wt_info;
1035
1036 /* utility to cleanup e.g. after interrupts */
wt_cleanup(void * data)1037 static void wt_cleanup(void *data)
1038 {
1039 wt_info *ld = data;
1040 if(!ld->wasopen) {
1041 errno = 0;
1042 ld->con->close(ld->con);
1043 if (ld->con->status != NA_INTEGER && ld->con->status < 0) {
1044 int serrno = errno;
1045 if (serrno)
1046 warning(_("Problem closing connection: %s"), strerror(serrno));
1047 else
1048 warning(_("Problem closing connection"));
1049 }
1050 }
1051 R_FreeStringBuffer(ld->buf);
1052 R_print.digits = ld->savedigits;
1053 }
1054
writetable(SEXP call,SEXP op,SEXP args,SEXP env)1055 SEXP writetable(SEXP call, SEXP op, SEXP args, SEXP env)
1056 {
1057 SEXP x, sep, rnames, eol, na, dec, quote, xj;
1058 Rboolean wasopen, quote_rn = FALSE, *quote_col;
1059 Rconnection con;
1060 const char *csep, *ceol, *cna, *sdec, *tmp = NULL /* -Wall */;
1061 SEXP *levels;
1062 R_StringBuffer strBuf = {NULL, 0, MAXELTSIZE};
1063 wt_info wi;
1064 RCNTXT cntxt;
1065
1066 args = CDR(args);
1067
1068 x = CAR(args); args = CDR(args);
1069 /* this is going to be a connection open or openable for writing */
1070 if(!inherits(CAR(args), "connection"))
1071 error(_("'file' is not a connection"));
1072 con = getConnection(asInteger(CAR(args))); args = CDR(args);
1073 if(!con->canwrite)
1074 error(_("cannot write to this connection"));
1075 wasopen = con->isopen;
1076 if(!wasopen) {
1077 strcpy(con->mode, "wt");
1078 if(!con->open(con)) error(_("cannot open the connection"));
1079 }
1080 int nr = asInteger(CAR(args)); args = CDR(args);
1081 int nc = asInteger(CAR(args)); args = CDR(args);
1082 rnames = CAR(args); args = CDR(args);
1083 sep = CAR(args); args = CDR(args);
1084 eol = CAR(args); args = CDR(args);
1085 na = CAR(args); args = CDR(args);
1086 dec = CAR(args); args = CDR(args);
1087 quote = CAR(args); args = CDR(args);
1088 int qmethod = asLogical(CAR(args));
1089
1090 if(nr == NA_INTEGER) error(_("invalid '%s' argument"), "nr");
1091 if(nc == NA_INTEGER) error(_("invalid '%s' argument"), "nc");
1092 if(!isNull(rnames) && !isString(rnames))
1093 error(_("invalid '%s' argument"), "rnames");
1094 if(!isString(sep)) error(_("invalid '%s' argument"), "sep");
1095 if(!isString(eol)) error(_("invalid '%s' argument"), "eol");
1096 if(!isString(na)) error(_("invalid '%s' argument"), "na");
1097 if(!isString(dec)) error(_("invalid '%s' argument"), "dec");
1098 if(qmethod == NA_LOGICAL) error(_("invalid '%s' argument"), "qmethod");
1099 csep = translateChar(STRING_ELT(sep, 0));
1100 ceol = translateChar(STRING_ELT(eol, 0));
1101 cna = translateChar(STRING_ELT(na, 0));
1102 sdec = translateChar(STRING_ELT(dec, 0));
1103 if(strlen(sdec) != 1)
1104 error(_("'dec' must be a single character"));
1105 quote_col = (Rboolean *) R_alloc(nc, sizeof(Rboolean));
1106 for(int j = 0; j < nc; j++) quote_col[j] = FALSE;
1107 for(int i = 0; i < length(quote); i++) { /* NB, quote might be NULL */
1108 int this = INTEGER(quote)[i];
1109 if(this == 0) quote_rn = TRUE;
1110 if(this > 0) quote_col[this - 1] = TRUE;
1111 }
1112 R_AllocStringBuffer(0, &strBuf);
1113 PrintDefaults();
1114 wi.savedigits = R_print.digits; R_print.digits = DBL_DIG;/* MAX precision */
1115 wi.con = con;
1116 wi.wasopen = wasopen;
1117 wi.buf = &strBuf;
1118 begincontext(&cntxt, CTXT_CCODE, call, R_BaseEnv, R_BaseEnv,
1119 R_NilValue, R_NilValue);
1120 cntxt.cend = &wt_cleanup;
1121 cntxt.cenddata = &wi;
1122
1123 if(isVectorList(x)) { /* A data frame */
1124
1125 /* handle factors internally, check integrity */
1126 levels = (SEXP *) R_alloc(nc, sizeof(SEXP));
1127 for(int j = 0; j < nc; j++) {
1128 xj = VECTOR_ELT(x, j);
1129 if(LENGTH(xj) != nr)
1130 error(_("corrupt data frame -- length of column %d does not match nrows"),
1131 j+1);
1132 if(inherits(xj, "factor")) {
1133 levels[j] = getAttrib(xj, R_LevelsSymbol);
1134 } else levels[j] = R_NilValue;
1135 }
1136
1137 for(int i = 0; i < nr; i++) {
1138 if(i % 1000 == 999) R_CheckUserInterrupt();
1139 if(!isNull(rnames))
1140 Rconn_printf(con, "%s%s",
1141 EncodeElement2(rnames, i, quote_rn, qmethod,
1142 &strBuf, sdec), csep);
1143 for(int j = 0; j < nc; j++) {
1144 xj = VECTOR_ELT(x, j);
1145 if(j > 0) Rconn_printf(con, "%s", csep);
1146 if(isna(xj, i)) tmp = cna;
1147 else {
1148 if(!isNull(levels[j])) {
1149 /* We do not assume factors have integer levels,
1150 although they should. */
1151 if(TYPEOF(xj) == INTSXP)
1152 tmp = EncodeElement2(levels[j], INTEGER(xj)[i] - 1,
1153 quote_col[j], qmethod,
1154 &strBuf, sdec);
1155 else if(TYPEOF(xj) == REALSXP)
1156 tmp = EncodeElement2(levels[j],
1157 (R_xlen_t) (REAL(xj)[i] - 1),
1158 quote_col[j], qmethod,
1159 &strBuf, sdec);
1160 else
1161 error(_("column %s claims to be a factor but does not have numeric codes"),
1162 j+1);
1163 } else {
1164 tmp = EncodeElement2(xj, i, quote_col[j], qmethod,
1165 &strBuf, sdec);
1166 }
1167 }
1168 Rconn_printf(con, "%s", tmp);
1169 }
1170 Rconn_printf(con, "%s", ceol);
1171 }
1172
1173 } else { /* A matrix */
1174
1175 if(!isVectorAtomic(x))
1176 UNIMPLEMENTED_TYPE("write.table, matrix method", x);
1177 /* quick integrity check */
1178 if(XLENGTH(x) != (R_xlen_t)nr * nc)
1179 error(_("corrupt matrix -- dims do not match length"));
1180
1181 for(int i = 0; i < nr; i++) {
1182 if(i % 1000 == 999) R_CheckUserInterrupt();
1183 if(!isNull(rnames))
1184 Rconn_printf(con, "%s%s",
1185 EncodeElement2(rnames, i, quote_rn, qmethod,
1186 &strBuf, sdec), csep);
1187 for(int j = 0; j < nc; j++) {
1188 if(j > 0) Rconn_printf(con, "%s", csep);
1189 if(isna(x, i + (R_xlen_t)j*nr)) tmp = cna;
1190 else {
1191 tmp = EncodeElement2(x, i + (R_xlen_t)j*nr,
1192 quote_col[j], qmethod,
1193 &strBuf, sdec);
1194 }
1195 Rconn_printf(con, "%s", tmp);
1196 }
1197 Rconn_printf(con, "%s", ceol);
1198 }
1199
1200 }
1201 endcontext(&cntxt);
1202 wt_cleanup(&wi);
1203 return R_NilValue;
1204 }
1205