1 /*
2  *  R : A Computer Language for Statistical Data Analysis
3  *  Copyright (C) 2003-2021   The R Core Team.
4  *
5  *  This program is free software; you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation; either version 2 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program; if not, a copy is available at
17  *  https://www.R-project.org/Licenses/
18  */
19 
20 /* <UTF8> OK, provided the delimiters are ASCII
21    match length is in now in chars.
22 */
23 
24 #include <string.h>
25 #include <R.h>
26 #include "tools.h"
27 
28 #include <stdlib.h> /* for MB_CUR_MAX */
29 #include <wchar.h>
30 LibExtern Rboolean mbcslocale;
31 LibExtern int R_MB_CUR_MAX;
32 
33 size_t Rf_mbrtowc(wchar_t *wc, const char *s, size_t n, mbstate_t *ps);
34 
35 /* .Call, so manages R_alloc stack */
36 SEXP
delim_match(SEXP x,SEXP delims)37 delim_match(SEXP x, SEXP delims)
38 {
39     /*
40       Match delimited substrings in a character vector x.
41 
42       Returns an integer vector with the same length of x giving the
43       starting position of the match (including the start delimiter), or
44       -1 if there is none, with attribute "match.length" giving the
45       length of the matched text (including the end delimiter), or -1
46       for no match.
47 
48       This is still very experimental.
49 
50       Currently, the start and end delimiters must be single characters;
51       it would be nice to allow for arbitrary regexps.
52 
53       Currently, the only syntax supported is Rd ('\' is the escape
54       character, '%' starts a comment extending to the next newline, no
55       quote characters.  It would be nice to generalize this, too.
56 
57       Nevertheless, this is already useful for parsing Rd.
58     */
59 
60     char c;
61     const char *s, *delim_start, *delim_end;
62     Sint n, i, pos, start, end, delim_depth;
63     int lstart, lend;
64     Rboolean is_escaped, equal_start_and_end_delims;
65     SEXP ans, matchlen;
66     mbstate_t mb_st; int used;
67 
68     if(!isString(x) || !isString(delims) || (length(delims) != 2))
69 	error(_("invalid argument type"));
70 
71     delim_start = translateChar(STRING_ELT(delims, 0));
72     delim_end = translateChar(STRING_ELT(delims, 1));
73     lstart = (int) strlen(delim_start); lend = (int) strlen(delim_end);
74     equal_start_and_end_delims = strcmp(delim_start, delim_end) == 0;
75 
76     n = length(x);
77     PROTECT(ans = allocVector(INTSXP, n));
78     PROTECT(matchlen = allocVector(INTSXP, n));
79 
80     for(i = 0; i < n; i++) {
81 	memset(&mb_st, 0, sizeof(mbstate_t));
82 	start = end = -1;
83 	s = translateChar(STRING_ELT(x, i));
84 	pos = is_escaped = delim_depth = 0;
85 	while((c = *s) != '\0') {
86 	    if(c == '\n') {
87 		is_escaped = FALSE;
88 	    }
89 	    else if(c == '\\') {
90 		is_escaped = is_escaped ? FALSE : TRUE;
91 	    }
92 	    else if(is_escaped) {
93 		is_escaped = FALSE;
94 	    }
95 	    else if(c == '%') {
96 		while((c != '\0') && (c != '\n')) {
97 		    if(mbcslocale) {
98 			used = (int) Rf_mbrtowc(NULL, s, R_MB_CUR_MAX, &mb_st);
99 			if(used == 0) break;
100 			s += used; c = *s;
101 		    } else
102 			c = *++s;
103 		    pos++;
104 		}
105 	    }
106 	    else if(strncmp(s, delim_end, lend) == 0) {
107 		if(delim_depth > 1) delim_depth--;
108 		else if(delim_depth == 1) {
109 		    end = pos;
110 		    break;
111 		}
112 		else if(equal_start_and_end_delims) {
113 		    start = pos;
114 		    delim_depth++;
115 		}
116 	    }
117 	    else if(strncmp(s, delim_start, lstart) == 0) {
118 		if(delim_depth == 0) start = pos;
119 		delim_depth++;
120 	    }
121 	    if(mbcslocale) {
122 		used = (int) Rf_mbrtowc(NULL, s, R_MB_CUR_MAX, &mb_st);
123 		if(used == 0) break;
124 		s += used;
125 	    } else
126 		s++;
127 	    pos++;
128 	}
129 	if(end > -1) {
130 	    INTEGER(ans)[i] = start + 1; /* index from one */
131 	    INTEGER(matchlen)[i] = end - start + 1;
132 	}
133 	else {
134 	    INTEGER(ans)[i] = INTEGER(matchlen)[i] = -1;
135 	}
136     }
137     setAttrib(ans, install("match.length"), matchlen);
138     UNPROTECT(2);
139     return(ans);
140 }
141 
142 SEXP
check_nonASCII(SEXP text,SEXP ignore_quotes)143 check_nonASCII(SEXP text, SEXP ignore_quotes)
144 {
145     /* Check if all the lines in 'text' are ASCII, after removing
146        comments and ignoring the contents of quotes (unless ignore_quotes)
147        (which might span more than one line and might be escaped).
148 
149        This cannot be entirely correct, as quotes and \ might occur as
150        part of another character in a MBCS: but this does not happen
151        in UTF-8.
152     */
153     int i, nbslash = 0; /* number of preceding backslashes */
154     const char *p;
155     char quote= '\0';
156     Rboolean ign, inquote = FALSE;
157 
158     if(TYPEOF(text) != STRSXP) error("invalid input");
159     ign = asLogical(ignore_quotes);
160     if(ign == NA_LOGICAL) error("'ignore_quotes' must be TRUE or FALSE");
161 
162     for (i = 0; i < LENGTH(text); i++) {
163 	p = CHAR(STRING_ELT(text, i)); // ASCII or not not affected by charset
164 	inquote = FALSE; /* avoid runaway quotes */
165 	for(; *p; p++) {
166 	    if(!inquote && *p == '#') break;
167 	    if(!inquote || ign) {
168 		if((unsigned int) *p > 127) {
169 		    /* Rprintf("%s\n", CHAR(STRING_ELT(text, i)));
170 		       Rprintf("found %x\n", (unsigned int) *p); */
171 		    return ScalarLogical(TRUE);
172 		}
173 	    }
174 	    if((nbslash % 2 == 0) && (*p == '"' || *p == '\'')) {
175 		if(inquote && *p == quote) {
176 		    inquote = FALSE;
177 		} else if(!inquote) {
178 		    quote = *p;
179 		    inquote = TRUE;
180 		}
181 	    }
182 	    if(*p == '\\') nbslash++; else nbslash = 0;
183 	}
184     }
185     return ScalarLogical(FALSE);
186 }
187 
check_nonASCII2(SEXP text)188 SEXP check_nonASCII2(SEXP text)
189 {
190     SEXP ans = R_NilValue;
191     int i, m = 0, m_all = 100, *ind, *ians, yes;
192     const char *p;
193 
194     if(TYPEOF(text) != STRSXP) error("invalid input");
195     ind = Calloc(m_all, int);
196     for (i = 0; i < LENGTH(text); i++) {
197 	p = CHAR(STRING_ELT(text, i));
198 	yes = 0;
199 	for(; *p; p++)
200 	    if((unsigned int) *p > 127) {
201 		yes = 1;
202 		break;
203 	    }
204 	if(yes) {
205 	    if(m >= m_all) {
206 		m_all *= 2;
207 		ind = Realloc(ind, m_all, int);
208 	    }
209 	    ind[m++] = i + 1; /* R is 1-based */
210 	}
211     }
212     if(m) {
213 	ans = allocVector(INTSXP, m);
214 	ians = INTEGER(ans);
215 	for(i = 0; i < m; i++) ians[i] = ind[i];
216     }
217     Free(ind);
218     return ans;
219 }
220 
doTabExpand(SEXP strings,SEXP starts)221 SEXP doTabExpand(SEXP strings, SEXP starts)  /* does tab expansion for UTF-8 strings only */
222 {
223     int bufsize = 1024;
224     char *buffer = malloc(bufsize*sizeof(char));
225     if (buffer == NULL) error(_("out of memory"));
226     SEXP result = PROTECT(allocVector(STRSXP, length(strings)));
227     for (int i = 0; i < length(strings); i++) {
228 	char *b;
229 	const char *input = CHAR(STRING_ELT(strings, i));
230 	int start = INTEGER(starts)[i];
231 	for (b = buffer; *input; ) {
232 	    /* only the first byte of multi-byte chars counts */
233 	    if (0x80 <= (unsigned char)*input && (unsigned char)*input <= 0xBF)
234 		start--;
235 	    else if (*input == '\n')
236 		start = (int)(buffer-b-1);
237 	    if (*input == '\t') do {
238 		    *b++ = ' ';
239 		} while (((b-buffer+start) & 7) != 0);
240 	    else *b++ = *input;
241 	    if (b - buffer >= bufsize - 8) {
242 		int pos = (int)(b - buffer);
243 		bufsize *= 2;
244 		char *tmp = realloc(buffer, bufsize*sizeof(char));
245 		if (!tmp) {
246 		    free(buffer); // free original allocation
247 		    error(_("out of memory"));
248 		} else buffer = tmp; // and realloc freed original buffer
249 		b = buffer + pos;
250 	    }
251 	    input++;
252 	}
253 	*b = '\0';
254 	SET_STRING_ELT(result, i, mkCharCE(buffer, Rf_getCharCE(STRING_ELT(strings, i))));
255     }
256     UNPROTECT(1);
257     free(buffer);
258     return result; // -fanalyzer claims b leaks, but maybe it does not understand realloc
259 }
260 
261 /* This could be done in wchar_t, but it is only used for
262    ASCII delimiters which are not lead bytes in UTF-8 or
263    DBCS encodings. */
splitString(SEXP string,SEXP delims)264 SEXP splitString(SEXP string, SEXP delims)
265 {
266     if(!isString(string) || length(string) != 1)
267 	error("first arg must be a single character string");
268     if(!isString(delims) || length(delims) != 1)
269 	error("first arg must be a single character string");
270 
271     if(STRING_ELT(string, 0) == NA_STRING)
272 	return ScalarString(NA_STRING);
273     if(STRING_ELT(delims, 0) == NA_STRING)
274 	return ScalarString(NA_STRING);
275 
276     const char *in = CHAR(STRING_ELT(string, 0)),
277 	*del = CHAR(STRING_ELT(delims, 0));
278     cetype_t ienc = getCharCE(STRING_ELT(string, 0));
279     int nc = (int) strlen(in), used = 0;
280 
281     // Used for short strings, so OK to over-allocate wildly
282     SEXP out = PROTECT(allocVector(STRSXP, nc));
283     const char *p;
284     char tmp[nc], *this = tmp;
285     int nthis = 0;
286     for(p = in; *p ; p++) {
287 	if(strchr(del, *p)) {
288 	    // put out current string (if any)
289 	    if(nthis)
290 		SET_STRING_ELT(out, used++, mkCharLenCE(tmp, nthis, ienc));
291 	    // put out delimiter
292 	    SET_STRING_ELT(out, used++, mkCharLen(p, 1));
293 	    // restart
294 	    this = tmp; nthis = 0;
295 	} else {
296 	    *this++ = *p;
297 	    nthis++;
298 	}
299     }
300     if(nthis) SET_STRING_ELT(out, used++, mkCharLenCE(tmp, nthis, ienc));
301 
302     SEXP ans = lengthgets(out, used);
303     UNPROTECT(1);
304     return ans;
305 }
306