1 /*
2 * R : A Computer Language for Statistical Data Analysis
3 * Copyright (C) 2003-2021 The R Core Team.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, a copy is available at
17 * https://www.R-project.org/Licenses/
18 */
19
20 /* <UTF8> OK, provided the delimiters are ASCII
21 match length is in now in chars.
22 */
23
24 #include <string.h>
25 #include <R.h>
26 #include "tools.h"
27
28 #include <stdlib.h> /* for MB_CUR_MAX */
29 #include <wchar.h>
30 LibExtern Rboolean mbcslocale;
31 LibExtern int R_MB_CUR_MAX;
32
33 size_t Rf_mbrtowc(wchar_t *wc, const char *s, size_t n, mbstate_t *ps);
34
35 /* .Call, so manages R_alloc stack */
36 SEXP
delim_match(SEXP x,SEXP delims)37 delim_match(SEXP x, SEXP delims)
38 {
39 /*
40 Match delimited substrings in a character vector x.
41
42 Returns an integer vector with the same length of x giving the
43 starting position of the match (including the start delimiter), or
44 -1 if there is none, with attribute "match.length" giving the
45 length of the matched text (including the end delimiter), or -1
46 for no match.
47
48 This is still very experimental.
49
50 Currently, the start and end delimiters must be single characters;
51 it would be nice to allow for arbitrary regexps.
52
53 Currently, the only syntax supported is Rd ('\' is the escape
54 character, '%' starts a comment extending to the next newline, no
55 quote characters. It would be nice to generalize this, too.
56
57 Nevertheless, this is already useful for parsing Rd.
58 */
59
60 char c;
61 const char *s, *delim_start, *delim_end;
62 Sint n, i, pos, start, end, delim_depth;
63 int lstart, lend;
64 Rboolean is_escaped, equal_start_and_end_delims;
65 SEXP ans, matchlen;
66 mbstate_t mb_st; int used;
67
68 if(!isString(x) || !isString(delims) || (length(delims) != 2))
69 error(_("invalid argument type"));
70
71 delim_start = translateChar(STRING_ELT(delims, 0));
72 delim_end = translateChar(STRING_ELT(delims, 1));
73 lstart = (int) strlen(delim_start); lend = (int) strlen(delim_end);
74 equal_start_and_end_delims = strcmp(delim_start, delim_end) == 0;
75
76 n = length(x);
77 PROTECT(ans = allocVector(INTSXP, n));
78 PROTECT(matchlen = allocVector(INTSXP, n));
79
80 for(i = 0; i < n; i++) {
81 memset(&mb_st, 0, sizeof(mbstate_t));
82 start = end = -1;
83 s = translateChar(STRING_ELT(x, i));
84 pos = is_escaped = delim_depth = 0;
85 while((c = *s) != '\0') {
86 if(c == '\n') {
87 is_escaped = FALSE;
88 }
89 else if(c == '\\') {
90 is_escaped = is_escaped ? FALSE : TRUE;
91 }
92 else if(is_escaped) {
93 is_escaped = FALSE;
94 }
95 else if(c == '%') {
96 while((c != '\0') && (c != '\n')) {
97 if(mbcslocale) {
98 used = (int) Rf_mbrtowc(NULL, s, R_MB_CUR_MAX, &mb_st);
99 if(used == 0) break;
100 s += used; c = *s;
101 } else
102 c = *++s;
103 pos++;
104 }
105 }
106 else if(strncmp(s, delim_end, lend) == 0) {
107 if(delim_depth > 1) delim_depth--;
108 else if(delim_depth == 1) {
109 end = pos;
110 break;
111 }
112 else if(equal_start_and_end_delims) {
113 start = pos;
114 delim_depth++;
115 }
116 }
117 else if(strncmp(s, delim_start, lstart) == 0) {
118 if(delim_depth == 0) start = pos;
119 delim_depth++;
120 }
121 if(mbcslocale) {
122 used = (int) Rf_mbrtowc(NULL, s, R_MB_CUR_MAX, &mb_st);
123 if(used == 0) break;
124 s += used;
125 } else
126 s++;
127 pos++;
128 }
129 if(end > -1) {
130 INTEGER(ans)[i] = start + 1; /* index from one */
131 INTEGER(matchlen)[i] = end - start + 1;
132 }
133 else {
134 INTEGER(ans)[i] = INTEGER(matchlen)[i] = -1;
135 }
136 }
137 setAttrib(ans, install("match.length"), matchlen);
138 UNPROTECT(2);
139 return(ans);
140 }
141
142 SEXP
check_nonASCII(SEXP text,SEXP ignore_quotes)143 check_nonASCII(SEXP text, SEXP ignore_quotes)
144 {
145 /* Check if all the lines in 'text' are ASCII, after removing
146 comments and ignoring the contents of quotes (unless ignore_quotes)
147 (which might span more than one line and might be escaped).
148
149 This cannot be entirely correct, as quotes and \ might occur as
150 part of another character in a MBCS: but this does not happen
151 in UTF-8.
152 */
153 int i, nbslash = 0; /* number of preceding backslashes */
154 const char *p;
155 char quote= '\0';
156 Rboolean ign, inquote = FALSE;
157
158 if(TYPEOF(text) != STRSXP) error("invalid input");
159 ign = asLogical(ignore_quotes);
160 if(ign == NA_LOGICAL) error("'ignore_quotes' must be TRUE or FALSE");
161
162 for (i = 0; i < LENGTH(text); i++) {
163 p = CHAR(STRING_ELT(text, i)); // ASCII or not not affected by charset
164 inquote = FALSE; /* avoid runaway quotes */
165 for(; *p; p++) {
166 if(!inquote && *p == '#') break;
167 if(!inquote || ign) {
168 if((unsigned int) *p > 127) {
169 /* Rprintf("%s\n", CHAR(STRING_ELT(text, i)));
170 Rprintf("found %x\n", (unsigned int) *p); */
171 return ScalarLogical(TRUE);
172 }
173 }
174 if((nbslash % 2 == 0) && (*p == '"' || *p == '\'')) {
175 if(inquote && *p == quote) {
176 inquote = FALSE;
177 } else if(!inquote) {
178 quote = *p;
179 inquote = TRUE;
180 }
181 }
182 if(*p == '\\') nbslash++; else nbslash = 0;
183 }
184 }
185 return ScalarLogical(FALSE);
186 }
187
check_nonASCII2(SEXP text)188 SEXP check_nonASCII2(SEXP text)
189 {
190 SEXP ans = R_NilValue;
191 int i, m = 0, m_all = 100, *ind, *ians, yes;
192 const char *p;
193
194 if(TYPEOF(text) != STRSXP) error("invalid input");
195 ind = Calloc(m_all, int);
196 for (i = 0; i < LENGTH(text); i++) {
197 p = CHAR(STRING_ELT(text, i));
198 yes = 0;
199 for(; *p; p++)
200 if((unsigned int) *p > 127) {
201 yes = 1;
202 break;
203 }
204 if(yes) {
205 if(m >= m_all) {
206 m_all *= 2;
207 ind = Realloc(ind, m_all, int);
208 }
209 ind[m++] = i + 1; /* R is 1-based */
210 }
211 }
212 if(m) {
213 ans = allocVector(INTSXP, m);
214 ians = INTEGER(ans);
215 for(i = 0; i < m; i++) ians[i] = ind[i];
216 }
217 Free(ind);
218 return ans;
219 }
220
doTabExpand(SEXP strings,SEXP starts)221 SEXP doTabExpand(SEXP strings, SEXP starts) /* does tab expansion for UTF-8 strings only */
222 {
223 int bufsize = 1024;
224 char *buffer = malloc(bufsize*sizeof(char));
225 if (buffer == NULL) error(_("out of memory"));
226 SEXP result = PROTECT(allocVector(STRSXP, length(strings)));
227 for (int i = 0; i < length(strings); i++) {
228 char *b;
229 const char *input = CHAR(STRING_ELT(strings, i));
230 int start = INTEGER(starts)[i];
231 for (b = buffer; *input; ) {
232 /* only the first byte of multi-byte chars counts */
233 if (0x80 <= (unsigned char)*input && (unsigned char)*input <= 0xBF)
234 start--;
235 else if (*input == '\n')
236 start = (int)(buffer-b-1);
237 if (*input == '\t') do {
238 *b++ = ' ';
239 } while (((b-buffer+start) & 7) != 0);
240 else *b++ = *input;
241 if (b - buffer >= bufsize - 8) {
242 int pos = (int)(b - buffer);
243 bufsize *= 2;
244 char *tmp = realloc(buffer, bufsize*sizeof(char));
245 if (!tmp) {
246 free(buffer); // free original allocation
247 error(_("out of memory"));
248 } else buffer = tmp; // and realloc freed original buffer
249 b = buffer + pos;
250 }
251 input++;
252 }
253 *b = '\0';
254 SET_STRING_ELT(result, i, mkCharCE(buffer, Rf_getCharCE(STRING_ELT(strings, i))));
255 }
256 UNPROTECT(1);
257 free(buffer);
258 return result; // -fanalyzer claims b leaks, but maybe it does not understand realloc
259 }
260
261 /* This could be done in wchar_t, but it is only used for
262 ASCII delimiters which are not lead bytes in UTF-8 or
263 DBCS encodings. */
splitString(SEXP string,SEXP delims)264 SEXP splitString(SEXP string, SEXP delims)
265 {
266 if(!isString(string) || length(string) != 1)
267 error("first arg must be a single character string");
268 if(!isString(delims) || length(delims) != 1)
269 error("first arg must be a single character string");
270
271 if(STRING_ELT(string, 0) == NA_STRING)
272 return ScalarString(NA_STRING);
273 if(STRING_ELT(delims, 0) == NA_STRING)
274 return ScalarString(NA_STRING);
275
276 const char *in = CHAR(STRING_ELT(string, 0)),
277 *del = CHAR(STRING_ELT(delims, 0));
278 cetype_t ienc = getCharCE(STRING_ELT(string, 0));
279 int nc = (int) strlen(in), used = 0;
280
281 // Used for short strings, so OK to over-allocate wildly
282 SEXP out = PROTECT(allocVector(STRSXP, nc));
283 const char *p;
284 char tmp[nc], *this = tmp;
285 int nthis = 0;
286 for(p = in; *p ; p++) {
287 if(strchr(del, *p)) {
288 // put out current string (if any)
289 if(nthis)
290 SET_STRING_ELT(out, used++, mkCharLenCE(tmp, nthis, ienc));
291 // put out delimiter
292 SET_STRING_ELT(out, used++, mkCharLen(p, 1));
293 // restart
294 this = tmp; nthis = 0;
295 } else {
296 *this++ = *p;
297 nthis++;
298 }
299 }
300 if(nthis) SET_STRING_ELT(out, used++, mkCharLenCE(tmp, nthis, ienc));
301
302 SEXP ans = lengthgets(out, used);
303 UNPROTECT(1);
304 return ans;
305 }
306