1 /* This file is part of the 'stringi' project.
2 * Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from
17 * this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 */
31
32
33 #include "stri_stringi.h"
34 #include "stri_container_utf8.h"
35 #include "stri_container_utf16.h"
36 #include "stri_container_listraw.h"
37 #include "stri_container_listint.h"
38 #include "stri_string8buf.h"
39 #include "stri_ucnv.h"
40 #include <vector>
41
42
43 #define BUF_MAX_LENGTH 2147483647
44
45
46 /** Convert from UTF-32
47 *
48 * @param vec integer vector or list with integer vectors
49 * @return character vector
50 *
51 * @version 0.1-?? (Marek Gagolewski)
52 *
53 * @version 0.2-1 (Marek Gagolewski, 2014-03-25)
54 * StriException friently;
55 * use StriContainerListInt
56 *
57 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
58 * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
59 */
stri_enc_fromutf32(SEXP vec)60 SEXP stri_enc_fromutf32(SEXP vec)
61 {
62 PROTECT(vec = stri__prepare_arg_list_integer(vec, "vec"));
63
64 STRI__ERROR_HANDLER_BEGIN(1)
65 StriContainerListInt vec_cont(vec);
66 R_len_t vec_n = vec_cont.get_n();
67
68 // get required buf size
69 R_len_t bufsize = 0;
70 for (R_len_t i=0; i<vec_n; ++i) {
71 if (!vec_cont.isNA(i) && vec_cont.get(i).size() > bufsize)
72 bufsize = vec_cont.get(i).size();
73 }
74 bufsize = U8_MAX_LENGTH*bufsize+1; // this will surely be sufficient
75 String8buf buf(bufsize);
76 char* bufdata = buf.data();
77
78 SEXP ret;
79 STRI__PROTECT(ret = Rf_allocVector(STRSXP, vec_n));
80
81 for (R_len_t i=0; i<vec_n; ++i) {
82 if (vec_cont.isNA(i)) {
83 SET_STRING_ELT(ret, i, NA_STRING);
84 continue;
85 }
86
87 const int* cur_data = vec_cont.get(i).data();
88 R_len_t cur_n = vec_cont.get(i).size();
89 UChar32 c = (UChar32)0;
90 R_len_t j = 0;
91 R_len_t k = 0;
92 UBool err = FALSE;
93 while (!err && k < cur_n) {
94 c = cur_data[k++];
95 U8_APPEND((uint8_t*)bufdata, j, bufsize, c, err);
96
97 // Rf_mkCharLenCE detects embedded nuls, but stops execution completely
98 if (c == 0) err = TRUE;
99 }
100
101 if (err) {
102 Rf_warning(MSG__INVALID_CODE_POINT, (int)c);
103 SET_STRING_ELT(ret, i, NA_STRING);
104 }
105 else
106 SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, j, CE_UTF8));
107 }
108
109 STRI__UNPROTECT_ALL;
110 return ret;
111 STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
112 }
113
114
115 /** Convert character vector to UTF-32
116 *
117 * @param str character vector
118 * @return list with integer vectors
119 *
120 * @version 0.1-?? (Marek Gagolewski)
121 *
122 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
123 * make StriException-friendly
124 *
125 * @version 0.2-1 (Marek Gagolewski, 2014-03-26)
126 * use vector<UChar32> buf instead of R_alloc;
127 * warn and set NULL on improper UTF-8 byte sequences
128 *
129 * @version 0.2-3 (Marek Gagolewski, 2014-05-12)
130 * Use UChar32* instead of vector<UChar32> as ::data is C++11
131 *
132 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
133 * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
134 */
stri_enc_toutf32(SEXP str)135 SEXP stri_enc_toutf32(SEXP str)
136 {
137 PROTECT(str = stri__prepare_arg_string(str, "str"));
138 R_len_t n = LENGTH(str);
139
140 STRI__ERROR_HANDLER_BEGIN(1)
141 StriContainerUTF8 str_cont(str, n);
142
143 R_len_t bufsize = 1; // to avoid allocating an empty buffer
144 for (R_len_t i=0; i<n; ++i) {
145 if (str_cont.isNA(i)) continue;
146 R_len_t ni = str_cont.get(i).length();
147 if (ni > bufsize) bufsize = ni;
148 }
149
150 UChar32* buf = (UChar32*)R_alloc((size_t)bufsize, (int)sizeof(UChar32)); // at most bufsize UChars32 (bufsize/4 min.)
151 STRI_ASSERT(buf);
152 if (!buf) throw StriException(MSG__MEM_ALLOC_ERROR);
153 // deque<UChar32> was slower than using a common, over-sized buf
154
155 SEXP ret;
156 STRI__PROTECT(ret = Rf_allocVector(VECSXP, n)); // all
157
158 for (R_len_t i=0; i<n; ++i) {
159
160 if (str_cont.isNA(i)) {
161 SET_VECTOR_ELT(ret, i, R_NilValue);
162 continue;
163 }
164
165 UChar32 c = (UChar32)0;
166 const char* s = str_cont.get(i).c_str();
167 R_len_t sn = str_cont.get(i).length();
168 R_len_t j = 0;
169 R_len_t k = 0;
170 while (c >= 0 && j < sn) {
171 U8_NEXT(s, j, sn, c);
172 buf[k++] = (int)c;
173 }
174
175 if (c < 0) {
176 throw StriException(MSG__INVALID_UTF8);
177 // SET_VECTOR_ELT(ret, i, R_NilValue);
178 // continue;
179 }
180 else {
181 SEXP conv;
182 STRI__PROTECT(conv = Rf_allocVector(INTSXP, k));
183 memcpy(INTEGER(conv), buf, (size_t)sizeof(int)*k);
184 SET_VECTOR_ELT(ret, i, conv);
185 STRI__UNPROTECT(1);
186 }
187 }
188
189 STRI__UNPROTECT_ALL
190 return ret;
191 STRI__ERROR_HANDLER_END({ /* do nothing on error */ })
192 }
193
194
195 /** Convert character vector to UTF-8
196 *
197 * @param str character vector
198 * @param is_unknown_8bit single logical value;
199 * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8
200 * REPLACEMENT CHARACTERs (U+FFFD) are
201 * put for codes > 127
202 * @param validate single logical value (or NA)
203 *
204 * @return character vector
205 *
206 * @version 0.1-XX (Marek Gagolewski)
207 *
208 * @version 0.1-XX (Marek Gagolewski, 2013-06-16)
209 * make StriException-friendly
210 *
211 * @version 0.2-1 (Marek Gagolewski, 2014-03-26)
212 * Use one String8buf;
213 * is_unknown_8bit_logical and UTF-8 tries now to remove BOMs
214 *
215 * @version 0.2-1 (Marek Gagolewksi, 2014-03-30)
216 * added validate arg
217 *
218 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
219 * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
220 */
stri_enc_toutf8(SEXP str,SEXP is_unknown_8bit,SEXP validate)221 SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit, SEXP validate)
222 {
223 PROTECT(validate = stri__prepare_arg_logical_1(validate, "validate"));
224 bool is_unknown_8bit_logical =
225 stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit");
226 PROTECT(str = stri__prepare_arg_string(str, "str"));
227 R_len_t n = LENGTH(str);
228
229 STRI__ERROR_HANDLER_BEGIN(2)
230 SEXP ret;
231 if (!is_unknown_8bit_logical) {
232 // Trivial - everything we need is in StriContainerUTF8 :)
233 // which removes BOMs silently
234 StriContainerUTF8 str_cont(str, n);
235 STRI__PROTECT(ret = str_cont.toR());
236 }
237 else {
238 // get buf size
239 size_t bufsize = 0;
240 for (R_len_t i=0; i<n; ++i) {
241 SEXP curs = STRING_ELT(str, i);
242 if (curs == NA_STRING || IS_ASCII(curs) || IS_UTF8(curs))
243 continue;
244
245 size_t ni = LENGTH(curs);
246 if (ni > bufsize) bufsize = ni;
247 }
248 String8buf buf(bufsize*3); // either 1 byte < 127 or U+FFFD == 3 bytes UTF-8
249 char* bufdata = buf.data();
250
251 STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
252 for (R_len_t i=0; i<n; ++i) {
253 SEXP curs = STRING_ELT(str, i);
254 if (curs == NA_STRING) {
255 SET_STRING_ELT(ret, i, NA_STRING);
256 continue;
257 }
258
259 if (IS_ASCII(curs) || IS_UTF8(curs)) {
260 R_len_t curs_n = LENGTH(curs);
261 const char* curs_s = CHAR(curs); // TODO: ALTREP will be problematic?
262 if (curs_n >= 3 &&
263 (uint8_t)(curs_s[0]) == UTF8_BOM_BYTE1 &&
264 (uint8_t)(curs_s[1]) == UTF8_BOM_BYTE2 &&
265 (uint8_t)(curs_s[2]) == UTF8_BOM_BYTE3) {
266 // has BOM - get rid of it
267 SET_STRING_ELT(ret, i, Rf_mkCharLenCE(curs_s+3, curs_n-3, CE_UTF8));
268 }
269 else
270 SET_STRING_ELT(ret, i, curs);
271
272 continue;
273 }
274
275 // otherwise, we have an 8-bit encoding
276 R_len_t curn = LENGTH(curs);
277 const char* curs_tab = CHAR(curs); // TODO: ALTREP will be problematic?
278 R_len_t k = 0;
279 for (R_len_t j=0; j<curn; ++j) {
280 if (U8_IS_SINGLE(curs_tab[j]))
281 bufdata[k++] = curs_tab[j];
282 else { // 0xEF 0xBF 0xBD
283 bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
284 bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
285 bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
286 }
287 }
288 SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
289 }
290
291 }
292
293 // validate utf8 byte stream
294 if (LOGICAL(validate)[0] != FALSE) { // NA or TRUE
295 R_len_t ret_n = LENGTH(ret);
296 for (R_len_t i=0; i<ret_n; ++i) {
297 SEXP curs = STRING_ELT(ret, i);
298 if (curs == NA_STRING) continue;
299
300 const char* s = CHAR(curs); // TODO: ALTREP will be problematic?
301 R_len_t sn = LENGTH(curs);
302 R_len_t j = 0;
303 UChar32 c = 0;
304 while (c >= 0 && j < sn) {
305 U8_NEXT(s, j, sn, c);
306 }
307
308 if (c >= 0) continue; // valid, nothing to do
309
310 if (LOGICAL(validate)[0] == NA_LOGICAL) {
311 Rf_warning(MSG__INVALID_CODE_POINT_REPLNA);
312 SET_STRING_ELT(ret, i, NA_STRING);
313 }
314 else {
315 size_t bufsize = sn*3; // maximum: 1 byte -> U+FFFD (3 bytes)
316 String8buf buf(bufsize); // maximum: 1 byte -> U+FFFD (3 bytes)
317 char* bufdata = buf.data();
318
319 j = 0;
320 size_t k = 0;
321 UBool err = FALSE;
322 while (!err && j < sn) {
323 U8_NEXT(s, j, sn, c);
324 if (c >= 0) {
325 U8_APPEND((uint8_t*)bufdata, k, bufsize, c, err);
326 } else {
327 Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
328 bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
329 bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
330 bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
331 }
332 }
333
334 if (err) throw StriException(MSG__INTERNAL_ERROR);
335 SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
336 }
337 }
338 }
339
340 STRI__UNPROTECT_ALL
341 return ret;
342 STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
343 }
344
345
346 /** Convert character vector to ASCII
347 *
348 * All charcodes > 127 are replaced with subst chars (0x1A)
349 *
350 * @param str character vector
351 * @return character vector
352 *
353 * @version 0.1-?? (Marek Gagolewski)
354 *
355 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
356 * make StriException-friendly
357 *
358 * @version 0.2-1 (Marek Gagolewski, 2014-03-30)
359 * use single common buf;
360 * warn on invalid utf8 byte stream
361 *
362 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
363 * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
364 */
stri_enc_toascii(SEXP str)365 SEXP stri_enc_toascii(SEXP str)
366 {
367 PROTECT(str = stri__prepare_arg_string(str, "str"));
368 R_len_t n = LENGTH(str);
369
370 STRI__ERROR_HANDLER_BEGIN(1)
371
372 // get buf size
373 size_t bufsize = 0;
374 for (R_len_t i=0; i<n; ++i) {
375 SEXP curs = STRING_ELT(str, i);
376 if (curs == NA_STRING)
377 continue;
378
379 size_t ni = LENGTH(curs);
380 if (ni > bufsize) bufsize = ni;
381 }
382 String8buf buf(bufsize); // no more bytes than this needed
383 char* bufdata = buf.data();
384
385 SEXP ret;
386 STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
387 for (R_len_t i=0; i<n; ++i) {
388 SEXP curs = STRING_ELT(str, i);
389 if (curs == NA_STRING || IS_ASCII(curs)) {
390 // nothing to do
391 SET_STRING_ELT(ret, i, curs);
392 continue;
393 }
394
395 R_len_t curn = LENGTH(curs);
396 const char* curs_tab = CHAR(curs); // TODO: ALTREP will be problematic?
397
398 if (IS_UTF8(curs)) {
399 R_len_t k = 0, j = 0;
400 UChar32 c;
401 while (j<curn) {
402 U8_NEXT(curs_tab, j, curn, c);
403 if (c < 0) {
404 Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
405 bufdata[k++] = ASCII_SUBSTITUTE;
406 }
407 else if (c > ASCII_MAXCHARCODE)
408 bufdata[k++] = ASCII_SUBSTITUTE;
409 else
410 bufdata[k++] = (char)c;
411 }
412 SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
413 // the string will be marked as ASCII anyway by mkCharLenCE
414 }
415 else { // some 8-bit encoding
416 R_len_t k = 0;
417 for (R_len_t j=0; j<curn; ++j) {
418 if (U8_IS_SINGLE(curs_tab[j]))
419 bufdata[k++] = curs_tab[j];
420 else {
421 bufdata[k++] = (char)ASCII_SUBSTITUTE; // subst char in ascii
422 }
423 }
424 SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
425 // the string will be marked as ASCII anyway by mkCharLenCE
426 }
427 }
428
429 STRI__UNPROTECT_ALL
430 return ret;
431 STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
432 }
433
434
435 // ------------------------------------------------------------------------
436
437 /**
438 * Convert character vector between marked encodings and the encoding provided
439 *
440 * @param str input character vector
441 * @param to target encoding, \code{NULL} or \code{""} for default enc
442 * @param to_raw single logical, should list of raw vectors be returned?
443 * @return a converted character vector or list of raw vectors
444 *
445 * @version 0.1-?? (Marek Gagolewski, 2013-11-12)
446 *
447 * @version 0.2-1 (Marek Gagolewski, 2014-03-28)
448 * use StriUcnv
449 *
450 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
451 * calc required buf size a priori
452 *
453 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
454 * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
455 */
stri_encode_from_marked(SEXP str,SEXP to,SEXP to_raw)456 SEXP stri_encode_from_marked(SEXP str, SEXP to, SEXP to_raw)
457 {
458 PROTECT(str = stri__prepare_arg_string(str, "str"));
459 const char* selected_to = stri__prepare_arg_enc(to, "to", true); /* this is R_alloc'ed */
460 bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");
461
462 STRI__ERROR_HANDLER_BEGIN(1)
463 R_len_t str_n = LENGTH(str);
464 StriContainerUTF16 str_cont(str, str_n);
465
466 // get the number of strings to convert; if == 0, then you know what's the result
467 if (str_n <= 0) return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);
468
469 // Open converters
470 StriUcnv ucnv(selected_to);
471 UConverter* uconv_to = ucnv.getConverter(true /*register_callbacks*/);
472
473 // Get target encoding mark
474 cetype_t encmark_to = to_raw_logical?CE_BYTES:ucnv.getCE();
475
476 // Prepare out val
477 SEXP ret;
478 STRI__PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));
479
480 // calculate required buf size
481 size_t bufsize = 0;
482 for (R_len_t i=0; i<str_n; ++i) {
483 if (!str_cont.isNA(i) && (size_t)str_cont.get(i).length() > bufsize)
484 bufsize = str_cont.get(i).length();
485 }
486 bufsize = UCNV_GET_MAX_BYTES_FOR_STRING(bufsize, ucnv_getMaxCharSize(uconv_to));
487 // "The calculated size is guaranteed to be sufficient for this conversion."
488 if (bufsize > BUF_MAX_LENGTH)
489 bufsize = BUF_MAX_LENGTH;
490 String8buf buf(bufsize);
491
492 for (R_len_t i=0; i<str_n; ++i) {
493 if (str_cont.isNA(i)) {
494 if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
495 else SET_STRING_ELT(ret, i, NA_STRING);
496 continue;
497 }
498
499 R_len_t curn_tmp = str_cont.get(i).length();
500 const UChar* curs_tmp = str_cont.get(i).getBuffer(); // The buffer content is (probably) not NUL-terminated.
501 if (!curs_tmp)
502 throw StriException(MSG__INTERNAL_ERROR);
503
504 UErrorCode status = U_ZERO_ERROR;
505 ucnv_resetFromUnicode(uconv_to);
506 size_t bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
507 curs_tmp, curn_tmp, &status);
508 if (bufneed <= buf.size()) {
509 STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
510 }
511 else {// larger buffer needed
512 if (bufneed > BUF_MAX_LENGTH)
513 throw StriException(MSG__BUF_SIZE_EXCEEDED);
514 buf.resize(bufneed, false/*destroy contents*/);
515 status = U_ZERO_ERROR;
516 ucnv_resetFromUnicode(uconv_to);
517 bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
518 curs_tmp, curn_tmp, &status);
519 STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
520 }
521
522 if (to_raw_logical) {
523 SEXP outobj;
524 STRI__PROTECT(outobj = Rf_allocVector(RAWSXP, bufneed));
525 memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
526 SET_VECTOR_ELT(ret, i, outobj);
527 STRI__UNPROTECT(1);
528 }
529 else {
530 SET_STRING_ELT(ret, i,
531 Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
532 }
533 }
534
535 STRI__UNPROTECT_ALL
536 return ret;
537
538 STRI__ERROR_HANDLER_END({/* nothing special on error */})
539 }
540
541
542 /**
543 * Convert character vector between given encodings
544 *
545 * @param str input character/raw vector or list of raw vectors
546 * @param from source encoding, \code{NULL} or \code{""} for default enc
547 * @param to target encoding, \code{NULL} or \code{""} for default enc
548 * @param to_raw single logical, should list of raw vectors be returned?
549 * @return a converted character vector or list of raw vectors
550 *
551 * @version 0.1-?? (Marek Gagolewski)
552 *
553 * @version 0.1-?? (Marek Gagolewski)
554 * arg to_raw_added, encoding marking
555 *
556 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
557 * make StriException-friendly
558 *
559 * @version 0.1-?? (Marek Gagolewski, 2013-08-08)
560 * use StriContainerListRaw
561 *
562 * @version 0.1-?? (Marek Gagolewski, 2013-11-20)
563 * BUGFIX call stri_encode_from_marked if necessary
564 *
565 * @version 0.2-1 (Marek Gagolewski, 2014-03-28)
566 * use StriUcnv
567 *
568 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
569 * estimate required buf size a priori
570 *
571 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
572 * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
573 */
stri_encode(SEXP str,SEXP from,SEXP to,SEXP to_raw)574 SEXP stri_encode(SEXP str, SEXP from, SEXP to, SEXP to_raw)
575 {
576 const char* selected_from = stri__prepare_arg_enc(from, "from", true); /* this is R_alloc'ed */
577 if (!selected_from && Rf_isVectorAtomic(str) && !isRaw(str))
578 return stri_encode_from_marked(str, to, to_raw);
579 const char* selected_to = stri__prepare_arg_enc(to, "to", true); /* this is R_alloc'ed */
580 bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");
581
582 // raw vector, character vector, or list of raw vectors:
583 PROTECT(str = stri__prepare_arg_list_raw(str, "str"));
584
585
586 STRI__ERROR_HANDLER_BEGIN(1)
587 StriContainerListRaw str_cont(str);
588 R_len_t str_n = str_cont.get_n();
589
590 // get the number of strings to convert; if == 0, then you know what's the result
591 if (str_n <= 0) {
592 STRI__UNPROTECT_ALL
593 return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);
594 }
595
596 // Open converters
597 StriUcnv ucnv1(selected_from);
598 StriUcnv ucnv2(selected_to);
599 UConverter* uconv_from = ucnv1.getConverter(true /*register_callbacks*/);
600 UConverter* uconv_to = ucnv2.getConverter(true /*register_callbacks*/);
601
602 // Get target encoding mark
603 cetype_t encmark_to = to_raw_logical?CE_BYTES:ucnv2.getCE();
604
605 SEXP ret;
606 STRI__PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));
607
608
609 // // estimate required buf size
610 // size_t bufsize = 0;
611 // for (R_len_t i=0; i<str_n; ++i) {
612 // if (!str_cont.isNA(i) && (size_t)str_cont.get(i).length() > bufsize)
613 // bufsize = str_cont.get(i).length();
614 // }
615 // bufsize = bufsize*4; // this is just an estimate (for 8bit->utf8 conversions)
616 // String8buf buf(bufsize);
617 String8buf buf(0);
618
619
620 for (R_len_t i=0; i<str_n; ++i) {
621 if (str_cont.isNA(i)) {
622 if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
623 else SET_STRING_ELT(ret, i, NA_STRING);
624 continue;
625 }
626
627 const char* curs = str_cont.get(i).c_str();
628 R_len_t curn = str_cont.get(i).length();
629
630 UErrorCode status = U_ZERO_ERROR;
631 UnicodeString encs(curs, curn, uconv_from, status); // FROM -> UTF-16 [this is the slow part]
632 if (status == U_ILLEGAL_ARGUMENT_ERROR)
633 throw StriException(MSG__MEM_ALLOC_ERROR); // see #395
634 STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
635
636 R_len_t curn_tmp = encs.length();
637 const UChar* curs_tmp = encs.getBuffer(); // The buffer contents is (probably) not NUL-terminated.
638 if (!curs_tmp) {
639 throw StriException(MSG__INTERNAL_ERROR);
640 }
641
642 size_t bufneed = UCNV_GET_MAX_BYTES_FOR_STRING(curn_tmp, ucnv_getMaxCharSize(uconv_to));
643 // "The calculated size is guaranteed to be sufficient for this conversion."
644 if (bufneed > BUF_MAX_LENGTH)
645 bufneed = BUF_MAX_LENGTH;
646 buf.resize(bufneed, false/*destroy contents*/); // grows or stays as-is
647
648 status = U_ZERO_ERROR;
649 ucnv_resetFromUnicode(uconv_to);
650 bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp,
651 curn_tmp, &status);
652 if (bufneed <= buf.size()) {
653 STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
654 }
655 else {// larger buffer needed
656 if (bufneed > BUF_MAX_LENGTH)
657 throw StriException(MSG__BUF_SIZE_EXCEEDED);
658 buf.resize(bufneed, false/*destroy contents*/);
659 status = U_ZERO_ERROR;
660 ucnv_resetFromUnicode(uconv_to);
661 bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp,
662 curn_tmp, &status);
663 STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
664 }
665
666 if (to_raw_logical) {
667 SEXP outobj;
668 STRI__PROTECT(outobj = Rf_allocVector(RAWSXP, bufneed));
669 memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
670 SET_VECTOR_ELT(ret, i, outobj);
671 STRI__UNPROTECT(1);
672 }
673 else {
674 SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
675 }
676 }
677
678 STRI__UNPROTECT_ALL
679 return ret;
680
681 STRI__ERROR_HANDLER_END({/* no special action on error */})
682 }
683