1 /* This file is part of the 'stringi' project.
2  * Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from
17  * this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 
33 #include "stri_stringi.h"
34 #include "stri_container_utf8.h"
35 #include "stri_container_utf16.h"
36 #include "stri_container_listraw.h"
37 #include "stri_container_listint.h"
38 #include "stri_string8buf.h"
39 #include "stri_ucnv.h"
40 #include <vector>
41 
42 
43 #define BUF_MAX_LENGTH 2147483647
44 
45 
46 /** Convert from UTF-32
47  *
48  * @param vec integer vector or list with integer vectors
49  * @return character vector
50  *
51  * @version 0.1-?? (Marek Gagolewski)
52  *
53  * @version 0.2-1 (Marek Gagolewski, 2014-03-25)
54  *          StriException friendly;
55  *          use StriContainerListInt
56  *
57  * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
58  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
59  */
stri_enc_fromutf32(SEXP vec)60 SEXP stri_enc_fromutf32(SEXP vec)
61 {
62     PROTECT(vec = stri__prepare_arg_list_integer(vec, "vec"));
63 
64     STRI__ERROR_HANDLER_BEGIN(1)
65     StriContainerListInt vec_cont(vec);
66     R_len_t vec_n = vec_cont.get_n();
67 
68     // get required buf size
69     R_len_t bufsize = 0;
70     for (R_len_t i=0; i<vec_n; ++i) {
71         if (!vec_cont.isNA(i) && vec_cont.get(i).size() > bufsize)
72             bufsize = vec_cont.get(i).size();
73     }
74     bufsize = U8_MAX_LENGTH*bufsize+1; // this will surely be sufficient
75     String8buf buf(bufsize);
76     char* bufdata = buf.data();
77 
78     SEXP ret;
79     STRI__PROTECT(ret = Rf_allocVector(STRSXP, vec_n));
80 
81     for (R_len_t i=0; i<vec_n; ++i) {
82         if (vec_cont.isNA(i)) {
83             SET_STRING_ELT(ret, i, NA_STRING);
84             continue;
85         }
86 
87         const int* cur_data = vec_cont.get(i).data();
88         R_len_t    cur_n    = vec_cont.get(i).size();
89         UChar32 c = (UChar32)0;
90         R_len_t j = 0;
91         R_len_t k = 0;
92         UBool err = FALSE;
93         while (!err && k < cur_n) {
94             c = cur_data[k++];
95             U8_APPEND((uint8_t*)bufdata, j, bufsize, c, err);
96 
97             // Rf_mkCharLenCE detects embedded nuls, but stops execution completely
98             if (c == 0) err = TRUE;
99         }
100 
101         if (err) {
102             Rf_warning(MSG__INVALID_CODE_POINT, (int)c);
103             SET_STRING_ELT(ret, i, NA_STRING);
104         }
105         else
106             SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, j, CE_UTF8));
107     }
108 
109     STRI__UNPROTECT_ALL;
110     return ret;
111     STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
112 }
113 
114 
115 /** Convert character vector to UTF-32
116  *
117  * @param str character vector
118  * @return list with integer vectors
119  *
120  * @version 0.1-?? (Marek Gagolewski)
121  *
122  * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
123  *          make StriException-friendly
124  *
125  * @version 0.2-1 (Marek Gagolewski, 2014-03-26)
126  *          use vector<UChar32> buf instead of R_alloc;
127  *          warn and set NULL on improper UTF-8 byte sequences
128  *
129  * @version 0.2-3 (Marek Gagolewski, 2014-05-12)
130  *          Use UChar32* instead of vector<UChar32> as ::data is C++11
131  *
132  * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
133  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
134  */
stri_enc_toutf32(SEXP str)135 SEXP stri_enc_toutf32(SEXP str)
136 {
137     PROTECT(str = stri__prepare_arg_string(str, "str"));
138     R_len_t n = LENGTH(str);
139 
140     STRI__ERROR_HANDLER_BEGIN(1)
141     StriContainerUTF8 str_cont(str, n);
142 
143     R_len_t bufsize = 1; // to avoid allocating an empty buffer
144     for (R_len_t i=0; i<n; ++i) {
145         if (str_cont.isNA(i)) continue;
146         R_len_t ni = str_cont.get(i).length();
147         if (ni > bufsize) bufsize = ni;
148     }
149 
150     UChar32* buf = (UChar32*)R_alloc((size_t)bufsize, (int)sizeof(UChar32)); // at most bufsize UChars32 (bufsize/4 min.)
151     STRI_ASSERT(buf);
152     if (!buf) throw StriException(MSG__MEM_ALLOC_ERROR);
153     // deque<UChar32> was slower than using a common, over-sized buf
154 
155     SEXP ret;
156     STRI__PROTECT(ret = Rf_allocVector(VECSXP, n)); // all
157 
158     for (R_len_t i=0; i<n; ++i) {
159 
160         if (str_cont.isNA(i)) {
161             SET_VECTOR_ELT(ret, i, R_NilValue);
162             continue;
163         }
164 
165         UChar32 c = (UChar32)0;
166         const char* s = str_cont.get(i).c_str();
167         R_len_t sn = str_cont.get(i).length();
168         R_len_t j = 0;
169         R_len_t k = 0;
170         while (c >= 0 && j < sn) {
171             U8_NEXT(s, j, sn, c);
172             buf[k++] = (int)c;
173         }
174 
175         if (c < 0) {
176             throw StriException(MSG__INVALID_UTF8);
177 //             SET_VECTOR_ELT(ret, i, R_NilValue);
178 //             continue;
179         }
180         else {
181             SEXP conv;
182             STRI__PROTECT(conv = Rf_allocVector(INTSXP, k));
183             memcpy(INTEGER(conv), buf, (size_t)sizeof(int)*k);
184             SET_VECTOR_ELT(ret, i, conv);
185             STRI__UNPROTECT(1);
186         }
187     }
188 
189     STRI__UNPROTECT_ALL
190     return ret;
191     STRI__ERROR_HANDLER_END({ /* do nothing on error */ })
192 }
193 
194 
195 /** Convert character vector to UTF-8
196  *
197  * @param str character vector
198  * @param is_unknown_8bit single logical value;
199  * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8
200  * REPLACEMENT CHARACTERs (U+FFFD) are
201  * put for codes > 127
202  * @param validate single logical value (or NA)
203  *
204  * @return character vector
205  *
206  * @version 0.1-XX (Marek Gagolewski)
207  *
208  * @version 0.1-XX (Marek Gagolewski, 2013-06-16)
209  *                  make StriException-friendly
210  *
211  * @version 0.2-1  (Marek Gagolewski, 2014-03-26)
212  *                 Use one String8buf;
213  *                 is_unknown_8bit_logical and UTF-8 tries now to remove BOMs
214  *
215  * @version 0.2-1  (Marek Gagolewski, 2014-03-30)
216  *                 added validate arg
217  *
218  * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
219  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
220  */
stri_enc_toutf8(SEXP str,SEXP is_unknown_8bit,SEXP validate)221 SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit, SEXP validate)
222 {
223     PROTECT(validate = stri__prepare_arg_logical_1(validate, "validate"));
224     bool is_unknown_8bit_logical =
225         stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit");
226     PROTECT(str = stri__prepare_arg_string(str, "str"));
227     R_len_t n = LENGTH(str);
228 
229     STRI__ERROR_HANDLER_BEGIN(2)
230     SEXP ret;
231     if (!is_unknown_8bit_logical) {
232         // Trivial - everything we need is in StriContainerUTF8 :)
233         // which removes BOMs silently
234         StriContainerUTF8 str_cont(str, n);
235         STRI__PROTECT(ret = str_cont.toR());
236     }
237     else {
238         // get buf size
239         size_t bufsize = 0;
240         for (R_len_t i=0; i<n; ++i) {
241             SEXP curs = STRING_ELT(str, i);
242             if (curs == NA_STRING || IS_ASCII(curs) || IS_UTF8(curs))
243                 continue;
244 
245             size_t ni = LENGTH(curs);
246             if (ni > bufsize) bufsize = ni;
247         }
248         String8buf buf(bufsize*3); // either 1 byte < 127 or U+FFFD == 3 bytes UTF-8
249         char* bufdata = buf.data();
250 
251         STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
252         for (R_len_t i=0; i<n; ++i) {
253             SEXP curs = STRING_ELT(str, i);
254             if (curs == NA_STRING) {
255                 SET_STRING_ELT(ret, i, NA_STRING);
256                 continue;
257             }
258 
259             if (IS_ASCII(curs) || IS_UTF8(curs)) {
260                 R_len_t curs_n = LENGTH(curs);
261                 const char* curs_s = CHAR(curs);  // TODO: ALTREP will be problematic?
262                 if (curs_n >= 3 &&
263                         (uint8_t)(curs_s[0]) == UTF8_BOM_BYTE1 &&
264                         (uint8_t)(curs_s[1]) == UTF8_BOM_BYTE2 &&
265                         (uint8_t)(curs_s[2]) == UTF8_BOM_BYTE3) {
266                     // has BOM - get rid of it
267                     SET_STRING_ELT(ret, i, Rf_mkCharLenCE(curs_s+3, curs_n-3, CE_UTF8));
268                 }
269                 else
270                     SET_STRING_ELT(ret, i, curs);
271 
272                 continue;
273             }
274 
275             // otherwise, we have an 8-bit encoding
276             R_len_t curn = LENGTH(curs);
277             const char* curs_tab = CHAR(curs);  // TODO: ALTREP will be problematic?
278             R_len_t k = 0;
279             for (R_len_t j=0; j<curn; ++j) {
280                 if (U8_IS_SINGLE(curs_tab[j]))
281                     bufdata[k++] = curs_tab[j];
282                 else { // 0xEF 0xBF 0xBD
283                     bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
284                     bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
285                     bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
286                 }
287             }
288             SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
289         }
290 
291     }
292 
293     // validate utf8 byte stream
294     if (LOGICAL(validate)[0] != FALSE) { // NA or TRUE
295         R_len_t ret_n = LENGTH(ret);
296         for (R_len_t i=0; i<ret_n; ++i) {
297             SEXP curs = STRING_ELT(ret, i);
298             if (curs == NA_STRING) continue;
299 
300             const char* s = CHAR(curs);  // TODO: ALTREP will be problematic?
301             R_len_t sn = LENGTH(curs);
302             R_len_t j = 0;
303             UChar32 c = 0;
304             while (c >= 0 && j < sn) {
305                 U8_NEXT(s, j, sn, c);
306             }
307 
308             if (c >= 0) continue; // valid, nothing to do
309 
310             if (LOGICAL(validate)[0] == NA_LOGICAL) {
311                 Rf_warning(MSG__INVALID_CODE_POINT_REPLNA);
312                 SET_STRING_ELT(ret, i, NA_STRING);
313             }
314             else {
315                 size_t bufsize = sn*3; // maximum: 1 byte -> U+FFFD (3 bytes)
316                 String8buf buf(bufsize); // maximum: 1 byte -> U+FFFD (3 bytes)
317                 char* bufdata = buf.data();
318 
319                 j = 0;
320                 size_t k = 0;
321                 UBool err = FALSE;
322                 while (!err && j < sn) {
323                     U8_NEXT(s, j, sn, c);
324                     if (c >= 0) {
325                         U8_APPEND((uint8_t*)bufdata, k, bufsize, c, err);
326                     } else {
327                         Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
328                         bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
329                         bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
330                         bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
331                     }
332                 }
333 
334                 if (err) throw StriException(MSG__INTERNAL_ERROR);
335                 SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
336             }
337         }
338     }
339 
340     STRI__UNPROTECT_ALL
341     return ret;
342     STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
343 }
344 
345 
346 /** Convert character vector to ASCII
347  *
348  * All charcodes > 127 are replaced with subst chars (0x1A)
349  *
350  * @param str character vector
351  * @return character vector
352  *
353  * @version 0.1-?? (Marek Gagolewski)
354  *
355  * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
356  *          make StriException-friendly
357  *
358  * @version 0.2-1 (Marek Gagolewski, 2014-03-30)
359  *          use single common buf;
360  *          warn on invalid utf8 byte stream
361  *
362  * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
363  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
364  */
stri_enc_toascii(SEXP str)365 SEXP stri_enc_toascii(SEXP str)
366 {
367     PROTECT(str = stri__prepare_arg_string(str, "str"));
368     R_len_t n = LENGTH(str);
369 
370     STRI__ERROR_HANDLER_BEGIN(1)
371 
372     // get buf size
373     size_t bufsize = 0;
374     for (R_len_t i=0; i<n; ++i) {
375         SEXP curs = STRING_ELT(str, i);
376         if (curs == NA_STRING)
377             continue;
378 
379         size_t ni = LENGTH(curs);
380         if (ni > bufsize) bufsize = ni;
381     }
382     String8buf buf(bufsize); // no more bytes than this needed
383     char* bufdata = buf.data();
384 
385     SEXP ret;
386     STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
387     for (R_len_t i=0; i<n; ++i) {
388         SEXP curs = STRING_ELT(str, i);
389         if (curs == NA_STRING || IS_ASCII(curs)) {
390             // nothing to do
391             SET_STRING_ELT(ret, i, curs);
392             continue;
393         }
394 
395         R_len_t curn = LENGTH(curs);
396         const char* curs_tab = CHAR(curs);  // TODO: ALTREP will be problematic?
397 
398         if (IS_UTF8(curs)) {
399             R_len_t k = 0, j = 0;
400             UChar32 c;
401             while (j<curn) {
402                 U8_NEXT(curs_tab, j, curn, c);
403                 if (c < 0) {
404                     Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
405                     bufdata[k++] = ASCII_SUBSTITUTE;
406                 }
407                 else if (c > ASCII_MAXCHARCODE)
408                     bufdata[k++] = ASCII_SUBSTITUTE;
409                 else
410                     bufdata[k++] = (char)c;
411             }
412             SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
413             // the string will be marked as ASCII anyway by mkCharLenCE
414         }
415         else { // some 8-bit encoding
416             R_len_t k = 0;
417             for (R_len_t j=0; j<curn; ++j) {
418                 if (U8_IS_SINGLE(curs_tab[j]))
419                     bufdata[k++] = curs_tab[j];
420                 else {
421                     bufdata[k++] = (char)ASCII_SUBSTITUTE; // subst char in ascii
422                 }
423             }
424             SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
425             // the string will be marked as ASCII anyway by mkCharLenCE
426         }
427     }
428 
429     STRI__UNPROTECT_ALL
430     return ret;
431     STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
432 }
433 
434 
435 // ------------------------------------------------------------------------
436 
437 /**
438  * Convert character vector between marked encodings and the encoding provided
439  *
440  * @param str     input character vector
441  * @param to    target encoding, \code{NULL} or \code{""} for default enc
442  * @param to_raw single logical, should list of raw vectors be returned?
443  * @return a converted character vector or list of raw vectors
444  *
445  * @version 0.1-?? (Marek Gagolewski, 2013-11-12)
446  *
447  * @version 0.2-1 (Marek Gagolewski, 2014-03-28)
448  *          use StriUcnv
449  *
450  * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
451  *          calc required buf size a priori
452  *
453  * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
454  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
455  */
stri_encode_from_marked(SEXP str,SEXP to,SEXP to_raw)456 SEXP stri_encode_from_marked(SEXP str, SEXP to, SEXP to_raw)
457 {
458     PROTECT(str = stri__prepare_arg_string(str, "str"));
459     const char* selected_to   = stri__prepare_arg_enc(to, "to", true); /* this is R_alloc'ed */
460     bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");
461 
462     STRI__ERROR_HANDLER_BEGIN(1)
463     R_len_t str_n = LENGTH(str);
464     StriContainerUTF16 str_cont(str, str_n);
465 
466     // get the number of strings to convert; if == 0, then you know what's the result
467     if (str_n <= 0) return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);
468 
469     // Open converters
470     StriUcnv ucnv(selected_to);
471     UConverter* uconv_to = ucnv.getConverter(true /*register_callbacks*/);
472 
473     // Get target encoding mark
474     cetype_t encmark_to = to_raw_logical?CE_BYTES:ucnv.getCE();
475 
476     // Prepare out val
477     SEXP ret;
478     STRI__PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));
479 
480     // calculate required buf size
481     size_t bufsize = 0;
482     for (R_len_t i=0; i<str_n; ++i) {
483         if (!str_cont.isNA(i) && (size_t)str_cont.get(i).length() > bufsize)
484             bufsize = str_cont.get(i).length();
485     }
486     bufsize = UCNV_GET_MAX_BYTES_FOR_STRING(bufsize, ucnv_getMaxCharSize(uconv_to));
487     // "The calculated size is guaranteed to be sufficient for this conversion."
488     if (bufsize > BUF_MAX_LENGTH)
489         bufsize = BUF_MAX_LENGTH;
490     String8buf buf(bufsize);
491 
492     for (R_len_t i=0; i<str_n; ++i) {
493         if (str_cont.isNA(i)) {
494             if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
495             else                SET_STRING_ELT(ret, i, NA_STRING);
496             continue;
497         }
498 
499         R_len_t curn_tmp = str_cont.get(i).length();
500         const UChar* curs_tmp = str_cont.get(i).getBuffer(); // The buffer content is (probably) not NUL-terminated.
501         if (!curs_tmp)
502             throw StriException(MSG__INTERNAL_ERROR);
503 
504         UErrorCode status = U_ZERO_ERROR;
505         ucnv_resetFromUnicode(uconv_to);
506         size_t bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
507                                          curs_tmp, curn_tmp, &status);
508         if (bufneed <= buf.size()) {
509             STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
510         }
511         else {// larger buffer needed
512             if (bufneed > BUF_MAX_LENGTH)
513                 throw StriException(MSG__BUF_SIZE_EXCEEDED);
514             buf.resize(bufneed, false/*destroy contents*/);
515             status = U_ZERO_ERROR;
516             ucnv_resetFromUnicode(uconv_to);
517             bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
518                                       curs_tmp, curn_tmp, &status);
519             STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
520         }
521 
522         if (to_raw_logical) {
523             SEXP outobj;
524             STRI__PROTECT(outobj = Rf_allocVector(RAWSXP, bufneed));
525             memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
526             SET_VECTOR_ELT(ret, i, outobj);
527             STRI__UNPROTECT(1);
528         }
529         else {
530             SET_STRING_ELT(ret, i,
531                            Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
532         }
533     }
534 
535     STRI__UNPROTECT_ALL
536     return ret;
537 
538     STRI__ERROR_HANDLER_END({/* nothing special on error */})
539 }
540 
541 
542 /**
543  * Convert character vector between given encodings
544  *
545  * @param str     input character/raw vector or list of raw vectors
546  * @param from  source encoding, \code{NULL} or \code{""} for default enc
547  * @param to    target encoding, \code{NULL} or \code{""} for default enc
548  * @param to_raw single logical, should list of raw vectors be returned?
549  * @return a converted character vector or list of raw vectors
550  *
551  * @version 0.1-?? (Marek Gagolewski)
552  *
553  * @version 0.1-?? (Marek Gagolewski)
554  *          arg to_raw_added, encoding marking
555  *
556  * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
557  *          make StriException-friendly
558  *
559  * @version 0.1-?? (Marek Gagolewski, 2013-08-08)
560  *          use StriContainerListRaw
561  *
562  * @version 0.1-?? (Marek Gagolewski, 2013-11-20)
563  *          BUGFIX call stri_encode_from_marked if necessary
564  *
565  * @version 0.2-1 (Marek Gagolewski, 2014-03-28)
566  *          use StriUcnv
567  *
568  * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
569  *          estimate required buf size a priori
570  *
571  * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
572  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
573  */
stri_encode(SEXP str,SEXP from,SEXP to,SEXP to_raw)574 SEXP stri_encode(SEXP str, SEXP from, SEXP to, SEXP to_raw)
575 {
576     const char* selected_from = stri__prepare_arg_enc(from, "from", true); /* this is R_alloc'ed */
577     if (!selected_from && Rf_isVectorAtomic(str) && !isRaw(str))
578         return stri_encode_from_marked(str, to, to_raw);
579     const char* selected_to   = stri__prepare_arg_enc(to, "to", true); /* this is R_alloc'ed */
580     bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");
581 
582     // raw vector, character vector, or list of raw vectors:
583     PROTECT(str = stri__prepare_arg_list_raw(str, "str"));
584 
585 
586     STRI__ERROR_HANDLER_BEGIN(1)
587     StriContainerListRaw str_cont(str);
588     R_len_t str_n = str_cont.get_n();
589 
590     // get the number of strings to convert; if == 0, then you know what's the result
591     if (str_n <= 0) {
592         STRI__UNPROTECT_ALL
593         return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);
594     }
595 
596     // Open converters
597     StriUcnv ucnv1(selected_from);
598     StriUcnv ucnv2(selected_to);
599     UConverter* uconv_from = ucnv1.getConverter(true /*register_callbacks*/);
600     UConverter* uconv_to   = ucnv2.getConverter(true /*register_callbacks*/);
601 
602     // Get target encoding mark
603     cetype_t encmark_to = to_raw_logical?CE_BYTES:ucnv2.getCE();
604 
605     SEXP ret;
606     STRI__PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));
607 
608 
609 //   // estimate required buf size
610 //    size_t bufsize = 0;
611 //    for (R_len_t i=0; i<str_n; ++i) {
612 //       if (!str_cont.isNA(i) && (size_t)str_cont.get(i).length() > bufsize)
613 //          bufsize = str_cont.get(i).length();
614 //    }
615 //    bufsize = bufsize*4; // this is just an estimate (for 8bit->utf8 conversions)
616 //    String8buf buf(bufsize);
617     String8buf buf(0);
618 
619 
620     for (R_len_t i=0; i<str_n; ++i) {
621         if (str_cont.isNA(i)) {
622             if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
623             else                SET_STRING_ELT(ret, i, NA_STRING);
624             continue;
625         }
626 
627         const char* curs = str_cont.get(i).c_str();
628         R_len_t curn     = str_cont.get(i).length();
629 
630         UErrorCode status = U_ZERO_ERROR;
631         UnicodeString encs(curs, curn, uconv_from, status); // FROM -> UTF-16 [this is the slow part]
632         if (status == U_ILLEGAL_ARGUMENT_ERROR)
633             throw StriException(MSG__MEM_ALLOC_ERROR);  // see #395
634         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
635 
636         R_len_t curn_tmp = encs.length();
637         const UChar* curs_tmp = encs.getBuffer(); // The buffer contents is (probably) not NUL-terminated.
638         if (!curs_tmp) {
639             throw StriException(MSG__INTERNAL_ERROR);
640         }
641 
642         size_t bufneed = UCNV_GET_MAX_BYTES_FOR_STRING(curn_tmp, ucnv_getMaxCharSize(uconv_to));
643         // "The calculated size is guaranteed to be sufficient for this conversion."
644         if (bufneed > BUF_MAX_LENGTH)
645             bufneed = BUF_MAX_LENGTH;
646         buf.resize(bufneed, false/*destroy contents*/); // grows or stays as-is
647 
648         status = U_ZERO_ERROR;
649         ucnv_resetFromUnicode(uconv_to);
650         bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp,
651                                   curn_tmp, &status);
652         if (bufneed <= buf.size()) {
653             STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
654         }
655         else {// larger buffer needed
656             if (bufneed > BUF_MAX_LENGTH)
657                 throw StriException(MSG__BUF_SIZE_EXCEEDED);
658             buf.resize(bufneed, false/*destroy contents*/);
659             status = U_ZERO_ERROR;
660             ucnv_resetFromUnicode(uconv_to);
661             bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp,
662                                       curn_tmp, &status);
663             STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
664         }
665 
666         if (to_raw_logical) {
667             SEXP outobj;
668             STRI__PROTECT(outobj = Rf_allocVector(RAWSXP, bufneed));
669             memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
670             SET_VECTOR_ELT(ret, i, outobj);
671             STRI__UNPROTECT(1);
672         }
673         else {
674             SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
675         }
676     }
677 
678     STRI__UNPROTECT_ALL
679     return ret;
680 
681     STRI__ERROR_HANDLER_END({/* no special action on error */})
682 }
683