1 /* This file is part of the 'stringi' project.
2  * Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from
17  * this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 
33 #include "stri_stringi.h"
34 #include "stri_container_utf8.h"
35 #include "stri_container_utf16.h"
36 
37 
38 #define StriEscape_BUFSIZE 12
39 
40 /**
41  *  Escape Unicode code points
42  *
43  *  @param str character vector
44  *  @return character vector
45  *
46  * @version 0.1-?? (Marek Gagolewski, 2013-08-17)
47  *
48  * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
49  *          fail on incorrect utf8 byte seqs;
50  *
51  * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
52  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
53  *
54  * @version 1.1.6 (Steve Grubb, 2017-07-20)
55  *          if ((char)c >= 32 || (char)c <= 126) should be &&
56 */
stri_escape_unicode(SEXP str)57 SEXP stri_escape_unicode(SEXP str)
58 {
59     PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument
60 
61     STRI__ERROR_HANDLER_BEGIN(1)
62     R_len_t str_length = LENGTH(str);
63     StriContainerUTF8 str_cont(str, str_length);
64 
65     SEXP ret;
66     STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_length));
67 
68     std::string out; // @TODO: estimate len a priori?
69 
70     for (R_len_t i = str_cont.vectorize_init();
71             i != str_cont.vectorize_end();
72             i = str_cont.vectorize_next(i))
73     {
74         if (str_cont.isNA(i)) {
75             SET_STRING_ELT(ret, i, NA_STRING);
76             continue;
77         }
78 
79         const char* str_cur_s = str_cont.get(i).c_str();
80         R_len_t     str_cur_n = str_cont.get(i).length();
81 
82         // estimate buf size
83         R_len_t bufsize = 0;
84         UChar32 c;
85         R_len_t j = 0;
86 
87         while (j < str_cur_n) {
88             U8_NEXT(str_cur_s, j, str_cur_n, c);
89             if (c < 0)
90                 throw StriException(MSG__INVALID_UTF8);
91             else if ((char)c >= 32 && (char)c <= 126)
92                 bufsize += 1;
93             else if (c <= 0xff)
94                 bufsize += 6; // for \a, \n this will be overestimated
95             else
96                 bufsize += 10;
97         }
98         out.clear();
99         if ((size_t)bufsize > (size_t)out.size())
100             out.reserve(bufsize);
101 
102         // do escape
103         j = 0;
104         char buf[StriEscape_BUFSIZE];
105         while (j < str_cur_n) {
106             U8_NEXT(str_cur_s, j, str_cur_n, c);
107             /* if (c < 0)
108                throw StriException(MSG__INVALID_UTF8); // this has already been checked :)
109             else */
110             if (c <= ASCII_MAXCHARCODE) {
111                 switch ((char)c) {
112                 case 0x07:
113                     out.append("\\a");
114                     break;
115                 case 0x08:
116                     out.append("\\b");
117                     break;
118                 case 0x09:
119                     out.append("\\t");
120                     break;
121                 case 0x0a:
122                     out.append("\\n");
123                     break;
124                 case 0x0b:
125                     out.append("\\v");
126                     break;
127                 case 0x0c:
128                     out.append("\\f");
129                     break;
130                 case 0x0d:
131                     out.append("\\r");
132                     break;
133 //               case 0x1b: out.append("\\e"); break; // R doesn't know that
134                 case 0x22:
135                     out.append("\\\"");
136                     break;
137                 case 0x27:
138                     out.append("\\'");
139                     break;
140                 case 0x5c:
141                     out.append("\\\\");
142                     break;
143                 default:
144                     if ((char)c >= 32 && (char)c <= 126) // printable characters
145                         out.append(1, (char)c);
146                     else {
147                         snprintf(buf, StriEscape_BUFSIZE, "\\u%4.4x", (uint16_t)c);
148                         out.append(buf, 6);
149                     }
150                 }
151             }
152             else if (c <= 0xffff) {
153                 snprintf(buf, StriEscape_BUFSIZE, "\\u%4.4x", (uint16_t)c);
154                 out.append(buf, 6);
155             }
156             else {
157                 snprintf(buf, StriEscape_BUFSIZE, "\\U%8.8x", (uint32_t)c);
158                 out.append(buf, 10);
159             }
160         }
161 
162         SET_STRING_ELT(ret, i,
163                        Rf_mkCharLenCE(out.c_str(), (int)out.size(), (cetype_t)CE_UTF8)
164                       );
165     }
166 
167     STRI__UNPROTECT_ALL
168     return ret;
169     STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
170 }
171 
172 
173 /**
174  *  Unescape Unicode code points
175  *
176  *  @param str character vector
177  *  @return character vector
178  *
179  * @version 0.1-?? (Marek Gagolewski, 2013-08-17)
180  *
181  * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
182  *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
183 */
stri_unescape_unicode(SEXP str)184 SEXP stri_unescape_unicode(SEXP str)
185 {
186     PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument
187 
188     STRI__ERROR_HANDLER_BEGIN(1)
189     R_len_t str_length = LENGTH(str);
190     StriContainerUTF16 str_cont(str, str_length, false); // writable
191 
192     for (R_len_t i = str_cont.vectorize_init();
193             i != str_cont.vectorize_end();
194             i = str_cont.vectorize_next(i))
195     {
196         if (str_cont.isNA(i) || str_cont.get(i).length() == 0)
197             continue; // leave as-is
198 
199         str_cont.getWritable(i).setTo(str_cont.get(i).unescape());
200 
201         if (str_cont.get(i).length() == 0) {
202             Rf_warning(MSG__INVALID_ESCAPE);
203             str_cont.setNA(i); // something went wrong
204         }
205     }
206 
207     STRI__UNPROTECT_ALL
208     return str_cont.toR();
209     STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
210 }
211