1 /* This file is part of the 'stringi' project.
2 * Copyright (c) 2013-2021, Marek Gagolewski <https://www.gagolewski.com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from
17 * this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
21 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
29 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 */
31
32
33 #include "stri_stringi.h"
34 #include "stri_container_utf8.h"
35 #include "stri_container_utf16.h"
36
37
// Size of the scratch buffer used when formatting one escape sequence;
// the longest one produced in this file is "\UXXXXXXXX"
// (10 characters plus the terminating NUL), so 12 leaves some slack.
#define StriEscape_BUFSIZE 12
39
40 /**
41 * Escape Unicode code points
42 *
43 * @param str character vector
44 * @return character vector
45 *
46 * @version 0.1-?? (Marek Gagolewski, 2013-08-17)
47 *
48 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
49 * fail on incorrect utf8 byte seqs;
50 *
51 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
52 * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
53 *
54 * @version 1.1.6 (Steve Grubb, 2017-07-20)
55 * if ((char)c >= 32 || (char)c <= 126) should be &&
56 */
stri_escape_unicode(SEXP str)57 SEXP stri_escape_unicode(SEXP str)
58 {
59 PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument
60
61 STRI__ERROR_HANDLER_BEGIN(1)
62 R_len_t str_length = LENGTH(str);
63 StriContainerUTF8 str_cont(str, str_length);
64
65 SEXP ret;
66 STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_length));
67
68 std::string out; // @TODO: estimate len a priori?
69
70 for (R_len_t i = str_cont.vectorize_init();
71 i != str_cont.vectorize_end();
72 i = str_cont.vectorize_next(i))
73 {
74 if (str_cont.isNA(i)) {
75 SET_STRING_ELT(ret, i, NA_STRING);
76 continue;
77 }
78
79 const char* str_cur_s = str_cont.get(i).c_str();
80 R_len_t str_cur_n = str_cont.get(i).length();
81
82 // estimate buf size
83 R_len_t bufsize = 0;
84 UChar32 c;
85 R_len_t j = 0;
86
87 while (j < str_cur_n) {
88 U8_NEXT(str_cur_s, j, str_cur_n, c);
89 if (c < 0)
90 throw StriException(MSG__INVALID_UTF8);
91 else if ((char)c >= 32 && (char)c <= 126)
92 bufsize += 1;
93 else if (c <= 0xff)
94 bufsize += 6; // for \a, \n this will be overestimated
95 else
96 bufsize += 10;
97 }
98 out.clear();
99 if ((size_t)bufsize > (size_t)out.size())
100 out.reserve(bufsize);
101
102 // do escape
103 j = 0;
104 char buf[StriEscape_BUFSIZE];
105 while (j < str_cur_n) {
106 U8_NEXT(str_cur_s, j, str_cur_n, c);
107 /* if (c < 0)
108 throw StriException(MSG__INVALID_UTF8); // this has already been checked :)
109 else */
110 if (c <= ASCII_MAXCHARCODE) {
111 switch ((char)c) {
112 case 0x07:
113 out.append("\\a");
114 break;
115 case 0x08:
116 out.append("\\b");
117 break;
118 case 0x09:
119 out.append("\\t");
120 break;
121 case 0x0a:
122 out.append("\\n");
123 break;
124 case 0x0b:
125 out.append("\\v");
126 break;
127 case 0x0c:
128 out.append("\\f");
129 break;
130 case 0x0d:
131 out.append("\\r");
132 break;
133 // case 0x1b: out.append("\\e"); break; // R doesn't know that
134 case 0x22:
135 out.append("\\\"");
136 break;
137 case 0x27:
138 out.append("\\'");
139 break;
140 case 0x5c:
141 out.append("\\\\");
142 break;
143 default:
144 if ((char)c >= 32 && (char)c <= 126) // printable characters
145 out.append(1, (char)c);
146 else {
147 snprintf(buf, StriEscape_BUFSIZE, "\\u%4.4x", (uint16_t)c);
148 out.append(buf, 6);
149 }
150 }
151 }
152 else if (c <= 0xffff) {
153 snprintf(buf, StriEscape_BUFSIZE, "\\u%4.4x", (uint16_t)c);
154 out.append(buf, 6);
155 }
156 else {
157 snprintf(buf, StriEscape_BUFSIZE, "\\U%8.8x", (uint32_t)c);
158 out.append(buf, 10);
159 }
160 }
161
162 SET_STRING_ELT(ret, i,
163 Rf_mkCharLenCE(out.c_str(), (int)out.size(), (cetype_t)CE_UTF8)
164 );
165 }
166
167 STRI__UNPROTECT_ALL
168 return ret;
169 STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
170 }
171
172
173 /**
174 * Unescape Unicode code points
175 *
176 * @param str character vector
177 * @return character vector
178 *
179 * @version 0.1-?? (Marek Gagolewski, 2013-08-17)
180 *
181 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
182 * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
183 */
stri_unescape_unicode(SEXP str)184 SEXP stri_unescape_unicode(SEXP str)
185 {
186 PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument
187
188 STRI__ERROR_HANDLER_BEGIN(1)
189 R_len_t str_length = LENGTH(str);
190 StriContainerUTF16 str_cont(str, str_length, false); // writable
191
192 for (R_len_t i = str_cont.vectorize_init();
193 i != str_cont.vectorize_end();
194 i = str_cont.vectorize_next(i))
195 {
196 if (str_cont.isNA(i) || str_cont.get(i).length() == 0)
197 continue; // leave as-is
198
199 str_cont.getWritable(i).setTo(str_cont.get(i).unescape());
200
201 if (str_cont.get(i).length() == 0) {
202 Rf_warning(MSG__INVALID_ESCAPE);
203 str_cont.setNA(i); // something went wrong
204 }
205 }
206
207 STRI__UNPROTECT_ALL
208 return str_cont.toR();
209 STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
210 }
211