1 /*
2 * %CopyrightBegin%
3 *
4 * Copyright Ericsson AB 1998-2018. All Rights Reserved.
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 * %CopyrightEnd%
19 */
20 #include <string.h>
21 #include "eidef.h"
22 #include "eiext.h"
23 #include "putget.h"
24
25
ei_decode_atom(const char * buf,int * index,char * p)26 int ei_decode_atom(const char *buf, int *index, char *p)
27 {
28 return ei_decode_atom_as(buf, index, p, MAXATOMLEN, ERLANG_LATIN1, NULL, NULL);
29 }
30
/*
 * Decode the atom found at buf + *index.
 *
 *   buf, index  Encoded data and the offset of the atom's tag byte.
 *   p           Destination for the NUL-terminated atom text, or NULL to
 *               validate/skip the atom without storing it.
 *   destlen     Size of the destination in bytes, including the NUL.
 *   want_enc    Encoding(s) the caller accepts.
 *   was_encp    If non-NULL, receives the encoding implied by the tag.
 *   res_encp    If non-NULL, receives the encoding of the text placed in p
 *               (ERLANG_ASCII when no byte had its top bit set).
 *
 * Returns 0 on success, in which case *index has been advanced past the
 * atom. Returns -1 if the tag is not one of the four atom tags, if the
 * text does not fit in destlen bytes, if ERLANG_ASCII was requested and a
 * non-ASCII byte was found, or if the text cannot be converted. *index and
 * *was_encp are left untouched on failure (p may have been partly written).
 */
int ei_decode_atom_as(const char *buf, int *index, char *p, int destlen,
                      erlang_char_encoding want_enc,
                      erlang_char_encoding *was_encp,
                      erlang_char_encoding *res_encp)
{
    /*
     * NOTE(review): get8()/get16be() come from putget.h, which is not
     * visible here; they are assumed to advance s past the bytes they
     * read, which is what the final index computation relies on.
     */
    const char *s = buf + *index;
    const char *s0 = s;
    int len;
    erlang_char_encoding got_enc;

    /* The tag selects both the width of the length field and the encoding. */
    switch (get8(s)) {
    case ERL_ATOM_EXT:
        len = get16be(s);
        got_enc = ERLANG_LATIN1;
        break;
    case ERL_SMALL_ATOM_EXT:
        len = get8(s);
        got_enc = ERLANG_LATIN1;
        break;
    case ERL_ATOM_UTF8_EXT:
        len = get16be(s);
        got_enc = ERLANG_UTF8;
        break;
    case ERL_SMALL_ATOM_UTF8_EXT:
        len = get8(s);
        got_enc = ERLANG_UTF8;
        break;
    default:
        return -1;
    }

    /*
     * At least one byte is needed for the terminating NUL. The verbatim
     * copy below already rejects this case; checking it here makes the
     * conversion path agree, so that it neither passes a negative size to
     * the converters nor writes the NUL into a zero-sized buffer.
     */
    if (destlen <= 0)
        return -1;

    if ((want_enc & got_enc) || want_enc == ERLANG_ASCII) {
        /* No conversion needed: copy verbatim while looking for bytes
         * with the top bit set. */
        int i, found_non_ascii = 0;

        if (len >= destlen)
            return -1;
        for (i = 0; i < len; i++) {
            if (s[i] & 0x80)
                found_non_ascii = 1;
            if (p)
                p[i] = s[i];
        }
        if (p)
            p[len] = 0;
        if (want_enc == ERLANG_ASCII && found_non_ascii) {
            return -1;
        }
        if (res_encp) {
            *res_encp = found_non_ascii ? got_enc : ERLANG_ASCII;
        }
    }
    else {
        /* The caller wants the other encoding; one byte of the
         * destination is reserved for the NUL. */
        int plen = (got_enc == ERLANG_LATIN1) ?
            latin1_to_utf8(p, s, len, destlen - 1, res_encp) :
            utf8_to_latin1(p, s, len, destlen - 1, res_encp);

        if (plen < 0)
            return -1;
        if (p)
            p[plen] = 0;
    }
    if (was_encp) {
        *was_encp = got_enc;
    }

    /* Consume tag, length field and atom text. */
    s += len;
    *index += s - s0;
    return 0;
}
93
94
95
96 #ifdef HAVE_UNALIGNED_WORD_ACCESS
97
/*
 * AsciiWord is an unsigned integer type with the same size as a pointer.
 * It is the unit in which ascii_fast_track() scans and copies text.
 */
#if SIZEOF_VOID_P == SIZEOF_LONG
typedef unsigned long AsciiWord;
#elif SIZEOF_VOID_P == SIZEOF_LONG_LONG
typedef unsigned long long AsciiWord;
#else
#  error "Unknown word type"
#endif

/*
 * ASCII_CHECK_MASK has the top bit of every byte of an AsciiWord set.
 * A word ANDed with it is zero exactly when every byte in the word is
 * below 0x80. It is only defined for 4- and 8-byte pointers.
 */
#if SIZEOF_VOID_P == 4
#  define ASCII_CHECK_MASK ((AsciiWord)0x80808080U)
#elif SIZEOF_VOID_P == 8
#  define ASCII_CHECK_MASK ((AsciiWord)0x8080808080808080U)
#endif
111
ascii_fast_track(char * dst,const char * src,int slen,int destlen)112 static int ascii_fast_track(char* dst, const char* src, int slen, int destlen)
113 {
114 const AsciiWord* src_word = (AsciiWord*) src;
115 const AsciiWord* const src_word_end = src_word + (slen / sizeof(AsciiWord));
116
117 if (destlen < slen)
118 return 0;
119
120 if (dst) {
121 AsciiWord* dst_word = (AsciiWord*)dst;
122
123 while (src_word < src_word_end) {
124 if ((*src_word & ASCII_CHECK_MASK) != 0)
125 break;
126 *dst_word++ = *src_word++;
127 }
128 }
129 else {
130 while (src_word < src_word_end) {
131 if ((*src_word & ASCII_CHECK_MASK) != 0)
132 break;
133 src_word++;
134 }
135 }
136 return (char*)src_word - src;
137 }
138 #endif /* HAVE_UNALIGNED_WORD_ACCESS */
139
/*
 * Convert slen bytes of UTF-8 text at src to Latin-1.
 *
 *   dst       Destination buffer, or NULL to only validate and count.
 *   src       UTF-8 input.
 *   slen      Number of input bytes.
 *   destlen   Maximum number of bytes that may be produced.
 *   res_encp  If non-NULL, set on success to ERLANG_LATIN1 when at least
 *             one two-byte sequence was converted, else to ERLANG_ASCII.
 *
 * Only single-byte (ASCII) characters and two-byte sequences with lead
 * byte 0xC2 or 0xC3 followed by a continuation byte are accepted, i.e.
 * the code points that fit in one Latin-1 byte.
 *
 * Returns the number of bytes produced (no NUL terminator is written), or
 * -1 if the output would exceed destlen or the input contains anything
 * else.
 */
int utf8_to_latin1(char* dst, const char* src, int slen, int destlen,
                   erlang_char_encoding* res_encp)
{
    const char* const dst_start = dst;
    const char* const dst_end = dst + destlen;
    int found_non_ascii = 0;

#ifdef HAVE_UNALIGNED_WORD_ACCESS
    {
        /* Handle the leading all-ASCII part a word at a time. */
        int aft = ascii_fast_track(dst, src, slen, destlen);
        src += aft;
        slen -= aft;
        dst += aft;
    }
#endif

    while (slen > 0) {
        /* Work on unsigned bytes: plain char may be signed, and shifting
         * a negative value left is undefined behaviour. */
        const unsigned char b0 = (unsigned char) src[0];

        if (dst >= dst_end) return -1;
        if ((b0 & 0x80) == 0) {
            if (dst_start) {
                *dst = *src;
            }
            ++dst;
            ++src;
            --slen;
        }
        else if (slen > 1 &&
                 (b0 & 0xFE) == 0xC2 &&
                 (((unsigned char) src[1]) & 0xC0) == 0x80) {
            if (dst_start) {
                const unsigned char b1 = (unsigned char) src[1];

                /* Low two bits of the lead byte become the top two bits
                 * of the Latin-1 byte; the continuation byte supplies
                 * the remaining six. */
                *dst = (char) (unsigned char) (((b0 & 0x03) << 6) | (b1 & 0x3F));
            }
            ++dst;
            src += 2;
            slen -= 2;
            found_non_ascii = 1;
        }
        else return -1;
    }
    if (res_encp) {
        *res_encp = found_non_ascii ? ERLANG_LATIN1 : ERLANG_ASCII;
    }
    return (int) (dst - dst_start);
}
184
/*
 * Convert slen bytes of Latin-1 text at src to UTF-8.
 *
 *   dst       Destination buffer, or NULL to only count.
 *   src       Latin-1 input.
 *   slen      Number of input bytes.
 *   destlen   Maximum number of bytes that may be produced.
 *   res_encp  If non-NULL, set on success to ERLANG_UTF8 when at least one
 *             input byte had its top bit set, else to ERLANG_ASCII.
 *
 * Bytes below 0x80 are copied unchanged; every other byte expands to a
 * two-byte sequence.
 *
 * Returns the number of bytes produced (no NUL terminator is written), or
 * -1 if the output would exceed destlen. The limit is enforced in the same
 * way whether or not dst is NULL.
 */
int latin1_to_utf8(char* dst, const char* src, int slen, int destlen,
                   erlang_char_encoding* res_encp)
{
    const char* const src_end = src + slen;
    const char* const dst_start = dst;
    const char* const dst_end = dst + destlen;
    int found_non_ascii = 0;

#ifdef HAVE_UNALIGNED_WORD_ACCESS
    {
        /* Handle the leading all-ASCII part a word at a time. */
        int aft = ascii_fast_track(dst, src, slen, destlen);
        dst += aft;
        src += aft;
    }
#endif

    while (src < src_end) {
        if (dst >= dst_end) return -1;
        if ((src[0] & 0x80) == 0) {
            if (dst_start) {
                *dst = *src;
            }
            ++dst;
        }
        else {
            /* This character needs two output bytes; the check above only
             * guarantees room for one. */
            if (dst_end - dst < 2) return -1;
            if (dst_start) {
                unsigned char ch = (unsigned char) *src;
                dst[0] = (char) (0xC0 | (ch >> 6));
                dst[1] = (char) (0x80 | (ch & 0x3F));
            }
            dst += 2;
            found_non_ascii = 1;
        }
        ++src;
    }
    if (res_encp) {
        *res_encp = found_non_ascii ? ERLANG_UTF8 : ERLANG_ASCII;
    }
    return (int) (dst - dst_start);
}
225
226
227
/*
 * Decode the atom at *bufp as UTF-8 into p and step *bufp past it.
 *
 * The destination size passed on is MAXATOMLEN_UTF8. The encoding implied
 * by the atom's tag is reported through was_encp (handed straight to
 * ei_decode_atom_as()). Returns 0 on success; on failure returns -1 and
 * leaves *bufp unchanged.
 */
int ei_internal_get_atom(const char** bufp, char* p,
                         erlang_char_encoding* was_encp)
{
    int consumed = 0;
    int rc = ei_decode_atom_as(*bufp, &consumed, p, MAXATOMLEN_UTF8,
                               ERLANG_UTF8, was_encp, NULL);

    if (rc >= 0) {
        *bufp += consumed;
        return 0;
    }
    return -1;
}
237
238
239