1 /*
2  * %CopyrightBegin%
3  *
4  * Copyright Ericsson AB 1998-2018. All Rights Reserved.
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  *     http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  *
18  * %CopyrightEnd%
19  */
20 #include <string.h>
21 #include "eidef.h"
22 #include "eiext.h"
23 #include "putget.h"
24 
25 
ei_decode_atom(const char * buf,int * index,char * p)26 int ei_decode_atom(const char *buf, int *index, char *p)
27 {
28     return ei_decode_atom_as(buf, index, p, MAXATOMLEN, ERLANG_LATIN1, NULL, NULL);
29 }
30 
/*
 * Decode the atom found at buf + *index into p, converting it to the
 * requested character encoding if necessary.
 *
 *   buf, index - encoded data and the offset of the atom inside it. On
 *                success *index is advanced past the atom; on failure it is
 *                left untouched.
 *   p          - destination for the NUL-terminated atom text, or NULL to
 *                only check and skip the atom.
 *   destlen    - size of the destination in bytes, terminating NUL included.
 *                The limit is enforced even when p is NULL.
 *   want_enc   - requested encoding. It is tested as a bit mask against the
 *                encoding the atom is stored in; ERLANG_ASCII is accepted
 *                only if every byte of the atom is 7-bit.
 *   was_encp   - if non-NULL, receives the encoding the atom was stored in.
 *   res_encp   - if non-NULL, receives the encoding of the text written to
 *                p (ERLANG_ASCII when the atom contains only 7-bit bytes).
 *
 * Returns 0 on success. Returns -1 if the tag is not an atom tag, if the
 * text plus terminator does not fit in destlen bytes, or if the atom cannot
 * be represented in the requested encoding. On failure p may already hold
 * partial output.
 */
int ei_decode_atom_as(const char *buf, int *index, char* p, int destlen,
                      erlang_char_encoding want_enc,
                      erlang_char_encoding* was_encp,
                      erlang_char_encoding* res_encp)
{
    const char *s = buf + *index;
    const char *s0 = s;
    int len;
    erlang_char_encoding got_enc;

    /*
     * The tag selects both the width of the length field and the stored
     * encoding. NOTE(review): get8()/get16be() are assumed to advance s
     * past the bytes they read; the index update at the end relies on it.
     */
    switch (get8(s)) {
    case ERL_ATOM_EXT:
        len = get16be(s);
        got_enc = ERLANG_LATIN1;
        break;
    case ERL_SMALL_ATOM_EXT:
        len = get8(s);
        got_enc = ERLANG_LATIN1;
        break;
    case ERL_ATOM_UTF8_EXT:
        len = get16be(s);
        got_enc = ERLANG_UTF8;
        break;
    case ERL_SMALL_ATOM_UTF8_EXT:
        len = get8(s);
        got_enc = ERLANG_UTF8;
        break;
    default:
        return -1;
    }

    if ((want_enc & got_enc) || want_enc == ERLANG_ASCII) {
        /* No transcoding needed: plain copy while looking for 8-bit bytes. */
        int i, found_non_ascii = 0;
        if (len >= destlen)
            return -1;
        for (i=0; i<len; i++) {
            if (s[i] & 0x80) found_non_ascii = 1;
            if (p) p[i] = s[i];
        }
        if (p) p[len] = 0;
        if (want_enc == ERLANG_ASCII && found_non_ascii) {
            return -1;
        }
        if (res_encp) {
            *res_encp = found_non_ascii ? got_enc : ERLANG_ASCII;
        }
    }
    else {
        /* Stored and requested encodings differ: transcode. */
        int plen = (got_enc == ERLANG_LATIN1) ?
          latin1_to_utf8(p, s, len, destlen-1, res_encp) :
          utf8_to_latin1(p, s, len, destlen-1, res_encp);
        /*
         * The terminator goes to p[plen], so plen must stay below destlen.
         * Checking here (rather than trusting the converters to honour
         * destlen-1) also covers destlen <= 0 with an empty atom.
         */
        if (plen < 0 || plen >= destlen) return -1;
        if (p) p[plen] = 0;
    }
    if (was_encp) {
        *was_encp = got_enc;
    }

    /* s already points past the header; skip the atom text as well. */
    s += len;
    *index += s-s0;
    return 0;
}
93 
94 
95 
96 #ifdef HAVE_UNALIGNED_WORD_ACCESS
97 
/*
 * AsciiWord is the unsigned integer type whose size matches a pointer; it
 * is the unit in which ascii_fast_track() scans and copies text.
 * NOTE(review): the SIZEOF_* macros are presumably supplied by the build
 * configuration -- confirm.
 */
#if SIZEOF_VOID_P == SIZEOF_LONG
typedef unsigned long AsciiWord;
#elif SIZEOF_VOID_P == SIZEOF_LONG_LONG
typedef unsigned long long AsciiWord;
#else
#  error "Unknown word type"
#endif

/*
 * ASCII_CHECK_MASK has the top bit of every byte in an AsciiWord set, so
 * (word & ASCII_CHECK_MASK) != 0 means at least one byte is not 7-bit.
 * It is only defined for 4-byte and 8-byte pointers.
 */
#if SIZEOF_VOID_P == 4
#  define ASCII_CHECK_MASK ((AsciiWord)0x80808080U)
#elif SIZEOF_VOID_P == 8
#  define ASCII_CHECK_MASK ((AsciiWord)0x8080808080808080U)
#endif
111 
ascii_fast_track(char * dst,const char * src,int slen,int destlen)112 static int ascii_fast_track(char* dst, const char* src, int slen, int destlen)
113 {
114     const AsciiWord* src_word = (AsciiWord*) src;
115     const AsciiWord* const src_word_end = src_word + (slen / sizeof(AsciiWord));
116 
117     if (destlen < slen)
118         return 0;
119 
120     if (dst) {
121         AsciiWord* dst_word = (AsciiWord*)dst;
122 
123         while (src_word < src_word_end) {
124             if ((*src_word & ASCII_CHECK_MASK) != 0)
125                 break;
126             *dst_word++ = *src_word++;
127         }
128     }
129     else {
130         while (src_word < src_word_end) {
131             if ((*src_word & ASCII_CHECK_MASK) != 0)
132                 break;
133             src_word++;
134         }
135     }
136     return (char*)src_word - src;
137 }
138 #endif /* HAVE_UNALIGNED_WORD_ACCESS */
139 
/*
 * Convert slen bytes of UTF-8 text at src to Latin-1.
 *
 *   dst      - output buffer, or NULL to only validate and measure
 *   src      - UTF-8 input
 *   slen     - number of input bytes
 *   destlen  - maximum number of bytes to produce; enforced even when dst
 *              is NULL
 *   res_encp - if non-NULL, set on success to ERLANG_LATIN1 when at least
 *              one character above 0x7F was produced, else ERLANG_ASCII
 *
 * Returns the number of Latin-1 bytes produced, or -1 if the output limit
 * is reached or the input contains anything other than 7-bit bytes and
 * two-byte sequences with lead byte 0xC2 or 0xC3 (i.e. U+0080..U+00FF).
 * No NUL terminator is written.
 */
int utf8_to_latin1(char* dst, const char* src, int slen, int destlen,
                   erlang_char_encoding* res_encp)
{
    int out = 0;                /* bytes produced so far */
    int found_non_ascii = 0;

#ifdef HAVE_UNALIGNED_WORD_ACCESS
    {
        /* Let the word-wise helper consume a leading run of ASCII. */
        int aft = ascii_fast_track(dst, src, slen, destlen);
        src += aft;
        slen -= aft;
        out += aft;
    }
#endif

    /*
     * Output is tracked by index rather than by advancing dst, so no
     * pointer arithmetic is performed on a NULL dst.
     */
    while (slen > 0) {
        /* Work on unsigned bytes so masks and shifts are well defined. */
        const unsigned char lead = (unsigned char) src[0];

        if (out >= destlen)
            return -1;

        if ((lead & 0x80) == 0) {
            if (dst)
                dst[out] = (char) lead;
            ++src;
            --slen;
        }
        else if (slen > 1 &&
                 (lead & 0xFE) == 0xC2 &&
                 ((unsigned char) src[1] & 0xC0) == 0x80) {
            if (dst) {
                const unsigned char cont = (unsigned char) src[1];
                /* Low two bits of the lead byte become bits 7..6. */
                dst[out] = (char) (((lead & 0x03) << 6) | (cont & 0x3F));
            }
            src += 2;
            slen -= 2;
            found_non_ascii = 1;
        }
        else {
            return -1;
        }
        ++out;
    }
    if (res_encp) {
        *res_encp = found_non_ascii ? ERLANG_LATIN1 : ERLANG_ASCII;
    }
    return out;
}
184 
/*
 * Convert slen bytes of Latin-1 text at src to UTF-8.
 *
 *   dst      - output buffer, or NULL to only measure
 *   src      - Latin-1 input
 *   slen     - number of input bytes
 *   destlen  - maximum number of bytes to produce; enforced even when dst
 *              is NULL
 *   res_encp - if non-NULL, set on success to ERLANG_UTF8 when at least one
 *              input byte was above 0x7F, else ERLANG_ASCII
 *
 * Returns the number of UTF-8 bytes produced (never more than destlen), or
 * -1 if the output does not fit. No NUL terminator is written.
 */
int latin1_to_utf8(char* dst, const char* src, int slen, int destlen,
                   erlang_char_encoding* res_encp)
{
    const char* const src_end = src + slen;
    int out = 0;                /* bytes produced so far */
    int found_non_ascii = 0;

#ifdef HAVE_UNALIGNED_WORD_ACCESS
    {
        /* Let the word-wise helper consume a leading run of ASCII. */
        int aft = ascii_fast_track(dst, src, slen, destlen);
        out += aft;
        src += aft;
    }
#endif

    /*
     * Output is tracked by index rather than by advancing dst, so no
     * pointer arithmetic is performed on a NULL dst.
     */
    for (; src < src_end; ++src) {
        const unsigned char ch = (unsigned char) *src;

        if ((ch & 0x80) == 0) {
            if (out >= destlen)
                return -1;
            if (dst)
                dst[out] = (char) ch;
            ++out;
        }
        else {
            /*
             * A byte above 0x7F expands to two bytes, so two free slots
             * are required; checking for only one would let the second
             * byte land past the limit.
             */
            if (destlen - out < 2)
                return -1;
            if (dst) {
                dst[out]     = (char) (0xC0 | (ch >> 6));
                dst[out + 1] = (char) (0x80 | (ch & 0x3F));
            }
            out += 2;
            found_non_ascii = 1;
        }
    }
    if (res_encp) {
        *res_encp = found_non_ascii ? ERLANG_UTF8 : ERLANG_ASCII;
    }
    return out;
}
225 
226 
227 
/*
 * Decode the atom at *bufp into p as UTF-8 and step *bufp past it.
 *
 * The destination size passed to ei_decode_atom_as() is MAXATOMLEN_UTF8.
 * If was_encp is non-NULL it is forwarded so the caller learns which
 * encoding the atom was stored in.
 *
 * Returns 0 on success, or -1 if decoding fails, in which case *bufp is
 * left unchanged.
 */
int ei_internal_get_atom(const char** bufp, char* p,
                         erlang_char_encoding* was_encp)
{
    int consumed = 0;
    const int rc = ei_decode_atom_as(*bufp, &consumed, p, MAXATOMLEN_UTF8,
                                     ERLANG_UTF8, was_encp, NULL);

    if (rc >= 0) {
        *bufp += consumed;
        return 0;
    }
    return -1;
}
237 
238 
239