1 /*
2  * Copyright 2014 Google Inc. All rights reserved.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.flatbuffers;
18 
19 import java.nio.ByteBuffer;
20 
21 import static java.lang.Character.MAX_SURROGATE;
22 import static java.lang.Character.MIN_SURROGATE;
23 import static java.lang.Character.MIN_HIGH_SURROGATE;
24 import static java.lang.Character.MIN_LOW_SURROGATE;
25 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
26 import static java.lang.Character.isSurrogatePair;
27 import static java.lang.Character.toCodePoint;
28 
29 public abstract class Utf8 {
30 
31   /**
32    * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
33    * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
34    * both time and space.
35    *
36    * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
37    *     surrogates)
38    */
encodedLength(CharSequence sequence)39   public abstract int encodedLength(CharSequence sequence);
40 
41   /**
42    * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding.
43    *
44    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
45    * and the capabilities of the platform.
46    *
47    * @param in the source string to be encoded
48    * @param out the target buffer to receive the encoded string.
49    */
encodeUtf8(CharSequence in, ByteBuffer out)50   public abstract void encodeUtf8(CharSequence in, ByteBuffer out);
51 
52   /**
53    * Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}.
54    *
55    * @throws IllegalArgumentException if the input is not valid UTF-8.
56    */
decodeUtf8(ByteBuffer buffer, int offset, int length)57   public abstract String decodeUtf8(ByteBuffer buffer, int offset, int length);
58 
59   private static Utf8 DEFAULT;
60 
61   /**
62    * Get the default UTF-8 processor.
63    * @return the default processor
64    */
getDefault()65   public static Utf8 getDefault() {
66     if (DEFAULT == null) {
67       DEFAULT = new Utf8Safe();
68     }
69     return DEFAULT;
70   }
71 
72   /**
73    * Set the default instance of the UTF-8 processor.
74    * @param instance the new instance to use
75    */
setDefault(Utf8 instance)76   public static void setDefault(Utf8 instance) {
77     DEFAULT = instance;
78   }
79 
80   /**
81    * Encode a Java's CharSequence UTF8 codepoint into a byte array.
82    * @param in CharSequence to be encoded
83    * @param start start position of the first char in the codepoint
84    * @param out byte array of 4 bytes to be filled
85    * @return return the amount of bytes occupied by the codepoint
86    */
encodeUtf8CodePoint(CharSequence in, int start, byte[] out)87   public static int encodeUtf8CodePoint(CharSequence in, int start, byte[] out) {
88     // utf8 codepoint needs at least 4 bytes
89     assert out.length >= 4;
90 
91     final int inLength = in.length();
92     if (start >= inLength) {
93       return 0;
94     }
95 
96     char c = in.charAt(start);
97      if (c < 0x80) {
98        // One byte (0xxx xxxx)
99        out[0] = (byte) c;
100        return 1;
101      } else if (c < 0x800) {
102       // Two bytes (110x xxxx 10xx xxxx)
103       out[0] = (byte) (0xC0 | (c >>> 6));
104       out[1] = (byte) (0x80 | (0x3F & c));
105       return 2;
106     } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) {
107       // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
108       // Maximum single-char code point is 0xFFFF, 16 bits.
109       out[0] = (byte) (0xE0 | (c >>> 12));
110       out[1] =(byte) (0x80 | (0x3F & (c >>> 6)));
111       out[2] = (byte) (0x80 | (0x3F & c));
112       return 3;
113     } else {
114       // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx)
115       // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
116       // bytes
117       final char low;
118       if (start + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(start+1)))) {
119         throw new UnpairedSurrogateException(start, inLength);
120       }
121       int codePoint = toCodePoint(c, low);
122       out[0] = (byte) ((0xF << 4) | (codePoint >>> 18));
123       out[1] = (byte) (0x80 | (0x3F & (codePoint >>> 12)));
124       out[2] = (byte) (0x80 | (0x3F & (codePoint >>> 6)));
125       out[3] = (byte) (0x80 | (0x3F & codePoint));
126       return 4;
127     }
128   }
129 
130   /**
131    * Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting
132    * bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity
133    * checks and codepoint conversion happen in this class.
134    */
135   static class DecodeUtil {
136 
137     /**
138      * Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'.
139      */
isOneByte(byte b)140     static boolean isOneByte(byte b) {
141       return b >= 0;
142     }
143 
144     /**
145      * Returns whether this is a two-byte codepoint with the form '10XXXXXX'.
146      */
isTwoBytes(byte b)147     static boolean isTwoBytes(byte b) {
148       return b < (byte) 0xE0;
149     }
150 
151     /**
152      * Returns whether this is a three-byte codepoint with the form '110XXXXX'.
153      */
isThreeBytes(byte b)154     static boolean isThreeBytes(byte b) {
155       return b < (byte) 0xF0;
156     }
157 
handleOneByte(byte byte1, char[] resultArr, int resultPos)158     static void handleOneByte(byte byte1, char[] resultArr, int resultPos) {
159       resultArr[resultPos] = (char) byte1;
160     }
161 
handleTwoBytes( byte byte1, byte byte2, char[] resultArr, int resultPos)162     static void handleTwoBytes(
163         byte byte1, byte byte2, char[] resultArr, int resultPos)
164         throws IllegalArgumentException {
165       // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
166       // overlong 2-byte, '11000001'.
167       if (byte1 < (byte) 0xC2) {
168         throw new IllegalArgumentException("Invalid UTF-8: Illegal leading byte in 2 bytes utf");
169       }
170       if (isNotTrailingByte(byte2)) {
171         throw new IllegalArgumentException("Invalid UTF-8: Illegal trailing byte in 2 bytes utf");
172       }
173       resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2));
174     }
175 
handleThreeBytes( byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)176     static void handleThreeBytes(
177         byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)
178         throws IllegalArgumentException {
179       if (isNotTrailingByte(byte2)
180               // overlong? 5 most significant bits must not all be zero
181               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
182               // check for illegal surrogate codepoints
183               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
184               || isNotTrailingByte(byte3)) {
185         throw new IllegalArgumentException("Invalid UTF-8");
186       }
187       resultArr[resultPos] = (char)
188                                  (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3));
189     }
190 
handleFourBytes( byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)191     static void handleFourBytes(
192         byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
193         throws IllegalArgumentException{
194       if (isNotTrailingByte(byte2)
195               // Check that 1 <= plane <= 16.  Tricky optimized form of:
196               //   valid 4-byte leading byte?
197               // if (byte1 > (byte) 0xF4 ||
198               //   overlong? 4 most significant bits must not all be zero
199               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
200               //   codepoint larger than the highest code point (U+10FFFF)?
201               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
202               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
203               || isNotTrailingByte(byte3)
204               || isNotTrailingByte(byte4)) {
205         throw new IllegalArgumentException("Invalid UTF-8");
206       }
207       int codepoint = ((byte1 & 0x07) << 18)
208                           | (trailingByteValue(byte2) << 12)
209                           | (trailingByteValue(byte3) << 6)
210                           | trailingByteValue(byte4);
211       resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint);
212       resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint);
213     }
214 
215     /**
216      * Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
217      */
isNotTrailingByte(byte b)218     private static boolean isNotTrailingByte(byte b) {
219       return b > (byte) 0xBF;
220     }
221 
222     /**
223      * Returns the actual value of the trailing byte (removes the prefix '10') for composition.
224      */
trailingByteValue(byte b)225     private static int trailingByteValue(byte b) {
226       return b & 0x3F;
227     }
228 
highSurrogate(int codePoint)229     private static char highSurrogate(int codePoint) {
230       return (char) ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10))
231                          + (codePoint >>> 10));
232     }
233 
lowSurrogate(int codePoint)234     private static char lowSurrogate(int codePoint) {
235       return (char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff));
236     }
237   }
238 
239   // These UTF-8 handling methods are copied from Guava's Utf8Unsafe class with a modification to throw
240   // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
241   // fallback to more lenient behavior.
242   static class UnpairedSurrogateException extends IllegalArgumentException {
UnpairedSurrogateException(int index, int length)243     UnpairedSurrogateException(int index, int length) {
244       super("Unpaired surrogate at index " + index + " of " + length);
245     }
246   }
247 }
248