1 /* 2 * Copyright 2014 Google Inc. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.flatbuffers; 18 19 import java.nio.ByteBuffer; 20 21 import static java.lang.Character.MAX_SURROGATE; 22 import static java.lang.Character.MIN_SURROGATE; 23 import static java.lang.Character.MIN_HIGH_SURROGATE; 24 import static java.lang.Character.MIN_LOW_SURROGATE; 25 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT; 26 import static java.lang.Character.isSurrogatePair; 27 import static java.lang.Character.toCodePoint; 28 29 public abstract class Utf8 { 30 31 /** 32 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, 33 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in 34 * both time and space. 35 * 36 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired 37 * surrogates) 38 */ encodedLength(CharSequence sequence)39 public abstract int encodedLength(CharSequence sequence); 40 41 /** 42 * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding. 43 * 44 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct) 45 * and the capabilities of the platform. 46 * 47 * @param in the source string to be encoded 48 * @param out the target buffer to receive the encoded string. 49 */ encodeUtf8(CharSequence in, ByteBuffer out)50 public abstract void encodeUtf8(CharSequence in, ByteBuffer out); 51 52 /** 53 * Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}. 54 * 55 * @throws IllegalArgumentException if the input is not valid UTF-8. 56 */ decodeUtf8(ByteBuffer buffer, int offset, int length)57 public abstract String decodeUtf8(ByteBuffer buffer, int offset, int length); 58 59 private static Utf8 DEFAULT; 60 61 /** 62 * Get the default UTF-8 processor. 63 * @return the default processor 64 */ getDefault()65 public static Utf8 getDefault() { 66 if (DEFAULT == null) { 67 DEFAULT = new Utf8Safe(); 68 } 69 return DEFAULT; 70 } 71 72 /** 73 * Set the default instance of the UTF-8 processor. 74 * @param instance the new instance to use 75 */ setDefault(Utf8 instance)76 public static void setDefault(Utf8 instance) { 77 DEFAULT = instance; 78 } 79 80 /** 81 * Encode a Java's CharSequence UTF8 codepoint into a byte array. 82 * @param in CharSequence to be encoded 83 * @param start start position of the first char in the codepoint 84 * @param out byte array of 4 bytes to be filled 85 * @return return the amount of bytes occupied by the codepoint 86 */ encodeUtf8CodePoint(CharSequence in, int start, byte[] out)87 public static int encodeUtf8CodePoint(CharSequence in, int start, byte[] out) { 88 // utf8 codepoint needs at least 4 bytes 89 assert out.length >= 4; 90 91 final int inLength = in.length(); 92 if (start >= inLength) { 93 return 0; 94 } 95 96 char c = in.charAt(start); 97 if (c < 0x80) { 98 // One byte (0xxx xxxx) 99 out[0] = (byte) c; 100 return 1; 101 } else if (c < 0x800) { 102 // Two bytes (110x xxxx 10xx xxxx) 103 out[0] = (byte) (0xC0 | (c >>> 6)); 104 out[1] = (byte) (0x80 | (0x3F & c)); 105 return 2; 106 } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) { 107 // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx) 108 // Maximum single-char code point is 0xFFFF, 16 bits. 109 out[0] = (byte) (0xE0 | (c >>> 12)); 110 out[1] =(byte) (0x80 | (0x3F & (c >>> 6))); 111 out[2] = (byte) (0x80 | (0x3F & c)); 112 return 3; 113 } else { 114 // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx) 115 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 116 // bytes 117 final char low; 118 if (start + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(start+1)))) { 119 throw new UnpairedSurrogateException(start, inLength); 120 } 121 int codePoint = toCodePoint(c, low); 122 out[0] = (byte) ((0xF << 4) | (codePoint >>> 18)); 123 out[1] = (byte) (0x80 | (0x3F & (codePoint >>> 12))); 124 out[2] = (byte) (0x80 | (0x3F & (codePoint >>> 6))); 125 out[3] = (byte) (0x80 | (0x3F & codePoint)); 126 return 4; 127 } 128 } 129 130 /** 131 * Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting 132 * bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity 133 * checks and codepoint conversion happen in this class. 134 */ 135 static class DecodeUtil { 136 137 /** 138 * Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'. 139 */ isOneByte(byte b)140 static boolean isOneByte(byte b) { 141 return b >= 0; 142 } 143 144 /** 145 * Returns whether this is a two-byte codepoint with the form '10XXXXXX'. 146 */ isTwoBytes(byte b)147 static boolean isTwoBytes(byte b) { 148 return b < (byte) 0xE0; 149 } 150 151 /** 152 * Returns whether this is a three-byte codepoint with the form '110XXXXX'. 153 */ isThreeBytes(byte b)154 static boolean isThreeBytes(byte b) { 155 return b < (byte) 0xF0; 156 } 157 handleOneByte(byte byte1, char[] resultArr, int resultPos)158 static void handleOneByte(byte byte1, char[] resultArr, int resultPos) { 159 resultArr[resultPos] = (char) byte1; 160 } 161 handleTwoBytes( byte byte1, byte byte2, char[] resultArr, int resultPos)162 static void handleTwoBytes( 163 byte byte1, byte byte2, char[] resultArr, int resultPos) 164 throws IllegalArgumentException { 165 // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and 166 // overlong 2-byte, '11000001'. 167 if (byte1 < (byte) 0xC2) { 168 throw new IllegalArgumentException("Invalid UTF-8: Illegal leading byte in 2 bytes utf"); 169 } 170 if (isNotTrailingByte(byte2)) { 171 throw new IllegalArgumentException("Invalid UTF-8: Illegal trailing byte in 2 bytes utf"); 172 } 173 resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2)); 174 } 175 handleThreeBytes( byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)176 static void handleThreeBytes( 177 byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos) 178 throws IllegalArgumentException { 179 if (isNotTrailingByte(byte2) 180 // overlong? 5 most significant bits must not all be zero 181 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 182 // check for illegal surrogate codepoints 183 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 184 || isNotTrailingByte(byte3)) { 185 throw new IllegalArgumentException("Invalid UTF-8"); 186 } 187 resultArr[resultPos] = (char) 188 (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3)); 189 } 190 handleFourBytes( byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)191 static void handleFourBytes( 192 byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos) 193 throws IllegalArgumentException{ 194 if (isNotTrailingByte(byte2) 195 // Check that 1 <= plane <= 16. Tricky optimized form of: 196 // valid 4-byte leading byte? 197 // if (byte1 > (byte) 0xF4 || 198 // overlong? 4 most significant bits must not all be zero 199 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 200 // codepoint larger than the highest code point (U+10FFFF)? 201 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 202 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 203 || isNotTrailingByte(byte3) 204 || isNotTrailingByte(byte4)) { 205 throw new IllegalArgumentException("Invalid UTF-8"); 206 } 207 int codepoint = ((byte1 & 0x07) << 18) 208 | (trailingByteValue(byte2) << 12) 209 | (trailingByteValue(byte3) << 6) 210 | trailingByteValue(byte4); 211 resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint); 212 resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint); 213 } 214 215 /** 216 * Returns whether the byte is not a valid continuation of the form '10XXXXXX'. 217 */ isNotTrailingByte(byte b)218 private static boolean isNotTrailingByte(byte b) { 219 return b > (byte) 0xBF; 220 } 221 222 /** 223 * Returns the actual value of the trailing byte (removes the prefix '10') for composition. 224 */ trailingByteValue(byte b)225 private static int trailingByteValue(byte b) { 226 return b & 0x3F; 227 } 228 highSurrogate(int codePoint)229 private static char highSurrogate(int codePoint) { 230 return (char) ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)) 231 + (codePoint >>> 10)); 232 } 233 lowSurrogate(int codePoint)234 private static char lowSurrogate(int codePoint) { 235 return (char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff)); 236 } 237 } 238 239 // These UTF-8 handling methods are copied from Guava's Utf8Unsafe class with a modification to throw 240 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can 241 // fallback to more lenient behavior. 242 static class UnpairedSurrogateException extends IllegalArgumentException { UnpairedSurrogateException(int index, int length)243 UnpairedSurrogateException(int index, int length) { 244 super("Unpaired surrogate at index " + index + " of " + length); 245 } 246 } 247 } 248