google/flatbuffers/Utf8.java

/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.flatbuffers;

import java.nio.ByteBuffer;

import static java.lang.Character.MAX_SURROGATE;
import static java.lang.Character.MIN_SURROGATE;
import static java.lang.Character.MIN_HIGH_SURROGATE;
import static java.lang.Character.MIN_LOW_SURROGATE;
import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
import static java.lang.Character.isSurrogatePair;
import static java.lang.Character.toCodePoint;

public abstract class Utf8 {

  /**
   * Computes how many bytes {@code sequence} occupies once encoded as UTF-8. For a
   * {@link String} the result matches {@code string.getBytes(UTF_8).length}, without the cost of
   * materializing the byte array.
   *
   * @param sequence the UTF-16 text to measure
   * @return the UTF-8 encoded size in bytes
   * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
   *     surrogates)
   */
  public abstract int encodedLength(CharSequence sequence);

  /**
   * Writes the UTF-8 encoding of {@code in} into {@code out}.
   *
   * <p>Implementations are expected to pick a strategy suited to the kind of {@link ByteBuffer}
   * they are handed (heap or direct) and to what the platform supports.
   *
   * @param in the source string to be encoded
   * @param out the target buffer to receive the encoded string.
   */
  public abstract void encodeUtf8(CharSequence in, ByteBuffer out);

  /**
   * Turns a UTF-8 encoded region of {@code buffer} into a {@link String}.
   *
   * @param buffer the buffer holding the encoded text
   * @param offset position of the first encoded byte within {@code buffer}
   * @param length number of encoded bytes to decode
   * @return the decoded string
   * @throws IllegalArgumentException if the input is not valid UTF-8.
   */
  public abstract String decodeUtf8(ByteBuffer buffer, int offset, int length);

  // Process-wide default processor; created on first use by getDefault() unless setDefault()
  // installed one beforehand.
  private static Utf8 DEFAULT;

  /**
   * Returns the default UTF-8 processor, creating a {@code Utf8Safe} instance the first time it
   * is requested while no default is set.
   *
   * @return the default processor
   */
  public static Utf8 getDefault() {
    if (DEFAULT != null) {
      return DEFAULT;
    }
    DEFAULT = new Utf8Safe();
    return DEFAULT;
  }

  /**
   * Replaces the default UTF-8 processor. Passing {@code null} clears it, so the next
   * {@link #getDefault()} call creates a fresh one.
   *
   * @param instance the new instance to use
   */
  public static void setDefault(Utf8 instance) {
    DEFAULT = instance;
  }

  /**
   * Encodes the single code point that begins at {@code start} in {@code in} as UTF-8, writing the
   * bytes to the front of {@code out}.
   *
   * @param in CharSequence to read the code point from
   * @param start index of the first char of the code point
   * @param out scratch array of at least 4 bytes that receives the encoded bytes
   * @return the number of bytes written (1 to 4), or 0 when {@code start} is at or past the end
   *     of {@code in}
   * @throws UnpairedSurrogateException if the char at {@code start} is a surrogate that is not
   *     the high half of a valid surrogate pair
   */
  public static int encodeUtf8CodePoint(CharSequence in, int start, byte[] out) {
    // A code point expands to at most four UTF-8 bytes, so the scratch array has to offer room
    // for that many.
    assert out.length >= 4;

    final int length = in.length();
    if (start >= length) {
      return 0;
    }

    final char unit = in.charAt(start);

    // One byte: 0xxx xxxx
    if (unit < 0x80) {
      out[0] = (byte) unit;
      return 1;
    }

    // Two bytes: 110x xxxx, 10xx xxxx
    if (unit < 0x800) {
      out[0] = (byte) (0xC0 | (unit >>> 6));
      out[1] = (byte) (0x80 | (unit & 0x3F));
      return 2;
    }

    // Three bytes: 1110 xxxx, 10xx xxxx, 10xx xxxx
    // Any non-surrogate char fits here, since a single char tops out at 0xFFFF (16 bits).
    final boolean isSurrogate = MIN_SURROGATE <= unit && unit <= MAX_SURROGATE;
    if (!isSurrogate) {
      out[0] = (byte) (0xE0 | (unit >>> 12));
      out[1] = (byte) (0x80 | ((unit >>> 6) & 0x3F));
      out[2] = (byte) (0x80 | (unit & 0x3F));
      return 3;
    }

    // Four bytes: 1111 0xxx, 10xx xxxx, 10xx xxxx, 10xx xxxx
    // A surrogate pair denotes a code point of at least 0x10000 (17 bits), hence four bytes.
    final int next = start + 1;
    if (next == length) {
      // The surrogate is the last char: nothing left to pair it with.
      throw new UnpairedSurrogateException(start, length);
    }
    final char trail = in.charAt(next);
    if (!isSurrogatePair(unit, trail)) {
      throw new UnpairedSurrogateException(start, length);
    }

    final int codePoint = toCodePoint(unit, trail);
    out[0] = (byte) (0xF0 | (codePoint >>> 18));
    out[1] = (byte) (0x80 | ((codePoint >>> 12) & 0x3F));
    out[2] = (byte) (0x80 | ((codePoint >>> 6) & 0x3F));
    out[3] = (byte) (0x80 | (codePoint & 0x3F));
    return 4;
  }

  /**
   * Helpers for turning UTF-8 bytes into UTF-16 chars. Callers fetch the bytes themselves and make
   * sure enough bytes remain; the {@code handle*} methods perform the remaining well-formedness
   * checks and the conversion to chars.
   */
  static class DecodeUtil {

    /**
     * Returns whether {@code b} is a complete single-byte (ASCII) code point, i.e. has the form
     * '0XXXXXXX'.
     */
    static boolean isOneByte(byte b) {
      return (b & 0x80) == 0;
    }

    /**
     * Returns whether {@code b}, compared as a signed byte, is below 0xE0. For a lead byte that
     * is not ASCII this means it is not a three- or four-byte lead ('110XXXXX' is the two-byte
     * lead form). Bytes of the form '10XXXXXX' also satisfy this test; they are rejected later by
     * {@link #handleTwoBytes}.
     */
    static boolean isTwoBytes(byte b) {
      return b < (byte) 0xE0;
    }

    /**
     * Returns whether {@code b}, compared as a signed byte, is below 0xF0. For a lead byte that
     * has already failed {@link #isOneByte} and {@link #isTwoBytes} this identifies the three-byte
     * lead form '1110XXXX'.
     */
    static boolean isThreeBytes(byte b) {
      return b < (byte) 0xF0;
    }

    /**
     * Stores the char for a one-byte sequence at {@code resultArr[resultPos]}. No validation is
     * performed on {@code byte1}.
     */
    static void handleOneByte(byte byte1, char[] resultArr, int resultPos) {
      resultArr[resultPos] = (char) byte1;
    }

    /**
     * Validates a two-byte sequence and stores the decoded char at {@code resultArr[resultPos]}.
     *
     * @throws IllegalArgumentException if {@code byte1} is not a legal two-byte lead or
     *     {@code byte2} is not a continuation byte
     */
    static void handleTwoBytes(
        byte byte1, byte byte2, char[] resultArr, int resultPos)
        throws IllegalArgumentException {
      // A single signed comparison rejects both continuation bytes used as a lead and the
      // overlong leads 0xC0 / 0xC1.
      if (byte1 < (byte) 0xC2) {
        throw new IllegalArgumentException("Invalid UTF-8: Illegal leading byte in 2 bytes utf");
      }
      if (isNotTrailingByte(byte2)) {
        throw new IllegalArgumentException("Invalid UTF-8: Illegal trailing byte in 2 bytes utf");
      }
      final int upperBits = (byte1 & 0x1F) << 6;
      final int lowerBits = trailingByteValue(byte2);
      resultArr[resultPos] = (char) (upperBits | lowerBits);
    }

    /**
     * Validates a three-byte sequence and stores the decoded char at {@code resultArr[resultPos]}.
     *
     * @throws IllegalArgumentException if a continuation byte is malformed, the sequence is
     *     overlong, or it encodes a surrogate code point
     */
    static void handleThreeBytes(
        byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)
        throws IllegalArgumentException {
      if (isNotTrailingByte(byte2) || isNotTrailingByte(byte3)) {
        throw new IllegalArgumentException("Invalid UTF-8");
      }
      // Overlong form: with lead 0xE0 the five most significant payload bits must not all be zero.
      if (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) {
        throw new IllegalArgumentException("Invalid UTF-8");
      }
      // Lead 0xED with a second byte of 0xA0 or above would encode a surrogate code point.
      if (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) {
        throw new IllegalArgumentException("Invalid UTF-8");
      }
      final int top = (byte1 & 0x0F) << 12;
      final int middle = trailingByteValue(byte2) << 6;
      final int bottom = trailingByteValue(byte3);
      resultArr[resultPos] = (char) (top | middle | bottom);
    }

    /**
     * Validates a four-byte sequence and stores the resulting surrogate pair at
     * {@code resultArr[resultPos]} (high) and {@code resultArr[resultPos + 1]} (low).
     *
     * @throws IllegalArgumentException if a continuation byte is malformed or the encoded code
     *     point lies outside planes 1 through 16
     */
    static void handleFourBytes(
        byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
        throws IllegalArgumentException{
      if (isNotTrailingByte(byte2) || isNotTrailingByte(byte3) || isNotTrailingByte(byte4)) {
        throw new IllegalArgumentException("Invalid UTF-8");
      }
      // Checks that 1 <= plane <= 16. This is a compact form of:
      //   byte1 > (byte) 0xF4                               (not a valid 4-byte lead)
      //   || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90    (overlong encoding)
      //   || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F    (beyond U+10FFFF)
      final int planeCheck = ((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30;
      if (planeCheck != 0) {
        throw new IllegalArgumentException("Invalid UTF-8");
      }
      final int codePoint = ((byte1 & 0x07) << 18)
          | (trailingByteValue(byte2) << 12)
          | (trailingByteValue(byte3) << 6)
          | trailingByteValue(byte4);
      final char high = DecodeUtil.highSurrogate(codePoint);
      final char low = DecodeUtil.lowSurrogate(codePoint);
      resultArr[resultPos] = high;
      resultArr[resultPos + 1] = low;
    }

    /**
     * Returns whether {@code b} fails to be a continuation byte of the form '10XXXXXX'. As signed
     * values the continuation bytes 0x80..0xBF are the lowest ones, so anything above 0xBF is not
     * a continuation byte.
     */
    private static boolean isNotTrailingByte(byte b) {
      return b > (byte) 0xBF;
    }

    /**
     * Returns the six payload bits of a continuation byte, dropping its '10' prefix.
     */
    private static int trailingByteValue(byte b) {
      return b & 0x3F;
    }

    /** Returns the high (leading) surrogate for a supplementary code point. */
    private static char highSurrogate(int codePoint) {
      final int base = MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10);
      return (char) (base + (codePoint >>> 10));
    }

    /** Returns the low (trailing) surrogate for a supplementary code point. */
    private static char lowSurrogate(int codePoint) {
      final int lowTenBits = codePoint & 0x3ff;
      return (char) (MIN_LOW_SURROGATE + lowTenBits);
    }
  }

  /**
   * Signals a surrogate char that is not part of a valid high/low pair. It is a dedicated
   * {@link IllegalArgumentException} subtype, so callers can single out this failure.
   *
   * <p>Historical note carried over from the original comment: the UTF-8 handling code was
   * reportedly adapted from Guava / protocol buffers, where the dedicated exception type lets the
   * caller fall back to more lenient behavior. Whether any caller in this project relies on that
   * is not visible from this file.
   */
  static class UnpairedSurrogateException extends IllegalArgumentException {
    UnpairedSurrogateException(int index, int length) {
      super("Unpaired surrogate at index " + index + " of " + length);
    }
  }
}