1 /** 2 * Licensed to the Apache Software Foundation (ASF) under one 3 * or more contributor license agreements. See the NOTICE file 4 * distributed with this work for additional information 5 * regarding copyright ownership. The ASF licenses this file 6 * to you under the Apache License, Version 2.0 (the 7 * "License"); you may not use this file except in compliance 8 * with the License. You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18 19 package org.apache.hadoop.record; 20 21 import java.io.DataInput; 22 import java.io.DataOutput; 23 import java.io.IOException; 24 import org.apache.hadoop.io.WritableComparator; 25 import org.apache.hadoop.io.WritableUtils; 26 27 /** 28 * Various utility functions for Hadooop record I/O runtime. 29 */ 30 public class Utils { 31 32 /** Cannot create a new instance of Utils */ Utils()33 private Utils() { 34 } 35 36 public static final char[] hexchars = { '0', '1', '2', '3', '4', '5', 37 '6', '7', '8', '9', 'A', 'B', 38 'C', 'D', 'E', 'F' }; 39 /** 40 * 41 * @param s 42 * @return 43 */ toXMLString(String s)44 static String toXMLString(String s) { 45 StringBuffer sb = new StringBuffer(); 46 for (int idx = 0; idx < s.length(); idx++) { 47 char ch = s.charAt(idx); 48 if (ch == '<') { 49 sb.append("<"); 50 } else if (ch == '&') { 51 sb.append("&"); 52 } else if (ch == '%') { 53 sb.append("%0025"); 54 } else if (ch < 0x20 || 55 (ch > 0xD7FF && ch < 0xE000) || 56 (ch > 0xFFFD)) { 57 sb.append("%"); 58 sb.append(hexchars[(ch & 0xF000) >> 12]); 59 sb.append(hexchars[(ch & 0x0F00) >> 8]); 60 sb.append(hexchars[(ch & 0x00F0) >> 4]); 61 sb.append(hexchars[(ch & 0x000F)]); 62 } else { 63 sb.append(ch); 64 } 65 } 66 return sb.toString(); 67 } 68 h2c(char ch)69 static private int h2c(char ch) { 70 if (ch >= '0' && ch <= '9') { 71 return ch - '0'; 72 } else if (ch >= 'A' && ch <= 'F') { 73 return ch - 'A' + 10; 74 } else if (ch >= 'a' && ch <= 'f') { 75 return ch - 'a' + 10; 76 } 77 return 0; 78 } 79 80 /** 81 * 82 * @param s 83 * @return 84 */ fromXMLString(String s)85 static String fromXMLString(String s) { 86 StringBuffer sb = new StringBuffer(); 87 for (int idx = 0; idx < s.length();) { 88 char ch = s.charAt(idx++); 89 if (ch == '%') { 90 int ch1 = h2c(s.charAt(idx++)) << 12; 91 int ch2 = h2c(s.charAt(idx++)) << 8; 92 int ch3 = h2c(s.charAt(idx++)) << 4; 93 int ch4 = h2c(s.charAt(idx++)); 94 char res = (char)(ch1 | ch2 | ch3 | ch4); 95 sb.append(res); 96 } else { 97 sb.append(ch); 98 } 99 } 100 return sb.toString(); 101 } 102 103 /** 104 * 105 * @param s 106 * @return 107 */ toCSVString(String s)108 static String toCSVString(String s) { 109 StringBuffer sb = new StringBuffer(s.length()+1); 110 sb.append('\''); 111 int len = s.length(); 112 for (int i = 0; i < len; i++) { 113 char c = s.charAt(i); 114 switch(c) { 115 case '\0': 116 sb.append("%00"); 117 break; 118 case '\n': 119 sb.append("%0A"); 120 break; 121 case '\r': 122 sb.append("%0D"); 123 break; 124 case ',': 125 sb.append("%2C"); 126 break; 127 case '}': 128 sb.append("%7D"); 129 break; 130 case '%': 131 sb.append("%25"); 132 break; 133 default: 134 sb.append(c); 135 } 136 } 137 return sb.toString(); 138 } 139 140 /** 141 * 142 * @param s 143 * @throws java.io.IOException 144 * @return 145 */ fromCSVString(String s)146 static String fromCSVString(String s) throws IOException { 147 if (s.charAt(0) != '\'') { 148 throw new IOException("Error deserializing string."); 149 } 150 int len = s.length(); 151 StringBuffer sb = new StringBuffer(len-1); 152 for (int i = 1; i < len; i++) { 153 char c = s.charAt(i); 154 if (c == '%') { 155 char ch1 = s.charAt(i+1); 156 char ch2 = s.charAt(i+2); 157 i += 2; 158 if (ch1 == '0' && ch2 == '0') { 159 sb.append('\0'); 160 } else if (ch1 == '0' && ch2 == 'A') { 161 sb.append('\n'); 162 } else if (ch1 == '0' && ch2 == 'D') { 163 sb.append('\r'); 164 } else if (ch1 == '2' && ch2 == 'C') { 165 sb.append(','); 166 } else if (ch1 == '7' && ch2 == 'D') { 167 sb.append('}'); 168 } else if (ch1 == '2' && ch2 == '5') { 169 sb.append('%'); 170 } else { 171 throw new IOException("Error deserializing string."); 172 } 173 } else { 174 sb.append(c); 175 } 176 } 177 return sb.toString(); 178 } 179 180 /** 181 * 182 * @param s 183 * @return 184 */ toXMLBuffer(Buffer s)185 static String toXMLBuffer(Buffer s) { 186 return s.toString(); 187 } 188 189 /** 190 * 191 * @param s 192 * @throws java.io.IOException 193 * @return 194 */ fromXMLBuffer(String s)195 static Buffer fromXMLBuffer(String s) 196 throws IOException { 197 if (s.length() == 0) { return new Buffer(); } 198 int blen = s.length()/2; 199 byte[] barr = new byte[blen]; 200 for (int idx = 0; idx < blen; idx++) { 201 char c1 = s.charAt(2*idx); 202 char c2 = s.charAt(2*idx+1); 203 barr[idx] = (byte)Integer.parseInt(""+c1+c2, 16); 204 } 205 return new Buffer(barr); 206 } 207 208 /** 209 * 210 * @param buf 211 * @return 212 */ toCSVBuffer(Buffer buf)213 static String toCSVBuffer(Buffer buf) { 214 StringBuffer sb = new StringBuffer("#"); 215 sb.append(buf.toString()); 216 return sb.toString(); 217 } 218 219 /** 220 * Converts a CSV-serialized representation of buffer to a new 221 * Buffer 222 * @param s CSV-serialized representation of buffer 223 * @throws java.io.IOException 224 * @return Deserialized Buffer 225 */ fromCSVBuffer(String s)226 static Buffer fromCSVBuffer(String s) 227 throws IOException { 228 if (s.charAt(0) != '#') { 229 throw new IOException("Error deserializing buffer."); 230 } 231 if (s.length() == 1) { return new Buffer(); } 232 int blen = (s.length()-1)/2; 233 byte[] barr = new byte[blen]; 234 for (int idx = 0; idx < blen; idx++) { 235 char c1 = s.charAt(2*idx+1); 236 char c2 = s.charAt(2*idx+2); 237 barr[idx] = (byte)Integer.parseInt(""+c1+c2, 16); 238 } 239 return new Buffer(barr); 240 } 241 utf8LenForCodePoint(final int cpt)242 private static int utf8LenForCodePoint(final int cpt) throws IOException { 243 if (cpt >=0 && cpt <= 0x7F) { 244 return 1; 245 } 246 if (cpt >= 0x80 && cpt <= 0x07FF) { 247 return 2; 248 } 249 if ((cpt >= 0x0800 && cpt < 0xD800) || 250 (cpt > 0xDFFF && cpt <= 0xFFFD)) { 251 return 3; 252 } 253 if (cpt >= 0x10000 && cpt <= 0x10FFFF) { 254 return 4; 255 } 256 throw new IOException("Illegal Unicode Codepoint "+ 257 Integer.toHexString(cpt)+" in string."); 258 } 259 260 private static final int B10 = Integer.parseInt("10000000", 2); 261 private static final int B110 = Integer.parseInt("11000000", 2); 262 private static final int B1110 = Integer.parseInt("11100000", 2); 263 private static final int B11110 = Integer.parseInt("11110000", 2); 264 private static final int B11 = Integer.parseInt("11000000", 2); 265 private static final int B111 = Integer.parseInt("11100000", 2); 266 private static final int B1111 = Integer.parseInt("11110000", 2); 267 private static final int B11111 = Integer.parseInt("11111000", 2); 268 writeUtf8(int cpt, final byte[] bytes, final int offset)269 private static int writeUtf8(int cpt, final byte[] bytes, final int offset) 270 throws IOException { 271 if (cpt >=0 && cpt <= 0x7F) { 272 bytes[offset] = (byte) cpt; 273 return 1; 274 } 275 if (cpt >= 0x80 && cpt <= 0x07FF) { 276 bytes[offset+1] = (byte) (B10 | (cpt & 0x3F)); 277 cpt = cpt >> 6; 278 bytes[offset] = (byte) (B110 | (cpt & 0x1F)); 279 return 2; 280 } 281 if ((cpt >= 0x0800 && cpt < 0xD800) || 282 (cpt > 0xDFFF && cpt <= 0xFFFD)) { 283 bytes[offset+2] = (byte) (B10 | (cpt & 0x3F)); 284 cpt = cpt >> 6; 285 bytes[offset+1] = (byte) (B10 | (cpt & 0x3F)); 286 cpt = cpt >> 6; 287 bytes[offset] = (byte) (B1110 | (cpt & 0x0F)); 288 return 3; 289 } 290 if (cpt >= 0x10000 && cpt <= 0x10FFFF) { 291 bytes[offset+3] = (byte) (B10 | (cpt & 0x3F)); 292 cpt = cpt >> 6; 293 bytes[offset+2] = (byte) (B10 | (cpt & 0x3F)); 294 cpt = cpt >> 6; 295 bytes[offset+1] = (byte) (B10 | (cpt & 0x3F)); 296 cpt = cpt >> 6; 297 bytes[offset] = (byte) (B11110 | (cpt & 0x07)); 298 return 4; 299 } 300 throw new IOException("Illegal Unicode Codepoint "+ 301 Integer.toHexString(cpt)+" in string."); 302 } 303 toBinaryString(final DataOutput out, final String str)304 static void toBinaryString(final DataOutput out, final String str) 305 throws IOException { 306 final int strlen = str.length(); 307 byte[] bytes = new byte[strlen*4]; // Codepoints expand to 4 bytes max 308 int utf8Len = 0; 309 int idx = 0; 310 while(idx < strlen) { 311 final int cpt = str.codePointAt(idx); 312 idx += Character.isSupplementaryCodePoint(cpt) ? 2 : 1; 313 utf8Len += writeUtf8(cpt, bytes, utf8Len); 314 } 315 writeVInt(out, utf8Len); 316 out.write(bytes, 0, utf8Len); 317 } 318 isValidCodePoint(int cpt)319 static boolean isValidCodePoint(int cpt) { 320 return !((cpt > 0x10FFFF) || 321 (cpt >= 0xD800 && cpt <= 0xDFFF) || 322 (cpt >= 0xFFFE && cpt <=0xFFFF)); 323 } 324 utf8ToCodePoint(int b1, int b2, int b3, int b4)325 private static int utf8ToCodePoint(int b1, int b2, int b3, int b4) { 326 int cpt = 0; 327 cpt = (((b1 & ~B11111) << 18) | 328 ((b2 & ~B11) << 12) | 329 ((b3 & ~B11) << 6) | 330 (b4 & ~B11)); 331 return cpt; 332 } 333 utf8ToCodePoint(int b1, int b2, int b3)334 private static int utf8ToCodePoint(int b1, int b2, int b3) { 335 int cpt = 0; 336 cpt = (((b1 & ~B1111) << 12) | ((b2 & ~B11) << 6) | (b3 & ~B11)); 337 return cpt; 338 } 339 utf8ToCodePoint(int b1, int b2)340 private static int utf8ToCodePoint(int b1, int b2) { 341 int cpt = 0; 342 cpt = (((b1 & ~B111) << 6) | (b2 & ~B11)); 343 return cpt; 344 } 345 checkB10(int b)346 private static void checkB10(int b) throws IOException { 347 if ((b & B11) != B10) { 348 throw new IOException("Invalid UTF-8 representation."); 349 } 350 } 351 fromBinaryString(final DataInput din)352 static String fromBinaryString(final DataInput din) throws IOException { 353 final int utf8Len = readVInt(din); 354 final byte[] bytes = new byte[utf8Len]; 355 din.readFully(bytes); 356 int len = 0; 357 // For the most commmon case, i.e. ascii, numChars = utf8Len 358 StringBuilder sb = new StringBuilder(utf8Len); 359 while(len < utf8Len) { 360 int cpt = 0; 361 final int b1 = bytes[len++] & 0xFF; 362 if (b1 <= 0x7F) { 363 cpt = b1; 364 } else if ((b1 & B11111) == B11110) { 365 int b2 = bytes[len++] & 0xFF; 366 checkB10(b2); 367 int b3 = bytes[len++] & 0xFF; 368 checkB10(b3); 369 int b4 = bytes[len++] & 0xFF; 370 checkB10(b4); 371 cpt = utf8ToCodePoint(b1, b2, b3, b4); 372 } else if ((b1 & B1111) == B1110) { 373 int b2 = bytes[len++] & 0xFF; 374 checkB10(b2); 375 int b3 = bytes[len++] & 0xFF; 376 checkB10(b3); 377 cpt = utf8ToCodePoint(b1, b2, b3); 378 } else if ((b1 & B111) == B110) { 379 int b2 = bytes[len++] & 0xFF; 380 checkB10(b2); 381 cpt = utf8ToCodePoint(b1, b2); 382 } else { 383 throw new IOException("Invalid UTF-8 byte "+Integer.toHexString(b1)+ 384 " at offset "+(len-1)+" in length of "+utf8Len); 385 } 386 if (!isValidCodePoint(cpt)) { 387 throw new IOException("Illegal Unicode Codepoint "+ 388 Integer.toHexString(cpt)+" in stream."); 389 } 390 sb.appendCodePoint(cpt); 391 } 392 return sb.toString(); 393 } 394 395 /** Parse a float from a byte array. */ readFloat(byte[] bytes, int start)396 public static float readFloat(byte[] bytes, int start) { 397 return WritableComparator.readFloat(bytes, start); 398 } 399 400 /** Parse a double from a byte array. */ readDouble(byte[] bytes, int start)401 public static double readDouble(byte[] bytes, int start) { 402 return WritableComparator.readDouble(bytes, start); 403 } 404 405 /** 406 * Reads a zero-compressed encoded long from a byte array and returns it. 407 * @param bytes byte array with decode long 408 * @param start starting index 409 * @throws java.io.IOException 410 * @return deserialized long 411 */ readVLong(byte[] bytes, int start)412 public static long readVLong(byte[] bytes, int start) throws IOException { 413 return WritableComparator.readVLong(bytes, start); 414 } 415 416 /** 417 * Reads a zero-compressed encoded integer from a byte array and returns it. 418 * @param bytes byte array with the encoded integer 419 * @param start start index 420 * @throws java.io.IOException 421 * @return deserialized integer 422 */ readVInt(byte[] bytes, int start)423 public static int readVInt(byte[] bytes, int start) throws IOException { 424 return WritableComparator.readVInt(bytes, start); 425 } 426 427 /** 428 * Reads a zero-compressed encoded long from a stream and return it. 429 * @param in input stream 430 * @throws java.io.IOException 431 * @return deserialized long 432 */ readVLong(DataInput in)433 public static long readVLong(DataInput in) throws IOException { 434 return WritableUtils.readVLong(in); 435 } 436 437 /** 438 * Reads a zero-compressed encoded integer from a stream and returns it. 439 * @param in input stream 440 * @throws java.io.IOException 441 * @return deserialized integer 442 */ readVInt(DataInput in)443 public static int readVInt(DataInput in) throws IOException { 444 return WritableUtils.readVInt(in); 445 } 446 447 /** 448 * Get the encoded length if an integer is stored in a variable-length format 449 * @return the encoded length 450 */ getVIntSize(long i)451 public static int getVIntSize(long i) { 452 return WritableUtils.getVIntSize(i); 453 } 454 455 /** 456 * Serializes a long to a binary stream with zero-compressed encoding. 457 * For -112 <= i <= 127, only one byte is used with the actual value. 458 * For other values of i, the first byte value indicates whether the 459 * long is positive or negative, and the number of bytes that follow. 460 * If the first byte value v is between -113 and -120, the following long 461 * is positive, with number of bytes that follow are -(v+112). 462 * If the first byte value v is between -121 and -128, the following long 463 * is negative, with number of bytes that follow are -(v+120). Bytes are 464 * stored in the high-non-zero-byte-first order. 465 * 466 * @param stream Binary output stream 467 * @param i Long to be serialized 468 * @throws java.io.IOException 469 */ writeVLong(DataOutput stream, long i)470 public static void writeVLong(DataOutput stream, long i) throws IOException { 471 WritableUtils.writeVLong(stream, i); 472 } 473 474 /** 475 * Serializes an int to a binary stream with zero-compressed encoding. 476 * 477 * @param stream Binary output stream 478 * @param i int to be serialized 479 * @throws java.io.IOException 480 */ writeVInt(DataOutput stream, int i)481 public static void writeVInt(DataOutput stream, int i) throws IOException { 482 WritableUtils.writeVInt(stream, i); 483 } 484 485 /** Lexicographic order of binary data. */ compareBytes(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2)486 public static int compareBytes(byte[] b1, int s1, int l1, 487 byte[] b2, int s2, int l2) { 488 return WritableComparator.compareBytes(b1, s1, l1, b2, s2, l2); 489 } 490 } 491