1 /**
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements.  See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership.  The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License.  You may obtain a copy of the License at
9  *
10  *     http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 package org.apache.hadoop.record;
20 
21 import java.io.DataInput;
22 import java.io.DataOutput;
23 import java.io.IOException;
24 import org.apache.hadoop.io.WritableComparator;
25 import org.apache.hadoop.io.WritableUtils;
26 
27 /**
28  * Various utility functions for Hadooop record I/O runtime.
29  */
30 public class Utils {
31 
32   /** Cannot create a new instance of Utils */
Utils()33   private Utils() {
34   }
35 
36   public static final char[] hexchars = { '0', '1', '2', '3', '4', '5',
37                                           '6', '7', '8', '9', 'A', 'B',
38                                           'C', 'D', 'E', 'F' };
39   /**
40    *
41    * @param s
42    * @return
43    */
toXMLString(String s)44   static String toXMLString(String s) {
45     StringBuffer sb = new StringBuffer();
46     for (int idx = 0; idx < s.length(); idx++) {
47       char ch = s.charAt(idx);
48       if (ch == '<') {
49         sb.append("&lt;");
50       } else if (ch == '&') {
51         sb.append("&amp;");
52       } else if (ch == '%') {
53         sb.append("%0025");
54       } else if (ch < 0x20 ||
55                  (ch > 0xD7FF && ch < 0xE000) ||
56                  (ch > 0xFFFD)) {
57         sb.append("%");
58         sb.append(hexchars[(ch & 0xF000) >> 12]);
59         sb.append(hexchars[(ch & 0x0F00) >> 8]);
60         sb.append(hexchars[(ch & 0x00F0) >> 4]);
61         sb.append(hexchars[(ch & 0x000F)]);
62       } else {
63         sb.append(ch);
64       }
65     }
66     return sb.toString();
67   }
68 
h2c(char ch)69   static private int h2c(char ch) {
70     if (ch >= '0' && ch <= '9') {
71       return ch - '0';
72     } else if (ch >= 'A' && ch <= 'F') {
73       return ch - 'A' + 10;
74     } else if (ch >= 'a' && ch <= 'f') {
75       return ch - 'a' + 10;
76     }
77     return 0;
78   }
79 
80   /**
81    *
82    * @param s
83    * @return
84    */
fromXMLString(String s)85   static String fromXMLString(String s) {
86     StringBuffer sb = new StringBuffer();
87     for (int idx = 0; idx < s.length();) {
88       char ch = s.charAt(idx++);
89       if (ch == '%') {
90         int ch1 = h2c(s.charAt(idx++)) << 12;
91         int ch2 = h2c(s.charAt(idx++)) << 8;
92         int ch3 = h2c(s.charAt(idx++)) << 4;
93         int ch4 = h2c(s.charAt(idx++));
94         char res = (char)(ch1 | ch2 | ch3 | ch4);
95         sb.append(res);
96       } else {
97         sb.append(ch);
98       }
99     }
100     return sb.toString();
101   }
102 
103   /**
104    *
105    * @param s
106    * @return
107    */
toCSVString(String s)108   static String toCSVString(String s) {
109     StringBuffer sb = new StringBuffer(s.length()+1);
110     sb.append('\'');
111     int len = s.length();
112     for (int i = 0; i < len; i++) {
113       char c = s.charAt(i);
114       switch(c) {
115       case '\0':
116         sb.append("%00");
117         break;
118       case '\n':
119         sb.append("%0A");
120         break;
121       case '\r':
122         sb.append("%0D");
123         break;
124       case ',':
125         sb.append("%2C");
126         break;
127       case '}':
128         sb.append("%7D");
129         break;
130       case '%':
131         sb.append("%25");
132         break;
133       default:
134         sb.append(c);
135       }
136     }
137     return sb.toString();
138   }
139 
140   /**
141    *
142    * @param s
143    * @throws java.io.IOException
144    * @return
145    */
fromCSVString(String s)146   static String fromCSVString(String s) throws IOException {
147     if (s.charAt(0) != '\'') {
148       throw new IOException("Error deserializing string.");
149     }
150     int len = s.length();
151     StringBuffer sb = new StringBuffer(len-1);
152     for (int i = 1; i < len; i++) {
153       char c = s.charAt(i);
154       if (c == '%') {
155         char ch1 = s.charAt(i+1);
156         char ch2 = s.charAt(i+2);
157         i += 2;
158         if (ch1 == '0' && ch2 == '0') {
159           sb.append('\0');
160         } else if (ch1 == '0' && ch2 == 'A') {
161           sb.append('\n');
162         } else if (ch1 == '0' && ch2 == 'D') {
163           sb.append('\r');
164         } else if (ch1 == '2' && ch2 == 'C') {
165           sb.append(',');
166         } else if (ch1 == '7' && ch2 == 'D') {
167           sb.append('}');
168         } else if (ch1 == '2' && ch2 == '5') {
169           sb.append('%');
170         } else {
171           throw new IOException("Error deserializing string.");
172         }
173       } else {
174         sb.append(c);
175       }
176     }
177     return sb.toString();
178   }
179 
180   /**
181    *
182    * @param s
183    * @return
184    */
toXMLBuffer(Buffer s)185   static String toXMLBuffer(Buffer s) {
186     return s.toString();
187   }
188 
189   /**
190    *
191    * @param s
192    * @throws java.io.IOException
193    * @return
194    */
fromXMLBuffer(String s)195   static Buffer fromXMLBuffer(String s)
196     throws IOException {
197     if (s.length() == 0) { return new Buffer(); }
198     int blen = s.length()/2;
199     byte[] barr = new byte[blen];
200     for (int idx = 0; idx < blen; idx++) {
201       char c1 = s.charAt(2*idx);
202       char c2 = s.charAt(2*idx+1);
203       barr[idx] = (byte)Integer.parseInt(""+c1+c2, 16);
204     }
205     return new Buffer(barr);
206   }
207 
208   /**
209    *
210    * @param buf
211    * @return
212    */
toCSVBuffer(Buffer buf)213   static String toCSVBuffer(Buffer buf) {
214     StringBuffer sb = new StringBuffer("#");
215     sb.append(buf.toString());
216     return sb.toString();
217   }
218 
219   /**
220    * Converts a CSV-serialized representation of buffer to a new
221    * Buffer
222    * @param s CSV-serialized representation of buffer
223    * @throws java.io.IOException
224    * @return Deserialized Buffer
225    */
fromCSVBuffer(String s)226   static Buffer fromCSVBuffer(String s)
227     throws IOException {
228     if (s.charAt(0) != '#') {
229       throw new IOException("Error deserializing buffer.");
230     }
231     if (s.length() == 1) { return new Buffer(); }
232     int blen = (s.length()-1)/2;
233     byte[] barr = new byte[blen];
234     for (int idx = 0; idx < blen; idx++) {
235       char c1 = s.charAt(2*idx+1);
236       char c2 = s.charAt(2*idx+2);
237       barr[idx] = (byte)Integer.parseInt(""+c1+c2, 16);
238     }
239     return new Buffer(barr);
240   }
241 
utf8LenForCodePoint(final int cpt)242   private static int utf8LenForCodePoint(final int cpt) throws IOException {
243     if (cpt >=0 && cpt <= 0x7F) {
244       return 1;
245     }
246     if (cpt >= 0x80 && cpt <= 0x07FF) {
247       return 2;
248     }
249     if ((cpt >= 0x0800 && cpt < 0xD800) ||
250         (cpt > 0xDFFF && cpt <= 0xFFFD)) {
251       return 3;
252     }
253     if (cpt >= 0x10000 && cpt <= 0x10FFFF) {
254       return 4;
255     }
256     throw new IOException("Illegal Unicode Codepoint "+
257                           Integer.toHexString(cpt)+" in string.");
258   }
259 
260   private static final int B10 =    Integer.parseInt("10000000", 2);
261   private static final int B110 =   Integer.parseInt("11000000", 2);
262   private static final int B1110 =  Integer.parseInt("11100000", 2);
263   private static final int B11110 = Integer.parseInt("11110000", 2);
264   private static final int B11 =    Integer.parseInt("11000000", 2);
265   private static final int B111 =   Integer.parseInt("11100000", 2);
266   private static final int B1111 =  Integer.parseInt("11110000", 2);
267   private static final int B11111 = Integer.parseInt("11111000", 2);
268 
writeUtf8(int cpt, final byte[] bytes, final int offset)269   private static int writeUtf8(int cpt, final byte[] bytes, final int offset)
270     throws IOException {
271     if (cpt >=0 && cpt <= 0x7F) {
272       bytes[offset] = (byte) cpt;
273       return 1;
274     }
275     if (cpt >= 0x80 && cpt <= 0x07FF) {
276       bytes[offset+1] = (byte) (B10 | (cpt & 0x3F));
277       cpt = cpt >> 6;
278       bytes[offset] = (byte) (B110 | (cpt & 0x1F));
279       return 2;
280     }
281     if ((cpt >= 0x0800 && cpt < 0xD800) ||
282         (cpt > 0xDFFF && cpt <= 0xFFFD)) {
283       bytes[offset+2] = (byte) (B10 | (cpt & 0x3F));
284       cpt = cpt >> 6;
285       bytes[offset+1] = (byte) (B10 | (cpt & 0x3F));
286       cpt = cpt >> 6;
287       bytes[offset] = (byte) (B1110 | (cpt & 0x0F));
288       return 3;
289     }
290     if (cpt >= 0x10000 && cpt <= 0x10FFFF) {
291       bytes[offset+3] = (byte) (B10 | (cpt & 0x3F));
292       cpt = cpt >> 6;
293       bytes[offset+2] = (byte) (B10 | (cpt & 0x3F));
294       cpt = cpt >> 6;
295       bytes[offset+1] = (byte) (B10 | (cpt & 0x3F));
296       cpt = cpt >> 6;
297       bytes[offset] = (byte) (B11110 | (cpt & 0x07));
298       return 4;
299     }
300     throw new IOException("Illegal Unicode Codepoint "+
301                           Integer.toHexString(cpt)+" in string.");
302   }
303 
toBinaryString(final DataOutput out, final String str)304   static void toBinaryString(final DataOutput out, final String str)
305     throws IOException {
306     final int strlen = str.length();
307     byte[] bytes = new byte[strlen*4]; // Codepoints expand to 4 bytes max
308     int utf8Len = 0;
309     int idx = 0;
310     while(idx < strlen) {
311       final int cpt = str.codePointAt(idx);
312       idx += Character.isSupplementaryCodePoint(cpt) ? 2 : 1;
313       utf8Len += writeUtf8(cpt, bytes, utf8Len);
314     }
315     writeVInt(out, utf8Len);
316     out.write(bytes, 0, utf8Len);
317   }
318 
isValidCodePoint(int cpt)319   static boolean isValidCodePoint(int cpt) {
320     return !((cpt > 0x10FFFF) ||
321              (cpt >= 0xD800 && cpt <= 0xDFFF) ||
322              (cpt >= 0xFFFE && cpt <=0xFFFF));
323   }
324 
utf8ToCodePoint(int b1, int b2, int b3, int b4)325   private static int utf8ToCodePoint(int b1, int b2, int b3, int b4) {
326     int cpt = 0;
327     cpt = (((b1 & ~B11111) << 18) |
328            ((b2 & ~B11) << 12) |
329            ((b3 & ~B11) << 6) |
330            (b4 & ~B11));
331     return cpt;
332   }
333 
utf8ToCodePoint(int b1, int b2, int b3)334   private static int utf8ToCodePoint(int b1, int b2, int b3) {
335     int cpt = 0;
336     cpt = (((b1 & ~B1111) << 12) | ((b2 & ~B11) << 6) | (b3 & ~B11));
337     return cpt;
338   }
339 
utf8ToCodePoint(int b1, int b2)340   private static int utf8ToCodePoint(int b1, int b2) {
341     int cpt = 0;
342     cpt = (((b1 & ~B111) << 6) | (b2 & ~B11));
343     return cpt;
344   }
345 
checkB10(int b)346   private static void checkB10(int b) throws IOException {
347     if ((b & B11) != B10) {
348       throw new IOException("Invalid UTF-8 representation.");
349     }
350   }
351 
fromBinaryString(final DataInput din)352   static String fromBinaryString(final DataInput din) throws IOException {
353     final int utf8Len = readVInt(din);
354     final byte[] bytes = new byte[utf8Len];
355     din.readFully(bytes);
356     int len = 0;
357     // For the most commmon case, i.e. ascii, numChars = utf8Len
358     StringBuilder sb = new StringBuilder(utf8Len);
359     while(len < utf8Len) {
360       int cpt = 0;
361       final int b1 = bytes[len++] & 0xFF;
362       if (b1 <= 0x7F) {
363         cpt = b1;
364       } else if ((b1 & B11111) == B11110) {
365         int b2 = bytes[len++] & 0xFF;
366         checkB10(b2);
367         int b3 = bytes[len++] & 0xFF;
368         checkB10(b3);
369         int b4 = bytes[len++] & 0xFF;
370         checkB10(b4);
371         cpt = utf8ToCodePoint(b1, b2, b3, b4);
372       } else if ((b1 & B1111) == B1110) {
373         int b2 = bytes[len++] & 0xFF;
374         checkB10(b2);
375         int b3 = bytes[len++] & 0xFF;
376         checkB10(b3);
377         cpt = utf8ToCodePoint(b1, b2, b3);
378       } else if ((b1 & B111) == B110) {
379         int b2 = bytes[len++] & 0xFF;
380         checkB10(b2);
381         cpt = utf8ToCodePoint(b1, b2);
382       } else {
383         throw new IOException("Invalid UTF-8 byte "+Integer.toHexString(b1)+
384                               " at offset "+(len-1)+" in length of "+utf8Len);
385       }
386       if (!isValidCodePoint(cpt)) {
387         throw new IOException("Illegal Unicode Codepoint "+
388                               Integer.toHexString(cpt)+" in stream.");
389       }
390       sb.appendCodePoint(cpt);
391     }
392     return sb.toString();
393   }
394 
395   /** Parse a float from a byte array. */
readFloat(byte[] bytes, int start)396   public static float readFloat(byte[] bytes, int start) {
397     return WritableComparator.readFloat(bytes, start);
398   }
399 
400   /** Parse a double from a byte array. */
readDouble(byte[] bytes, int start)401   public static double readDouble(byte[] bytes, int start) {
402     return WritableComparator.readDouble(bytes, start);
403   }
404 
405   /**
406    * Reads a zero-compressed encoded long from a byte array and returns it.
407    * @param bytes byte array with decode long
408    * @param start starting index
409    * @throws java.io.IOException
410    * @return deserialized long
411    */
readVLong(byte[] bytes, int start)412   public static long readVLong(byte[] bytes, int start) throws IOException {
413     return WritableComparator.readVLong(bytes, start);
414   }
415 
416   /**
417    * Reads a zero-compressed encoded integer from a byte array and returns it.
418    * @param bytes byte array with the encoded integer
419    * @param start start index
420    * @throws java.io.IOException
421    * @return deserialized integer
422    */
readVInt(byte[] bytes, int start)423   public static int readVInt(byte[] bytes, int start) throws IOException {
424     return WritableComparator.readVInt(bytes, start);
425   }
426 
427   /**
428    * Reads a zero-compressed encoded long from a stream and return it.
429    * @param in input stream
430    * @throws java.io.IOException
431    * @return deserialized long
432    */
readVLong(DataInput in)433   public static long readVLong(DataInput in) throws IOException {
434     return WritableUtils.readVLong(in);
435   }
436 
437   /**
438    * Reads a zero-compressed encoded integer from a stream and returns it.
439    * @param in input stream
440    * @throws java.io.IOException
441    * @return deserialized integer
442    */
readVInt(DataInput in)443   public static int readVInt(DataInput in) throws IOException {
444     return WritableUtils.readVInt(in);
445   }
446 
447   /**
448    * Get the encoded length if an integer is stored in a variable-length format
449    * @return the encoded length
450    */
getVIntSize(long i)451   public static int getVIntSize(long i) {
452     return WritableUtils.getVIntSize(i);
453   }
454 
455   /**
456    * Serializes a long to a binary stream with zero-compressed encoding.
457    * For -112 <= i <= 127, only one byte is used with the actual value.
458    * For other values of i, the first byte value indicates whether the
459    * long is positive or negative, and the number of bytes that follow.
460    * If the first byte value v is between -113 and -120, the following long
461    * is positive, with number of bytes that follow are -(v+112).
462    * If the first byte value v is between -121 and -128, the following long
463    * is negative, with number of bytes that follow are -(v+120). Bytes are
464    * stored in the high-non-zero-byte-first order.
465    *
466    * @param stream Binary output stream
467    * @param i Long to be serialized
468    * @throws java.io.IOException
469    */
writeVLong(DataOutput stream, long i)470   public static void writeVLong(DataOutput stream, long i) throws IOException {
471     WritableUtils.writeVLong(stream, i);
472   }
473 
474   /**
475    * Serializes an int to a binary stream with zero-compressed encoding.
476    *
477    * @param stream Binary output stream
478    * @param i int to be serialized
479    * @throws java.io.IOException
480    */
writeVInt(DataOutput stream, int i)481   public static void writeVInt(DataOutput stream, int i) throws IOException {
482     WritableUtils.writeVInt(stream, i);
483   }
484 
485   /** Lexicographic order of binary data. */
compareBytes(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2)486   public static int compareBytes(byte[] b1, int s1, int l1,
487                                  byte[] b2, int s2, int l2) {
488     return WritableComparator.compareBytes(b1, s1, l1, b2, s2, l2);
489   }
490 }
491